diff --git a/checkpoint-4545/config.json b/checkpoint-4545/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0153f8ee396146a87c398da9234b3dce005be --- /dev/null +++ b/checkpoint-4545/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128259 +} diff --git a/checkpoint-4545/generation_config.json b/checkpoint-4545/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eab5082496e8b01f9c606a306676cbfabe0cce9d --- /dev/null +++ b/checkpoint-4545/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea2c79ebcdc3505cd8cc73a06d9ba94259ddf09 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22544425d043bcfa46ce55109fd0f9e5231412919ee11aac13a2d7f8c44837f6 +size 12045435328 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c48fa78eb3d029a1dad969e016ecda29e51b3b7 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20cc55bf8077d24719a106ca9b38c2d74e52a5ef1cc50f7c34a5f5a4c2607cbc +size 12045436096 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0f6b644abf9dbc71c8c5b942d42ae9be62f9bde --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b102b6068ca89a365918c87334461adf555de02111ee876a4e1dc1fcc027738 +size 12045436352 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd199f11137da71505c71e2bf86caceac24cf13f --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1142df017748c691c55ec7dcb61b4fafea0bf911b859279819ad9f366c4ebd +size 12045436096 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6fec8b501df3a98681f7ec3725170bc1c3e90c1 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0da8557fdd3551b251d7c565be9783b860eb8fecb8ff10ddc028f43eaf470b0 +size 12045436352 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d39d8579333e96570b45edcc7d6589d2fdb674f7 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:402a6620b2e6627561a8b544ef02b114ab15ec3f2c63ae68cacf484c4551514b +size 12045436416 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d220058ecdac3aab8e06372694456e0a7e5afdd5 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1870503991c36528532766090519785cb41968507b8342be039582a580202d4d +size 12045436096 diff --git a/checkpoint-4545/global_step4545/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..695b4259ece2471e47755b520ad095a90e110a41 --- /dev/null +++ b/checkpoint-4545/global_step4545/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b1f7ec8ff53e2cd0275d1084a233a39a594688c247ccf70f17ecb23e2b3390 +size 12045435008 diff --git a/checkpoint-4545/global_step4545/mp_rank_00_model_states.pt b/checkpoint-4545/global_step4545/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..887869b0eb825483cb3b2ade0490787b01240e27 --- /dev/null +++ b/checkpoint-4545/global_step4545/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc2563c4ee9612355026bc4e8d354557bebfa24efdd67033c9e6d57cb15e4e85 +size 16060659704 diff --git a/checkpoint-4545/latest b/checkpoint-4545/latest new file mode 100644 index 0000000000000000000000000000000000000000..02c5998774395e519fbd9f57d1bf979ae53c3f97 --- /dev/null +++ b/checkpoint-4545/latest @@ -0,0 +1 @@ +global_step4545 \ No newline at end of file diff --git a/checkpoint-4545/model-00001-of-00004.safetensors b/checkpoint-4545/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f18ec972a3d3f8c96e26493ea1f0680ffb025031 --- /dev/null +++ b/checkpoint-4545/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3770a255cc8357492a739b7e1c7f803bd1447bdc8eb2d6e0fb7f6cac72936c7 +size 4976723248 diff --git a/checkpoint-4545/model-00002-of-00004.safetensors b/checkpoint-4545/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b52dc7a3f759e503bb8c6d65d18ae110ab44b6df --- /dev/null +++ b/checkpoint-4545/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3b8f3c4be77a2fda81f83029b01ec5bf0112a0f44e2770b6308395c561ada95 +size 4999802720 diff --git a/checkpoint-4545/model-00003-of-00004.safetensors b/checkpoint-4545/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..09b473bdb286ecc5b80e83bff1b6f307dbeb9733 --- /dev/null +++ b/checkpoint-4545/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d7fa5ba30d95a79033fa2f4cdd2bf350fd574435a72ac6337a80410313a320 +size 4915916176 diff --git a/checkpoint-4545/model-00004-of-00004.safetensors b/checkpoint-4545/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59843fb43cb9989c8c13f200325c6d2dac7e26fb --- /dev/null +++ b/checkpoint-4545/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac1aceef3a55f9747148821d915d32fad318fabf4b118d6fdb103ff02a025cc +size 1168163384 diff --git a/checkpoint-4545/model.safetensors.index.json b/checkpoint-4545/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e734f8f9bcabe95e936a11f19b77148f54640122 --- /dev/null +++ b/checkpoint-4545/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060571648 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-4545/rng_state_0.pth b/checkpoint-4545/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-4545/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-4545/rng_state_1.pth b/checkpoint-4545/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-4545/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-4545/rng_state_2.pth b/checkpoint-4545/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-4545/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-4545/rng_state_3.pth b/checkpoint-4545/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-4545/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-4545/rng_state_4.pth b/checkpoint-4545/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-4545/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-4545/rng_state_5.pth b/checkpoint-4545/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-4545/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-4545/rng_state_6.pth b/checkpoint-4545/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-4545/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-4545/rng_state_7.pth b/checkpoint-4545/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-4545/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-4545/scheduler.pt b/checkpoint-4545/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..991739322335583234b080e667e3a3b97367ea95 --- /dev/null +++ b/checkpoint-4545/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ddb94076828f6cef70561b1a750eeee4210df166983305c1739991809f65b2 +size 1064 diff --git a/checkpoint-4545/special_tokens_map.json b/checkpoint-4545/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-4545/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-4545/tokenizer.json b/checkpoint-4545/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d29771c68b37af9541b4c450532cb095b564ca5 --- /dev/null +++ b/checkpoint-4545/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a36f231bc2882e8c2e1859bc27098f73c95ea211ccb73ad0cdb441a16f49c6 +size 17210280 diff --git a/checkpoint-4545/tokenizer_config.json b/checkpoint-4545/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a695c457b54a00f10768564f6c25b0142ccc840 --- /dev/null +++ b/checkpoint-4545/tokenizer_config.json @@ -0,0 +1,2087 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|im_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|end_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|autheur|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|sujet|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|><|khey|><|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-4545/trainer_state.json b/checkpoint-4545/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..15816a8ef94881a5fe8d2db1bc522e5a06f9b570 --- /dev/null +++ b/checkpoint-4545/trainer_state.json @@ -0,0 +1,31848 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2501513567064781, + "eval_steps": 500, + "global_step": 4545, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.503880235566074e-05, + "grad_norm": 459.8753356933594, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.303, + "step": 1 + }, + { + "epoch": 0.00011007760471132149, + "grad_norm": 314.2561950683594, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.8226, + "step": 2 + }, + { + "epoch": 0.0001651164070669822, + "grad_norm": 314.1292419433594, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8517, + "step": 3 + }, + { + "epoch": 0.00022015520942264297, + "grad_norm": 312.4049072265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6248, + "step": 4 + }, + { + "epoch": 0.0002751940117783037, + "grad_norm": 353.7213134765625, + "learning_rate": 5.000000000000001e-07, + "loss": 2.7883, + "step": 5 + }, + { + "epoch": 0.0003302328141339644, + "grad_norm": 278.41668701171875, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5468, + "step": 6 + }, + { + "epoch": 0.0003852716164896252, + "grad_norm": 336.14532470703125, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7721, + "step": 7 + }, + { + "epoch": 0.00044031041884528595, + "grad_norm": 201.19374084472656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4873, + "step": 8 + }, + { + "epoch": 0.0004953492212009466, + "grad_norm": 184.7027587890625, + "learning_rate": 9.000000000000001e-07, + "loss": 2.6647, + "step": 9 + }, + { + "epoch": 0.0005503880235566074, + "grad_norm": 154.597412109375, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.602, + "step": 10 + }, + { + "epoch": 0.0006054268259122681, + "grad_norm": 40.47785568237305, + "learning_rate": 1.1e-06, + "loss": 2.6716, + "step": 11 + }, + { + "epoch": 0.0006604656282679288, + "grad_norm": 25.338607788085938, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2631, + "step": 12 + }, + { + "epoch": 0.0007155044306235897, + "grad_norm": 24.976919174194336, + "learning_rate": 1.3e-06, + "loss": 2.3564, + "step": 13 + }, + { + "epoch": 0.0007705432329792504, + "grad_norm": 15.239912033081055, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3295, + "step": 14 + }, + { + "epoch": 0.0008255820353349112, + "grad_norm": 14.125042915344238, + "learning_rate": 1.5e-06, + "loss": 2.307, + "step": 15 + }, + { + "epoch": 0.0008806208376905719, + "grad_norm": 13.163726806640625, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1493, + "step": 16 + }, + { + "epoch": 0.0009356596400462326, + "grad_norm": 8.726515769958496, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.0333, + "step": 17 + }, + { + "epoch": 0.0009906984424018933, + "grad_norm": 9.072502136230469, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.2046, + "step": 18 + }, + { + "epoch": 0.001045737244757554, + "grad_norm": 9.412588119506836, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.2001, + "step": 19 + }, + { + "epoch": 0.0011007760471132147, + "grad_norm": 8.67534065246582, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0011558148494688755, + "grad_norm": 14.015918731689453, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.9566, + "step": 21 + }, + { + "epoch": 0.0012108536518245362, + "grad_norm": 7.9474687576293945, + "learning_rate": 2.2e-06, + "loss": 1.9085, + "step": 22 + }, + { + "epoch": 0.001265892454180197, + "grad_norm": 6.806368350982666, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7918, + "step": 23 + }, + { + "epoch": 0.0013209312565358577, + "grad_norm": 5.3452582359313965, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8321, + "step": 24 + }, + { + "epoch": 0.0013759700588915184, + "grad_norm": 8.744244575500488, + "learning_rate": 2.5e-06, + "loss": 1.6317, + "step": 25 + }, + { + "epoch": 0.0014310088612471794, + "grad_norm": 5.304683685302734, + "learning_rate": 2.6e-06, + "loss": 1.6846, + "step": 26 + }, + { + "epoch": 0.00148604766360284, + "grad_norm": 5.650127410888672, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7449, + "step": 27 + }, + { + "epoch": 0.0015410864659585008, + "grad_norm": 5.479269504547119, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8158, + "step": 28 + }, + { + "epoch": 0.0015961252683141616, + "grad_norm": 4.873537063598633, + "learning_rate": 2.9e-06, + "loss": 1.8015, + "step": 29 + }, + { + "epoch": 0.0016511640706698223, + "grad_norm": 4.971101760864258, + "learning_rate": 3e-06, + "loss": 1.9034, + "step": 30 + }, + { + "epoch": 0.001706202873025483, + "grad_norm": 4.407571315765381, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.9037, + "step": 31 + }, + { + "epoch": 0.0017612416753811438, + "grad_norm": 4.429073810577393, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6812, + "step": 32 + }, + { + "epoch": 0.0018162804777368045, + "grad_norm": 5.16085147857666, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7627, + "step": 33 + }, + { + "epoch": 0.0018713192800924653, + "grad_norm": 4.0805768966674805, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6799, + "step": 34 + }, + { + "epoch": 0.001926358082448126, + "grad_norm": 4.548702239990234, + "learning_rate": 3.5e-06, + "loss": 1.7799, + "step": 35 + }, + { + "epoch": 0.0019813968848037865, + "grad_norm": 5.181888580322266, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.8235, + "step": 36 + }, + { + "epoch": 0.0020364356871594475, + "grad_norm": 3.9876129627227783, + "learning_rate": 3.7e-06, + "loss": 1.5999, + "step": 37 + }, + { + "epoch": 0.002091474489515108, + "grad_norm": 6.325051307678223, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.7499, + "step": 38 + }, + { + "epoch": 0.002146513291870769, + "grad_norm": 6.199049949645996, + "learning_rate": 3.900000000000001e-06, + "loss": 1.784, + "step": 39 + }, + { + "epoch": 0.0022015520942264295, + "grad_norm": 4.83912992477417, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8895, + "step": 40 + }, + { + "epoch": 0.0022565908965820904, + "grad_norm": 4.515626907348633, + "learning_rate": 4.1e-06, + "loss": 1.4887, + "step": 41 + }, + { + "epoch": 0.002311629698937751, + "grad_norm": 5.032265663146973, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7324, + "step": 42 + }, + { + "epoch": 0.002366668501293412, + "grad_norm": 4.1879048347473145, + "learning_rate": 4.3e-06, + "loss": 1.4912, + "step": 43 + }, + { + "epoch": 0.0024217073036490724, + "grad_norm": 4.128026485443115, + "learning_rate": 4.4e-06, + "loss": 1.554, + "step": 44 + }, + { + "epoch": 0.0024767461060047334, + "grad_norm": 4.527958393096924, + "learning_rate": 4.5e-06, + "loss": 1.652, + "step": 45 + }, + { + "epoch": 0.002531784908360394, + "grad_norm": 4.8388190269470215, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6696, + "step": 46 + }, + { + "epoch": 0.002586823710716055, + "grad_norm": 4.2088541984558105, + "learning_rate": 4.7e-06, + "loss": 1.568, + "step": 47 + }, + { + "epoch": 0.0026418625130717154, + "grad_norm": 4.789997577667236, + "learning_rate": 4.800000000000001e-06, + "loss": 1.642, + "step": 48 + }, + { + "epoch": 0.0026969013154273763, + "grad_norm": 4.408346652984619, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5181, + "step": 49 + }, + { + "epoch": 0.002751940117783037, + "grad_norm": 4.572340488433838, + "learning_rate": 5e-06, + "loss": 1.6698, + "step": 50 + }, + { + "epoch": 0.0028069789201386978, + "grad_norm": 4.728564739227295, + "learning_rate": 5.1e-06, + "loss": 1.5785, + "step": 51 + }, + { + "epoch": 0.0028620177224943587, + "grad_norm": 4.449855327606201, + "learning_rate": 5.2e-06, + "loss": 1.4624, + "step": 52 + }, + { + "epoch": 0.0029170565248500193, + "grad_norm": 4.127189636230469, + "learning_rate": 5.300000000000001e-06, + "loss": 1.6061, + "step": 53 + }, + { + "epoch": 0.00297209532720568, + "grad_norm": 4.244532108306885, + "learning_rate": 5.400000000000001e-06, + "loss": 1.491, + "step": 54 + }, + { + "epoch": 0.0030271341295613407, + "grad_norm": 3.437682628631592, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1967, + "step": 55 + }, + { + "epoch": 0.0030821729319170017, + "grad_norm": 3.83516788482666, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4731, + "step": 56 + }, + { + "epoch": 0.003137211734272662, + "grad_norm": 3.9108972549438477, + "learning_rate": 5.7e-06, + "loss": 1.4393, + "step": 57 + }, + { + "epoch": 0.003192250536628323, + "grad_norm": 3.5258419513702393, + "learning_rate": 5.8e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.0032472893389839837, + "grad_norm": 4.124903678894043, + "learning_rate": 5.9e-06, + "loss": 1.4747, + "step": 59 + }, + { + "epoch": 0.0033023281413396446, + "grad_norm": 4.055769920349121, + "learning_rate": 6e-06, + "loss": 1.4655, + "step": 60 + }, + { + "epoch": 0.003357366943695305, + "grad_norm": 3.904837131500244, + "learning_rate": 6.1e-06, + "loss": 1.5125, + "step": 61 + }, + { + "epoch": 0.003412405746050966, + "grad_norm": 3.2904794216156006, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4596, + "step": 62 + }, + { + "epoch": 0.0034674445484066266, + "grad_norm": 3.24053692817688, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3851, + "step": 63 + }, + { + "epoch": 0.0035224833507622876, + "grad_norm": 3.457639217376709, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4019, + "step": 64 + }, + { + "epoch": 0.003577522153117948, + "grad_norm": 3.073054790496826, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2872, + "step": 65 + }, + { + "epoch": 0.003632560955473609, + "grad_norm": 2.6726694107055664, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2361, + "step": 66 + }, + { + "epoch": 0.0036875997578292696, + "grad_norm": 2.9378459453582764, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4452, + "step": 67 + }, + { + "epoch": 0.0037426385601849305, + "grad_norm": 2.81107234954834, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4804, + "step": 68 + }, + { + "epoch": 0.003797677362540591, + "grad_norm": 2.60062313079834, + "learning_rate": 6.9e-06, + "loss": 1.3263, + "step": 69 + }, + { + "epoch": 0.003852716164896252, + "grad_norm": 2.5642921924591064, + "learning_rate": 7e-06, + "loss": 1.2751, + "step": 70 + }, + { + "epoch": 0.0039077549672519125, + "grad_norm": 2.3608031272888184, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2614, + "step": 71 + }, + { + "epoch": 0.003962793769607573, + "grad_norm": 2.7201738357543945, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5018, + "step": 72 + }, + { + "epoch": 0.004017832571963234, + "grad_norm": 2.584726095199585, + "learning_rate": 7.3e-06, + "loss": 1.3519, + "step": 73 + }, + { + "epoch": 0.004072871374318895, + "grad_norm": 1.9693044424057007, + "learning_rate": 7.4e-06, + "loss": 1.0934, + "step": 74 + }, + { + "epoch": 0.0041279101766745555, + "grad_norm": 2.220736503601074, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4687, + "step": 75 + }, + { + "epoch": 0.004182948979030216, + "grad_norm": 2.2629456520080566, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3328, + "step": 76 + }, + { + "epoch": 0.004237987781385877, + "grad_norm": 2.051820993423462, + "learning_rate": 7.7e-06, + "loss": 1.3058, + "step": 77 + }, + { + "epoch": 0.004293026583741538, + "grad_norm": 2.2451820373535156, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3556, + "step": 78 + }, + { + "epoch": 0.004348065386097198, + "grad_norm": 3.13584303855896, + "learning_rate": 7.9e-06, + "loss": 1.3262, + "step": 79 + }, + { + "epoch": 0.004403104188452859, + "grad_norm": 5.024479866027832, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2103, + "step": 80 + }, + { + "epoch": 0.00445814299080852, + "grad_norm": 2.070889711380005, + "learning_rate": 8.1e-06, + "loss": 1.1994, + "step": 81 + }, + { + "epoch": 0.004513181793164181, + "grad_norm": 2.797286033630371, + "learning_rate": 8.2e-06, + "loss": 1.3075, + "step": 82 + }, + { + "epoch": 0.004568220595519841, + "grad_norm": 2.11370849609375, + "learning_rate": 8.3e-06, + "loss": 1.36, + "step": 83 + }, + { + "epoch": 0.004623259397875502, + "grad_norm": 2.5416152477264404, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3484, + "step": 84 + }, + { + "epoch": 0.004678298200231163, + "grad_norm": 2.4702343940734863, + "learning_rate": 8.5e-06, + "loss": 1.3677, + "step": 85 + }, + { + "epoch": 0.004733337002586824, + "grad_norm": 3.670365333557129, + "learning_rate": 8.6e-06, + "loss": 1.2192, + "step": 86 + }, + { + "epoch": 0.004788375804942484, + "grad_norm": 2.282954692840576, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2982, + "step": 87 + }, + { + "epoch": 0.004843414607298145, + "grad_norm": 2.3659238815307617, + "learning_rate": 8.8e-06, + "loss": 1.3206, + "step": 88 + }, + { + "epoch": 0.004898453409653806, + "grad_norm": 4.939981460571289, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4328, + "step": 89 + }, + { + "epoch": 0.004953492212009467, + "grad_norm": 2.335858106613159, + "learning_rate": 9e-06, + "loss": 1.2603, + "step": 90 + }, + { + "epoch": 0.005008531014365127, + "grad_norm": 2.2165043354034424, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3141, + "step": 91 + }, + { + "epoch": 0.005063569816720788, + "grad_norm": 2.7872185707092285, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3314, + "step": 92 + }, + { + "epoch": 0.005118608619076449, + "grad_norm": 2.6353912353515625, + "learning_rate": 9.3e-06, + "loss": 1.2027, + "step": 93 + }, + { + "epoch": 0.00517364742143211, + "grad_norm": 3.2509102821350098, + "learning_rate": 9.4e-06, + "loss": 1.2316, + "step": 94 + }, + { + "epoch": 0.00522868622378777, + "grad_norm": 2.4560611248016357, + "learning_rate": 9.5e-06, + "loss": 1.1848, + "step": 95 + }, + { + "epoch": 0.005283725026143431, + "grad_norm": 2.338151216506958, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2392, + "step": 96 + }, + { + "epoch": 0.005338763828499092, + "grad_norm": 2.231065034866333, + "learning_rate": 9.7e-06, + "loss": 1.2089, + "step": 97 + }, + { + "epoch": 0.005393802630854753, + "grad_norm": 2.278428077697754, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2267, + "step": 98 + }, + { + "epoch": 0.005448841433210413, + "grad_norm": 2.4422810077667236, + "learning_rate": 9.9e-06, + "loss": 1.2041, + "step": 99 + }, + { + "epoch": 0.005503880235566074, + "grad_norm": 2.216248035430908, + "learning_rate": 1e-05, + "loss": 1.0798, + "step": 100 + }, + { + "epoch": 0.005558919037921735, + "grad_norm": 2.3301615715026855, + "learning_rate": 9.99999998121067e-06, + "loss": 1.3069, + "step": 101 + }, + { + "epoch": 0.0056139578402773956, + "grad_norm": 2.315436363220215, + "learning_rate": 9.999999924842678e-06, + "loss": 1.1589, + "step": 102 + }, + { + "epoch": 0.005668996642633056, + "grad_norm": 2.3522140979766846, + "learning_rate": 9.999999830896024e-06, + "loss": 1.0978, + "step": 103 + }, + { + "epoch": 0.0057240354449887175, + "grad_norm": 2.5798308849334717, + "learning_rate": 9.99999969937071e-06, + "loss": 1.0599, + "step": 104 + }, + { + "epoch": 0.005779074247344378, + "grad_norm": 2.456644058227539, + "learning_rate": 9.999999530266738e-06, + "loss": 1.1682, + "step": 105 + }, + { + "epoch": 0.0058341130497000385, + "grad_norm": 2.1559031009674072, + "learning_rate": 9.999999323584106e-06, + "loss": 1.0631, + "step": 106 + }, + { + "epoch": 0.005889151852055699, + "grad_norm": 2.2985048294067383, + "learning_rate": 9.99999907932282e-06, + "loss": 1.1455, + "step": 107 + }, + { + "epoch": 0.00594419065441136, + "grad_norm": 2.596167802810669, + "learning_rate": 9.999998797482877e-06, + "loss": 1.1686, + "step": 108 + }, + { + "epoch": 0.005999229456767021, + "grad_norm": 2.378618001937866, + "learning_rate": 9.999998478064283e-06, + "loss": 1.2226, + "step": 109 + }, + { + "epoch": 0.0060542682591226814, + "grad_norm": 2.228116750717163, + "learning_rate": 9.999998121067038e-06, + "loss": 1.1396, + "step": 110 + }, + { + "epoch": 0.006109307061478342, + "grad_norm": 2.4419472217559814, + "learning_rate": 9.999997726491146e-06, + "loss": 1.1401, + "step": 111 + }, + { + "epoch": 0.006164345863834003, + "grad_norm": 2.0695526599884033, + "learning_rate": 9.999997294336608e-06, + "loss": 1.1868, + "step": 112 + }, + { + "epoch": 0.006219384666189664, + "grad_norm": 2.3170363903045654, + "learning_rate": 9.99999682460343e-06, + "loss": 1.1172, + "step": 113 + }, + { + "epoch": 0.006274423468545324, + "grad_norm": 2.670466184616089, + "learning_rate": 9.999996317291615e-06, + "loss": 1.2481, + "step": 114 + }, + { + "epoch": 0.006329462270900985, + "grad_norm": 2.1214540004730225, + "learning_rate": 9.999995772401166e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.006384501073256646, + "grad_norm": 1.9283969402313232, + "learning_rate": 9.999995189932085e-06, + "loss": 1.0692, + "step": 116 + }, + { + "epoch": 0.006439539875612307, + "grad_norm": 2.2620882987976074, + "learning_rate": 9.99999456988438e-06, + "loss": 1.0725, + "step": 117 + }, + { + "epoch": 0.006494578677967967, + "grad_norm": 2.2121341228485107, + "learning_rate": 9.999993912258055e-06, + "loss": 1.1328, + "step": 118 + }, + { + "epoch": 0.006549617480323628, + "grad_norm": 2.298126220703125, + "learning_rate": 9.999993217053113e-06, + "loss": 1.1272, + "step": 119 + }, + { + "epoch": 0.006604656282679289, + "grad_norm": 1.81593656539917, + "learning_rate": 9.99999248426956e-06, + "loss": 1.017, + "step": 120 + }, + { + "epoch": 0.00665969508503495, + "grad_norm": 2.1174378395080566, + "learning_rate": 9.999991713907403e-06, + "loss": 1.0557, + "step": 121 + }, + { + "epoch": 0.00671473388739061, + "grad_norm": 1.9061017036437988, + "learning_rate": 9.999990905966647e-06, + "loss": 1.0379, + "step": 122 + }, + { + "epoch": 0.006769772689746271, + "grad_norm": 1.912500023841858, + "learning_rate": 9.999990060447297e-06, + "loss": 1.104, + "step": 123 + }, + { + "epoch": 0.006824811492101932, + "grad_norm": 1.9249529838562012, + "learning_rate": 9.99998917734936e-06, + "loss": 1.0136, + "step": 124 + }, + { + "epoch": 0.006879850294457593, + "grad_norm": 1.8504948616027832, + "learning_rate": 9.999988256672843e-06, + "loss": 0.99, + "step": 125 + }, + { + "epoch": 0.006934889096813253, + "grad_norm": 1.720042109489441, + "learning_rate": 9.999987298417753e-06, + "loss": 1.0666, + "step": 126 + }, + { + "epoch": 0.006989927899168914, + "grad_norm": 1.778251051902771, + "learning_rate": 9.999986302584097e-06, + "loss": 1.0424, + "step": 127 + }, + { + "epoch": 0.007044966701524575, + "grad_norm": 1.9485961198806763, + "learning_rate": 9.999985269171881e-06, + "loss": 1.105, + "step": 128 + }, + { + "epoch": 0.007100005503880236, + "grad_norm": 3.0802104473114014, + "learning_rate": 9.999984198181114e-06, + "loss": 1.1081, + "step": 129 + }, + { + "epoch": 0.007155044306235896, + "grad_norm": 1.7476954460144043, + "learning_rate": 9.999983089611806e-06, + "loss": 0.9677, + "step": 130 + }, + { + "epoch": 0.007210083108591557, + "grad_norm": 1.6127299070358276, + "learning_rate": 9.999981943463963e-06, + "loss": 0.9937, + "step": 131 + }, + { + "epoch": 0.007265121910947218, + "grad_norm": 2.1477208137512207, + "learning_rate": 9.999980759737594e-06, + "loss": 1.0319, + "step": 132 + }, + { + "epoch": 0.007320160713302879, + "grad_norm": 1.531163215637207, + "learning_rate": 9.999979538432707e-06, + "loss": 0.8696, + "step": 133 + }, + { + "epoch": 0.007375199515658539, + "grad_norm": 1.8226820230484009, + "learning_rate": 9.999978279549313e-06, + "loss": 1.2061, + "step": 134 + }, + { + "epoch": 0.0074302383180142, + "grad_norm": 1.481895923614502, + "learning_rate": 9.99997698308742e-06, + "loss": 0.949, + "step": 135 + }, + { + "epoch": 0.007485277120369861, + "grad_norm": 1.6715927124023438, + "learning_rate": 9.99997564904704e-06, + "loss": 1.1579, + "step": 136 + }, + { + "epoch": 0.0075403159227255215, + "grad_norm": 1.4235272407531738, + "learning_rate": 9.999974277428179e-06, + "loss": 1.064, + "step": 137 + }, + { + "epoch": 0.007595354725081182, + "grad_norm": 1.3524872064590454, + "learning_rate": 9.999972868230852e-06, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.007650393527436843, + "grad_norm": 1.3741765022277832, + "learning_rate": 9.999971421455066e-06, + "loss": 1.0256, + "step": 139 + }, + { + "epoch": 0.007705432329792504, + "grad_norm": 1.9869598150253296, + "learning_rate": 9.999969937100835e-06, + "loss": 0.9489, + "step": 140 + }, + { + "epoch": 0.0077604711321481645, + "grad_norm": 1.4785465002059937, + "learning_rate": 9.999968415168166e-06, + "loss": 0.9243, + "step": 141 + }, + { + "epoch": 0.007815509934503825, + "grad_norm": 1.5476176738739014, + "learning_rate": 9.999966855657074e-06, + "loss": 1.178, + "step": 142 + }, + { + "epoch": 0.007870548736859486, + "grad_norm": 1.500401258468628, + "learning_rate": 9.99996525856757e-06, + "loss": 0.9837, + "step": 143 + }, + { + "epoch": 0.007925587539215146, + "grad_norm": 1.3777157068252563, + "learning_rate": 9.999963623899664e-06, + "loss": 1.0732, + "step": 144 + }, + { + "epoch": 0.007980626341570807, + "grad_norm": 1.4466841220855713, + "learning_rate": 9.99996195165337e-06, + "loss": 0.9779, + "step": 145 + }, + { + "epoch": 0.008035665143926469, + "grad_norm": 1.5304051637649536, + "learning_rate": 9.9999602418287e-06, + "loss": 1.196, + "step": 146 + }, + { + "epoch": 0.008090703946282128, + "grad_norm": 1.9012362957000732, + "learning_rate": 9.99995849442567e-06, + "loss": 0.9797, + "step": 147 + }, + { + "epoch": 0.00814574274863779, + "grad_norm": 1.430679202079773, + "learning_rate": 9.999956709444289e-06, + "loss": 0.9869, + "step": 148 + }, + { + "epoch": 0.00820078155099345, + "grad_norm": 1.3489817380905151, + "learning_rate": 9.99995488688457e-06, + "loss": 1.0137, + "step": 149 + }, + { + "epoch": 0.008255820353349111, + "grad_norm": 1.1878125667572021, + "learning_rate": 9.999953026746531e-06, + "loss": 0.9355, + "step": 150 + }, + { + "epoch": 0.008310859155704772, + "grad_norm": 1.3481942415237427, + "learning_rate": 9.999951129030182e-06, + "loss": 1.1235, + "step": 151 + }, + { + "epoch": 0.008365897958060432, + "grad_norm": 1.7335314750671387, + "learning_rate": 9.999949193735539e-06, + "loss": 0.9382, + "step": 152 + }, + { + "epoch": 0.008420936760416093, + "grad_norm": 1.2029480934143066, + "learning_rate": 9.999947220862615e-06, + "loss": 0.9419, + "step": 153 + }, + { + "epoch": 0.008475975562771755, + "grad_norm": 1.2104203701019287, + "learning_rate": 9.999945210411428e-06, + "loss": 0.9196, + "step": 154 + }, + { + "epoch": 0.008531014365127414, + "grad_norm": 1.1857126951217651, + "learning_rate": 9.999943162381991e-06, + "loss": 0.9421, + "step": 155 + }, + { + "epoch": 0.008586053167483076, + "grad_norm": 1.115027904510498, + "learning_rate": 9.999941076774319e-06, + "loss": 0.9634, + "step": 156 + }, + { + "epoch": 0.008641091969838737, + "grad_norm": 1.4227553606033325, + "learning_rate": 9.999938953588428e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.008696130772194397, + "grad_norm": 1.2913776636123657, + "learning_rate": 9.999936792824334e-06, + "loss": 0.9232, + "step": 158 + }, + { + "epoch": 0.008751169574550058, + "grad_norm": 1.2817318439483643, + "learning_rate": 9.999934594482055e-06, + "loss": 0.9691, + "step": 159 + }, + { + "epoch": 0.008806208376905718, + "grad_norm": 1.5647841691970825, + "learning_rate": 9.999932358561604e-06, + "loss": 1.1842, + "step": 160 + }, + { + "epoch": 0.00886124717926138, + "grad_norm": 1.368135929107666, + "learning_rate": 9.999930085063002e-06, + "loss": 1.0873, + "step": 161 + }, + { + "epoch": 0.00891628598161704, + "grad_norm": 1.2297240495681763, + "learning_rate": 9.999927773986262e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.0089713247839727, + "grad_norm": 1.0658279657363892, + "learning_rate": 9.999925425331405e-06, + "loss": 0.9008, + "step": 163 + }, + { + "epoch": 0.009026363586328362, + "grad_norm": 1.3484326601028442, + "learning_rate": 9.999923039098445e-06, + "loss": 1.0664, + "step": 164 + }, + { + "epoch": 0.009081402388684023, + "grad_norm": 1.1839075088500977, + "learning_rate": 9.999920615287401e-06, + "loss": 0.9257, + "step": 165 + }, + { + "epoch": 0.009136441191039683, + "grad_norm": 1.2757254838943481, + "learning_rate": 9.999918153898295e-06, + "loss": 0.9473, + "step": 166 + }, + { + "epoch": 0.009191479993395344, + "grad_norm": 1.2414579391479492, + "learning_rate": 9.99991565493114e-06, + "loss": 1.1091, + "step": 167 + }, + { + "epoch": 0.009246518795751004, + "grad_norm": 1.2802611589431763, + "learning_rate": 9.999913118385959e-06, + "loss": 1.063, + "step": 168 + }, + { + "epoch": 0.009301557598106665, + "grad_norm": 1.2055327892303467, + "learning_rate": 9.99991054426277e-06, + "loss": 0.8, + "step": 169 + }, + { + "epoch": 0.009356596400462327, + "grad_norm": 1.0391098260879517, + "learning_rate": 9.99990793256159e-06, + "loss": 0.8672, + "step": 170 + }, + { + "epoch": 0.009411635202817986, + "grad_norm": 1.131536602973938, + "learning_rate": 9.99990528328244e-06, + "loss": 0.9569, + "step": 171 + }, + { + "epoch": 0.009466674005173648, + "grad_norm": 1.164307951927185, + "learning_rate": 9.999902596425342e-06, + "loss": 0.9999, + "step": 172 + }, + { + "epoch": 0.009521712807529309, + "grad_norm": 1.2099504470825195, + "learning_rate": 9.999899871990313e-06, + "loss": 0.9994, + "step": 173 + }, + { + "epoch": 0.009576751609884969, + "grad_norm": 1.7294539213180542, + "learning_rate": 9.999897109977376e-06, + "loss": 1.0265, + "step": 174 + }, + { + "epoch": 0.00963179041224063, + "grad_norm": 1.3009883165359497, + "learning_rate": 9.99989431038655e-06, + "loss": 0.9022, + "step": 175 + }, + { + "epoch": 0.00968682921459629, + "grad_norm": 1.1014611721038818, + "learning_rate": 9.999891473217857e-06, + "loss": 0.8476, + "step": 176 + }, + { + "epoch": 0.009741868016951951, + "grad_norm": 1.2410900592803955, + "learning_rate": 9.99988859847132e-06, + "loss": 1.0272, + "step": 177 + }, + { + "epoch": 0.009796906819307612, + "grad_norm": 1.336348295211792, + "learning_rate": 9.999885686146957e-06, + "loss": 0.9456, + "step": 178 + }, + { + "epoch": 0.009851945621663272, + "grad_norm": 1.2931095361709595, + "learning_rate": 9.99988273624479e-06, + "loss": 0.9554, + "step": 179 + }, + { + "epoch": 0.009906984424018933, + "grad_norm": 1.2647838592529297, + "learning_rate": 9.999879748764845e-06, + "loss": 1.0394, + "step": 180 + }, + { + "epoch": 0.009962023226374595, + "grad_norm": 1.3485127687454224, + "learning_rate": 9.99987672370714e-06, + "loss": 1.1016, + "step": 181 + }, + { + "epoch": 0.010017062028730254, + "grad_norm": 1.110187292098999, + "learning_rate": 9.999873661071702e-06, + "loss": 0.946, + "step": 182 + }, + { + "epoch": 0.010072100831085916, + "grad_norm": 1.0991623401641846, + "learning_rate": 9.999870560858551e-06, + "loss": 1.0084, + "step": 183 + }, + { + "epoch": 0.010127139633441576, + "grad_norm": 1.049804449081421, + "learning_rate": 9.999867423067713e-06, + "loss": 0.8264, + "step": 184 + }, + { + "epoch": 0.010182178435797237, + "grad_norm": 1.0947058200836182, + "learning_rate": 9.999864247699207e-06, + "loss": 0.8884, + "step": 185 + }, + { + "epoch": 0.010237217238152898, + "grad_norm": 1.1147902011871338, + "learning_rate": 9.999861034753061e-06, + "loss": 0.9657, + "step": 186 + }, + { + "epoch": 0.010292256040508558, + "grad_norm": 1.260027527809143, + "learning_rate": 9.999857784229298e-06, + "loss": 1.0102, + "step": 187 + }, + { + "epoch": 0.01034729484286422, + "grad_norm": 1.1275582313537598, + "learning_rate": 9.999854496127942e-06, + "loss": 1.028, + "step": 188 + }, + { + "epoch": 0.01040233364521988, + "grad_norm": 1.1377174854278564, + "learning_rate": 9.999851170449018e-06, + "loss": 1.032, + "step": 189 + }, + { + "epoch": 0.01045737244757554, + "grad_norm": 1.1734225749969482, + "learning_rate": 9.999847807192552e-06, + "loss": 1.0009, + "step": 190 + }, + { + "epoch": 0.010512411249931202, + "grad_norm": 1.1934596300125122, + "learning_rate": 9.999844406358565e-06, + "loss": 1.0432, + "step": 191 + }, + { + "epoch": 0.010567450052286861, + "grad_norm": 1.0638024806976318, + "learning_rate": 9.99984096794709e-06, + "loss": 0.8651, + "step": 192 + }, + { + "epoch": 0.010622488854642523, + "grad_norm": 1.2381829023361206, + "learning_rate": 9.999837491958147e-06, + "loss": 1.0088, + "step": 193 + }, + { + "epoch": 0.010677527656998184, + "grad_norm": 1.030246615409851, + "learning_rate": 9.999833978391763e-06, + "loss": 0.9488, + "step": 194 + }, + { + "epoch": 0.010732566459353844, + "grad_norm": 1.1640657186508179, + "learning_rate": 9.999830427247965e-06, + "loss": 1.0588, + "step": 195 + }, + { + "epoch": 0.010787605261709505, + "grad_norm": 1.0431616306304932, + "learning_rate": 9.99982683852678e-06, + "loss": 0.8728, + "step": 196 + }, + { + "epoch": 0.010842644064065167, + "grad_norm": 1.032263159751892, + "learning_rate": 9.999823212228235e-06, + "loss": 0.9498, + "step": 197 + }, + { + "epoch": 0.010897682866420826, + "grad_norm": 1.1383745670318604, + "learning_rate": 9.999819548352358e-06, + "loss": 0.9498, + "step": 198 + }, + { + "epoch": 0.010952721668776488, + "grad_norm": 1.1324639320373535, + "learning_rate": 9.999815846899175e-06, + "loss": 1.0432, + "step": 199 + }, + { + "epoch": 0.011007760471132147, + "grad_norm": 1.188672661781311, + "learning_rate": 9.999812107868714e-06, + "loss": 0.982, + "step": 200 + }, + { + "epoch": 0.011062799273487809, + "grad_norm": 1.1011098623275757, + "learning_rate": 9.999808331261005e-06, + "loss": 0.9587, + "step": 201 + }, + { + "epoch": 0.01111783807584347, + "grad_norm": 1.1782938241958618, + "learning_rate": 9.999804517076073e-06, + "loss": 1.0659, + "step": 202 + }, + { + "epoch": 0.01117287687819913, + "grad_norm": 1.0520117282867432, + "learning_rate": 9.99980066531395e-06, + "loss": 1.0056, + "step": 203 + }, + { + "epoch": 0.011227915680554791, + "grad_norm": 1.1584919691085815, + "learning_rate": 9.999796775974663e-06, + "loss": 0.9435, + "step": 204 + }, + { + "epoch": 0.011282954482910452, + "grad_norm": 1.2201849222183228, + "learning_rate": 9.999792849058242e-06, + "loss": 1.0562, + "step": 205 + }, + { + "epoch": 0.011337993285266112, + "grad_norm": 1.2985976934432983, + "learning_rate": 9.999788884564715e-06, + "loss": 1.0126, + "step": 206 + }, + { + "epoch": 0.011393032087621774, + "grad_norm": 0.9926307201385498, + "learning_rate": 9.999784882494115e-06, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.011448070889977435, + "grad_norm": 1.103365182876587, + "learning_rate": 9.99978084284647e-06, + "loss": 0.9833, + "step": 208 + }, + { + "epoch": 0.011503109692333095, + "grad_norm": 1.1798462867736816, + "learning_rate": 9.99977676562181e-06, + "loss": 0.8479, + "step": 209 + }, + { + "epoch": 0.011558148494688756, + "grad_norm": 1.2887194156646729, + "learning_rate": 9.999772650820168e-06, + "loss": 0.9606, + "step": 210 + }, + { + "epoch": 0.011613187297044416, + "grad_norm": 1.1120634078979492, + "learning_rate": 9.99976849844157e-06, + "loss": 0.9604, + "step": 211 + }, + { + "epoch": 0.011668226099400077, + "grad_norm": 1.1248979568481445, + "learning_rate": 9.999764308486052e-06, + "loss": 0.9428, + "step": 212 + }, + { + "epoch": 0.011723264901755738, + "grad_norm": 1.274610161781311, + "learning_rate": 9.999760080953643e-06, + "loss": 0.9044, + "step": 213 + }, + { + "epoch": 0.011778303704111398, + "grad_norm": 1.1746865510940552, + "learning_rate": 9.999755815844377e-06, + "loss": 0.9114, + "step": 214 + }, + { + "epoch": 0.01183334250646706, + "grad_norm": 1.2531086206436157, + "learning_rate": 9.999751513158282e-06, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.01188838130882272, + "grad_norm": 1.0789539813995361, + "learning_rate": 9.999747172895395e-06, + "loss": 0.9794, + "step": 216 + }, + { + "epoch": 0.01194342011117838, + "grad_norm": 1.1805329322814941, + "learning_rate": 9.999742795055746e-06, + "loss": 0.9602, + "step": 217 + }, + { + "epoch": 0.011998458913534042, + "grad_norm": 2.309329032897949, + "learning_rate": 9.99973837963937e-06, + "loss": 0.9482, + "step": 218 + }, + { + "epoch": 0.012053497715889702, + "grad_norm": 1.2379088401794434, + "learning_rate": 9.999733926646296e-06, + "loss": 1.0237, + "step": 219 + }, + { + "epoch": 0.012108536518245363, + "grad_norm": 1.1581377983093262, + "learning_rate": 9.999729436076562e-06, + "loss": 1.0583, + "step": 220 + }, + { + "epoch": 0.012163575320601024, + "grad_norm": 1.3006727695465088, + "learning_rate": 9.999724907930199e-06, + "loss": 0.9581, + "step": 221 + }, + { + "epoch": 0.012218614122956684, + "grad_norm": 1.3215982913970947, + "learning_rate": 9.999720342207243e-06, + "loss": 0.9438, + "step": 222 + }, + { + "epoch": 0.012273652925312345, + "grad_norm": 1.1107337474822998, + "learning_rate": 9.999715738907727e-06, + "loss": 0.9987, + "step": 223 + }, + { + "epoch": 0.012328691727668007, + "grad_norm": 1.0745457410812378, + "learning_rate": 9.999711098031685e-06, + "loss": 0.9637, + "step": 224 + }, + { + "epoch": 0.012383730530023666, + "grad_norm": 1.110861897468567, + "learning_rate": 9.999706419579154e-06, + "loss": 1.0225, + "step": 225 + }, + { + "epoch": 0.012438769332379328, + "grad_norm": 1.0755527019500732, + "learning_rate": 9.999701703550167e-06, + "loss": 1.0204, + "step": 226 + }, + { + "epoch": 0.012493808134734987, + "grad_norm": 1.1694976091384888, + "learning_rate": 9.99969694994476e-06, + "loss": 1.0566, + "step": 227 + }, + { + "epoch": 0.012548846937090649, + "grad_norm": 1.455856442451477, + "learning_rate": 9.99969215876297e-06, + "loss": 0.9397, + "step": 228 + }, + { + "epoch": 0.01260388573944631, + "grad_norm": 1.0707073211669922, + "learning_rate": 9.99968733000483e-06, + "loss": 0.8286, + "step": 229 + }, + { + "epoch": 0.01265892454180197, + "grad_norm": 1.189548134803772, + "learning_rate": 9.99968246367038e-06, + "loss": 0.8762, + "step": 230 + }, + { + "epoch": 0.012713963344157631, + "grad_norm": 1.1439214944839478, + "learning_rate": 9.999677559759655e-06, + "loss": 0.9187, + "step": 231 + }, + { + "epoch": 0.012769002146513293, + "grad_norm": 1.2329761981964111, + "learning_rate": 9.999672618272691e-06, + "loss": 1.0374, + "step": 232 + }, + { + "epoch": 0.012824040948868952, + "grad_norm": 1.1545134782791138, + "learning_rate": 9.999667639209527e-06, + "loss": 0.9343, + "step": 233 + }, + { + "epoch": 0.012879079751224614, + "grad_norm": 1.0946775674819946, + "learning_rate": 9.999662622570198e-06, + "loss": 0.9568, + "step": 234 + }, + { + "epoch": 0.012934118553580273, + "grad_norm": 1.2099589109420776, + "learning_rate": 9.999657568354743e-06, + "loss": 1.0364, + "step": 235 + }, + { + "epoch": 0.012989157355935935, + "grad_norm": 1.09062922000885, + "learning_rate": 9.999652476563202e-06, + "loss": 1.0289, + "step": 236 + }, + { + "epoch": 0.013044196158291596, + "grad_norm": 1.154557228088379, + "learning_rate": 9.999647347195612e-06, + "loss": 0.9925, + "step": 237 + }, + { + "epoch": 0.013099234960647256, + "grad_norm": 1.025374174118042, + "learning_rate": 9.999642180252008e-06, + "loss": 0.9346, + "step": 238 + }, + { + "epoch": 0.013154273763002917, + "grad_norm": 1.1473641395568848, + "learning_rate": 9.999636975732433e-06, + "loss": 1.0244, + "step": 239 + }, + { + "epoch": 0.013209312565358578, + "grad_norm": 1.0421240329742432, + "learning_rate": 9.999631733636923e-06, + "loss": 0.9368, + "step": 240 + }, + { + "epoch": 0.013264351367714238, + "grad_norm": 1.1076610088348389, + "learning_rate": 9.99962645396552e-06, + "loss": 1.0276, + "step": 241 + }, + { + "epoch": 0.0133193901700699, + "grad_norm": 1.143559455871582, + "learning_rate": 9.999621136718266e-06, + "loss": 0.9626, + "step": 242 + }, + { + "epoch": 0.01337442897242556, + "grad_norm": 1.0958378314971924, + "learning_rate": 9.999615781895195e-06, + "loss": 1.0254, + "step": 243 + }, + { + "epoch": 0.01342946777478122, + "grad_norm": 1.117688536643982, + "learning_rate": 9.99961038949635e-06, + "loss": 0.9685, + "step": 244 + }, + { + "epoch": 0.013484506577136882, + "grad_norm": 1.1645647287368774, + "learning_rate": 9.999604959521771e-06, + "loss": 1.0666, + "step": 245 + }, + { + "epoch": 0.013539545379492542, + "grad_norm": 1.1238516569137573, + "learning_rate": 9.999599491971502e-06, + "loss": 1.0252, + "step": 246 + }, + { + "epoch": 0.013594584181848203, + "grad_norm": 1.0196914672851562, + "learning_rate": 9.999593986845579e-06, + "loss": 0.9389, + "step": 247 + }, + { + "epoch": 0.013649622984203864, + "grad_norm": 1.0231372117996216, + "learning_rate": 9.999588444144049e-06, + "loss": 0.8786, + "step": 248 + }, + { + "epoch": 0.013704661786559524, + "grad_norm": 1.2504147291183472, + "learning_rate": 9.999582863866947e-06, + "loss": 1.0969, + "step": 249 + }, + { + "epoch": 0.013759700588915185, + "grad_norm": 1.1123549938201904, + "learning_rate": 9.99957724601432e-06, + "loss": 0.8833, + "step": 250 + }, + { + "epoch": 0.013814739391270847, + "grad_norm": 1.1068202257156372, + "learning_rate": 9.999571590586208e-06, + "loss": 0.9709, + "step": 251 + }, + { + "epoch": 0.013869778193626506, + "grad_norm": 0.9891651272773743, + "learning_rate": 9.999565897582655e-06, + "loss": 0.8598, + "step": 252 + }, + { + "epoch": 0.013924816995982168, + "grad_norm": 0.9866491556167603, + "learning_rate": 9.999560167003703e-06, + "loss": 0.8101, + "step": 253 + }, + { + "epoch": 0.013979855798337828, + "grad_norm": 1.0862594842910767, + "learning_rate": 9.999554398849396e-06, + "loss": 0.9411, + "step": 254 + }, + { + "epoch": 0.014034894600693489, + "grad_norm": 1.1898949146270752, + "learning_rate": 9.999548593119774e-06, + "loss": 0.9548, + "step": 255 + }, + { + "epoch": 0.01408993340304915, + "grad_norm": 1.2167880535125732, + "learning_rate": 9.999542749814886e-06, + "loss": 1.0302, + "step": 256 + }, + { + "epoch": 0.01414497220540481, + "grad_norm": 1.0784146785736084, + "learning_rate": 9.999536868934771e-06, + "loss": 0.8875, + "step": 257 + }, + { + "epoch": 0.014200011007760471, + "grad_norm": 1.1128027439117432, + "learning_rate": 9.999530950479475e-06, + "loss": 0.9498, + "step": 258 + }, + { + "epoch": 0.014255049810116133, + "grad_norm": 1.1311595439910889, + "learning_rate": 9.999524994449044e-06, + "loss": 0.9035, + "step": 259 + }, + { + "epoch": 0.014310088612471792, + "grad_norm": 1.225615382194519, + "learning_rate": 9.999519000843521e-06, + "loss": 1.0104, + "step": 260 + }, + { + "epoch": 0.014365127414827454, + "grad_norm": 1.2347793579101562, + "learning_rate": 9.99951296966295e-06, + "loss": 1.0288, + "step": 261 + }, + { + "epoch": 0.014420166217183113, + "grad_norm": 1.1837103366851807, + "learning_rate": 9.99950690090738e-06, + "loss": 0.9553, + "step": 262 + }, + { + "epoch": 0.014475205019538775, + "grad_norm": 1.1985397338867188, + "learning_rate": 9.999500794576852e-06, + "loss": 0.9561, + "step": 263 + }, + { + "epoch": 0.014530243821894436, + "grad_norm": 1.036928415298462, + "learning_rate": 9.999494650671418e-06, + "loss": 0.8906, + "step": 264 + }, + { + "epoch": 0.014585282624250096, + "grad_norm": 1.0797842741012573, + "learning_rate": 9.999488469191116e-06, + "loss": 0.8975, + "step": 265 + }, + { + "epoch": 0.014640321426605757, + "grad_norm": 1.0571156740188599, + "learning_rate": 9.999482250136e-06, + "loss": 0.9334, + "step": 266 + }, + { + "epoch": 0.014695360228961419, + "grad_norm": 1.2065023183822632, + "learning_rate": 9.999475993506114e-06, + "loss": 0.8986, + "step": 267 + }, + { + "epoch": 0.014750399031317078, + "grad_norm": 1.201586127281189, + "learning_rate": 9.999469699301502e-06, + "loss": 0.9192, + "step": 268 + }, + { + "epoch": 0.01480543783367274, + "grad_norm": 1.0470168590545654, + "learning_rate": 9.999463367522216e-06, + "loss": 0.8604, + "step": 269 + }, + { + "epoch": 0.0148604766360284, + "grad_norm": 1.1142147779464722, + "learning_rate": 9.9994569981683e-06, + "loss": 0.9847, + "step": 270 + }, + { + "epoch": 0.01491551543838406, + "grad_norm": 1.0352061986923218, + "learning_rate": 9.999450591239805e-06, + "loss": 0.8927, + "step": 271 + }, + { + "epoch": 0.014970554240739722, + "grad_norm": 1.0353184938430786, + "learning_rate": 9.999444146736779e-06, + "loss": 0.8435, + "step": 272 + }, + { + "epoch": 0.015025593043095382, + "grad_norm": 1.2091951370239258, + "learning_rate": 9.999437664659267e-06, + "loss": 0.8959, + "step": 273 + }, + { + "epoch": 0.015080631845451043, + "grad_norm": 1.006361722946167, + "learning_rate": 9.999431145007319e-06, + "loss": 0.8579, + "step": 274 + }, + { + "epoch": 0.015135670647806704, + "grad_norm": 1.1265509128570557, + "learning_rate": 9.999424587780985e-06, + "loss": 0.8808, + "step": 275 + }, + { + "epoch": 0.015190709450162364, + "grad_norm": 1.060882568359375, + "learning_rate": 9.999417992980317e-06, + "loss": 1.044, + "step": 276 + }, + { + "epoch": 0.015245748252518026, + "grad_norm": 1.0216747522354126, + "learning_rate": 9.999411360605358e-06, + "loss": 0.7773, + "step": 277 + }, + { + "epoch": 0.015300787054873685, + "grad_norm": 1.1382462978363037, + "learning_rate": 9.999404690656163e-06, + "loss": 0.8954, + "step": 278 + }, + { + "epoch": 0.015355825857229347, + "grad_norm": 1.113815188407898, + "learning_rate": 9.99939798313278e-06, + "loss": 0.8143, + "step": 279 + }, + { + "epoch": 0.015410864659585008, + "grad_norm": 1.123530387878418, + "learning_rate": 9.99939123803526e-06, + "loss": 0.8872, + "step": 280 + }, + { + "epoch": 0.015465903461940668, + "grad_norm": 1.0873669385910034, + "learning_rate": 9.999384455363656e-06, + "loss": 1.008, + "step": 281 + }, + { + "epoch": 0.015520942264296329, + "grad_norm": 1.5956637859344482, + "learning_rate": 9.999377635118014e-06, + "loss": 0.9456, + "step": 282 + }, + { + "epoch": 0.01557598106665199, + "grad_norm": 1.1471425294876099, + "learning_rate": 9.999370777298389e-06, + "loss": 0.9897, + "step": 283 + }, + { + "epoch": 0.01563101986900765, + "grad_norm": 0.9960193634033203, + "learning_rate": 9.999363881904831e-06, + "loss": 0.8196, + "step": 284 + }, + { + "epoch": 0.01568605867136331, + "grad_norm": 1.1033951044082642, + "learning_rate": 9.999356948937393e-06, + "loss": 0.879, + "step": 285 + }, + { + "epoch": 0.015741097473718973, + "grad_norm": 1.157765507698059, + "learning_rate": 9.999349978396126e-06, + "loss": 1.0116, + "step": 286 + }, + { + "epoch": 0.015796136276074634, + "grad_norm": 1.0472352504730225, + "learning_rate": 9.999342970281084e-06, + "loss": 0.8657, + "step": 287 + }, + { + "epoch": 0.015851175078430292, + "grad_norm": 1.1346659660339355, + "learning_rate": 9.999335924592315e-06, + "loss": 0.8482, + "step": 288 + }, + { + "epoch": 0.015906213880785953, + "grad_norm": 1.1164487600326538, + "learning_rate": 9.999328841329879e-06, + "loss": 1.0542, + "step": 289 + }, + { + "epoch": 0.015961252683141615, + "grad_norm": 1.1890591382980347, + "learning_rate": 9.999321720493825e-06, + "loss": 0.9598, + "step": 290 + }, + { + "epoch": 0.016016291485497276, + "grad_norm": 1.0419867038726807, + "learning_rate": 9.999314562084205e-06, + "loss": 0.9548, + "step": 291 + }, + { + "epoch": 0.016071330287852938, + "grad_norm": 1.0652042627334595, + "learning_rate": 9.999307366101077e-06, + "loss": 0.9359, + "step": 292 + }, + { + "epoch": 0.016126369090208596, + "grad_norm": 1.0166404247283936, + "learning_rate": 9.999300132544492e-06, + "loss": 0.9276, + "step": 293 + }, + { + "epoch": 0.016181407892564257, + "grad_norm": 1.1638866662979126, + "learning_rate": 9.999292861414507e-06, + "loss": 0.957, + "step": 294 + }, + { + "epoch": 0.01623644669491992, + "grad_norm": 1.5505993366241455, + "learning_rate": 9.999285552711173e-06, + "loss": 0.9878, + "step": 295 + }, + { + "epoch": 0.01629148549727558, + "grad_norm": 1.177262783050537, + "learning_rate": 9.999278206434549e-06, + "loss": 0.8631, + "step": 296 + }, + { + "epoch": 0.01634652429963124, + "grad_norm": 1.8578168153762817, + "learning_rate": 9.999270822584687e-06, + "loss": 0.9684, + "step": 297 + }, + { + "epoch": 0.0164015631019869, + "grad_norm": 1.2617360353469849, + "learning_rate": 9.999263401161643e-06, + "loss": 1.014, + "step": 298 + }, + { + "epoch": 0.01645660190434256, + "grad_norm": 0.9740132689476013, + "learning_rate": 9.999255942165475e-06, + "loss": 0.8606, + "step": 299 + }, + { + "epoch": 0.016511640706698222, + "grad_norm": 0.9821745753288269, + "learning_rate": 9.999248445596238e-06, + "loss": 0.8241, + "step": 300 + }, + { + "epoch": 0.016566679509053883, + "grad_norm": 1.0200445652008057, + "learning_rate": 9.999240911453986e-06, + "loss": 0.8256, + "step": 301 + }, + { + "epoch": 0.016621718311409545, + "grad_norm": 1.4100390672683716, + "learning_rate": 9.999233339738779e-06, + "loss": 0.9057, + "step": 302 + }, + { + "epoch": 0.016676757113765206, + "grad_norm": 1.056544303894043, + "learning_rate": 9.99922573045067e-06, + "loss": 1.0808, + "step": 303 + }, + { + "epoch": 0.016731795916120864, + "grad_norm": 0.9271026253700256, + "learning_rate": 9.99921808358972e-06, + "loss": 0.878, + "step": 304 + }, + { + "epoch": 0.016786834718476525, + "grad_norm": 0.9864157438278198, + "learning_rate": 9.999210399155987e-06, + "loss": 0.9198, + "step": 305 + }, + { + "epoch": 0.016841873520832187, + "grad_norm": 1.093995451927185, + "learning_rate": 9.999202677149525e-06, + "loss": 0.9794, + "step": 306 + }, + { + "epoch": 0.016896912323187848, + "grad_norm": 0.9717912077903748, + "learning_rate": 9.999194917570395e-06, + "loss": 0.8764, + "step": 307 + }, + { + "epoch": 0.01695195112554351, + "grad_norm": 1.0026428699493408, + "learning_rate": 9.999187120418653e-06, + "loss": 0.8526, + "step": 308 + }, + { + "epoch": 0.017006989927899167, + "grad_norm": 1.122870922088623, + "learning_rate": 9.999179285694359e-06, + "loss": 0.9773, + "step": 309 + }, + { + "epoch": 0.01706202873025483, + "grad_norm": 1.0522836446762085, + "learning_rate": 9.999171413397572e-06, + "loss": 1.0183, + "step": 310 + }, + { + "epoch": 0.01711706753261049, + "grad_norm": 0.9303658604621887, + "learning_rate": 9.99916350352835e-06, + "loss": 0.8402, + "step": 311 + }, + { + "epoch": 0.01717210633496615, + "grad_norm": 0.9606096148490906, + "learning_rate": 9.999155556086755e-06, + "loss": 0.9692, + "step": 312 + }, + { + "epoch": 0.017227145137321813, + "grad_norm": 1.176992416381836, + "learning_rate": 9.999147571072844e-06, + "loss": 0.8172, + "step": 313 + }, + { + "epoch": 0.017282183939677474, + "grad_norm": 1.1948801279067993, + "learning_rate": 9.999139548486678e-06, + "loss": 1.0205, + "step": 314 + }, + { + "epoch": 0.017337222742033132, + "grad_norm": 1.0064897537231445, + "learning_rate": 9.999131488328318e-06, + "loss": 0.9479, + "step": 315 + }, + { + "epoch": 0.017392261544388794, + "grad_norm": 1.048242449760437, + "learning_rate": 9.999123390597822e-06, + "loss": 0.9862, + "step": 316 + }, + { + "epoch": 0.017447300346744455, + "grad_norm": 1.12875497341156, + "learning_rate": 9.999115255295256e-06, + "loss": 0.9743, + "step": 317 + }, + { + "epoch": 0.017502339149100116, + "grad_norm": 1.0607460737228394, + "learning_rate": 9.999107082420674e-06, + "loss": 0.8878, + "step": 318 + }, + { + "epoch": 0.017557377951455778, + "grad_norm": 1.1480191946029663, + "learning_rate": 9.999098871974144e-06, + "loss": 0.8769, + "step": 319 + }, + { + "epoch": 0.017612416753811436, + "grad_norm": 1.1150004863739014, + "learning_rate": 9.999090623955724e-06, + "loss": 0.8615, + "step": 320 + }, + { + "epoch": 0.017667455556167097, + "grad_norm": 1.137839913368225, + "learning_rate": 9.999082338365478e-06, + "loss": 0.9703, + "step": 321 + }, + { + "epoch": 0.01772249435852276, + "grad_norm": 1.0883489847183228, + "learning_rate": 9.999074015203467e-06, + "loss": 0.9273, + "step": 322 + }, + { + "epoch": 0.01777753316087842, + "grad_norm": 1.0999557971954346, + "learning_rate": 9.999065654469752e-06, + "loss": 0.9605, + "step": 323 + }, + { + "epoch": 0.01783257196323408, + "grad_norm": 0.9911689758300781, + "learning_rate": 9.999057256164401e-06, + "loss": 0.9117, + "step": 324 + }, + { + "epoch": 0.01788761076558974, + "grad_norm": 1.040933609008789, + "learning_rate": 9.999048820287472e-06, + "loss": 0.9229, + "step": 325 + }, + { + "epoch": 0.0179426495679454, + "grad_norm": 1.4341392517089844, + "learning_rate": 9.999040346839031e-06, + "loss": 1.0718, + "step": 326 + }, + { + "epoch": 0.017997688370301062, + "grad_norm": 1.0246332883834839, + "learning_rate": 9.99903183581914e-06, + "loss": 0.9617, + "step": 327 + }, + { + "epoch": 0.018052727172656723, + "grad_norm": 10.162322998046875, + "learning_rate": 9.999023287227863e-06, + "loss": 1.0391, + "step": 328 + }, + { + "epoch": 0.018107765975012385, + "grad_norm": 1.3370027542114258, + "learning_rate": 9.999014701065266e-06, + "loss": 1.0211, + "step": 329 + }, + { + "epoch": 0.018162804777368046, + "grad_norm": 1.0146219730377197, + "learning_rate": 9.999006077331413e-06, + "loss": 0.8611, + "step": 330 + }, + { + "epoch": 0.018217843579723704, + "grad_norm": 1.0899269580841064, + "learning_rate": 9.998997416026368e-06, + "loss": 0.9209, + "step": 331 + }, + { + "epoch": 0.018272882382079365, + "grad_norm": 1.1343204975128174, + "learning_rate": 9.998988717150198e-06, + "loss": 0.9405, + "step": 332 + }, + { + "epoch": 0.018327921184435027, + "grad_norm": 1.2308380603790283, + "learning_rate": 9.998979980702965e-06, + "loss": 0.9579, + "step": 333 + }, + { + "epoch": 0.018382959986790688, + "grad_norm": 1.1433519124984741, + "learning_rate": 9.998971206684737e-06, + "loss": 1.0045, + "step": 334 + }, + { + "epoch": 0.01843799878914635, + "grad_norm": 1.0585781335830688, + "learning_rate": 9.99896239509558e-06, + "loss": 0.9171, + "step": 335 + }, + { + "epoch": 0.018493037591502007, + "grad_norm": 1.2735164165496826, + "learning_rate": 9.99895354593556e-06, + "loss": 1.1001, + "step": 336 + }, + { + "epoch": 0.01854807639385767, + "grad_norm": 1.2905755043029785, + "learning_rate": 9.998944659204744e-06, + "loss": 1.0294, + "step": 337 + }, + { + "epoch": 0.01860311519621333, + "grad_norm": 1.1442075967788696, + "learning_rate": 9.998935734903198e-06, + "loss": 0.9385, + "step": 338 + }, + { + "epoch": 0.01865815399856899, + "grad_norm": 1.1005232334136963, + "learning_rate": 9.998926773030987e-06, + "loss": 1.026, + "step": 339 + }, + { + "epoch": 0.018713192800924653, + "grad_norm": 1.2770785093307495, + "learning_rate": 9.998917773588182e-06, + "loss": 1.0015, + "step": 340 + }, + { + "epoch": 0.01876823160328031, + "grad_norm": 1.0963070392608643, + "learning_rate": 9.998908736574849e-06, + "loss": 0.9347, + "step": 341 + }, + { + "epoch": 0.018823270405635972, + "grad_norm": 1.10364830493927, + "learning_rate": 9.998899661991055e-06, + "loss": 0.869, + "step": 342 + }, + { + "epoch": 0.018878309207991634, + "grad_norm": 1.0364975929260254, + "learning_rate": 9.99889054983687e-06, + "loss": 0.9855, + "step": 343 + }, + { + "epoch": 0.018933348010347295, + "grad_norm": 1.104702115058899, + "learning_rate": 9.998881400112362e-06, + "loss": 0.9555, + "step": 344 + }, + { + "epoch": 0.018988386812702956, + "grad_norm": 0.9957441687583923, + "learning_rate": 9.998872212817599e-06, + "loss": 0.9634, + "step": 345 + }, + { + "epoch": 0.019043425615058618, + "grad_norm": 1.262271523475647, + "learning_rate": 9.998862987952651e-06, + "loss": 1.0133, + "step": 346 + }, + { + "epoch": 0.019098464417414276, + "grad_norm": 1.2075226306915283, + "learning_rate": 9.998853725517587e-06, + "loss": 1.0588, + "step": 347 + }, + { + "epoch": 0.019153503219769937, + "grad_norm": 1.0609898567199707, + "learning_rate": 9.998844425512477e-06, + "loss": 0.9952, + "step": 348 + }, + { + "epoch": 0.0192085420221256, + "grad_norm": 1.1930195093154907, + "learning_rate": 9.998835087937389e-06, + "loss": 0.9617, + "step": 349 + }, + { + "epoch": 0.01926358082448126, + "grad_norm": 1.2359932661056519, + "learning_rate": 9.998825712792396e-06, + "loss": 0.8768, + "step": 350 + }, + { + "epoch": 0.01931861962683692, + "grad_norm": 0.9984115362167358, + "learning_rate": 9.998816300077566e-06, + "loss": 0.8205, + "step": 351 + }, + { + "epoch": 0.01937365842919258, + "grad_norm": 1.6853677034378052, + "learning_rate": 9.998806849792972e-06, + "loss": 0.9066, + "step": 352 + }, + { + "epoch": 0.01942869723154824, + "grad_norm": 1.2869856357574463, + "learning_rate": 9.998797361938683e-06, + "loss": 1.0054, + "step": 353 + }, + { + "epoch": 0.019483736033903902, + "grad_norm": 1.2791584730148315, + "learning_rate": 9.99878783651477e-06, + "loss": 0.7627, + "step": 354 + }, + { + "epoch": 0.019538774836259563, + "grad_norm": 1.0795867443084717, + "learning_rate": 9.998778273521307e-06, + "loss": 0.9343, + "step": 355 + }, + { + "epoch": 0.019593813638615225, + "grad_norm": 1.0926088094711304, + "learning_rate": 9.998768672958365e-06, + "loss": 0.943, + "step": 356 + }, + { + "epoch": 0.019648852440970886, + "grad_norm": 1.0530847311019897, + "learning_rate": 9.998759034826015e-06, + "loss": 0.9656, + "step": 357 + }, + { + "epoch": 0.019703891243326544, + "grad_norm": 1.1793400049209595, + "learning_rate": 9.99874935912433e-06, + "loss": 0.9799, + "step": 358 + }, + { + "epoch": 0.019758930045682205, + "grad_norm": 1.0726191997528076, + "learning_rate": 9.998739645853383e-06, + "loss": 0.8739, + "step": 359 + }, + { + "epoch": 0.019813968848037867, + "grad_norm": 1.0488981008529663, + "learning_rate": 9.998729895013246e-06, + "loss": 0.8986, + "step": 360 + }, + { + "epoch": 0.019869007650393528, + "grad_norm": 1.8267477750778198, + "learning_rate": 9.998720106603993e-06, + "loss": 0.9175, + "step": 361 + }, + { + "epoch": 0.01992404645274919, + "grad_norm": 0.9868306517601013, + "learning_rate": 9.9987102806257e-06, + "loss": 0.9609, + "step": 362 + }, + { + "epoch": 0.019979085255104848, + "grad_norm": 1.0171183347702026, + "learning_rate": 9.998700417078438e-06, + "loss": 0.8904, + "step": 363 + }, + { + "epoch": 0.02003412405746051, + "grad_norm": 0.9800812602043152, + "learning_rate": 9.998690515962282e-06, + "loss": 0.8344, + "step": 364 + }, + { + "epoch": 0.02008916285981617, + "grad_norm": 1.024707317352295, + "learning_rate": 9.998680577277304e-06, + "loss": 0.9026, + "step": 365 + }, + { + "epoch": 0.02014420166217183, + "grad_norm": 1.1056619882583618, + "learning_rate": 9.998670601023584e-06, + "loss": 1.017, + "step": 366 + }, + { + "epoch": 0.020199240464527493, + "grad_norm": 1.0555908679962158, + "learning_rate": 9.998660587201191e-06, + "loss": 0.9627, + "step": 367 + }, + { + "epoch": 0.02025427926688315, + "grad_norm": 0.9502031803131104, + "learning_rate": 9.998650535810204e-06, + "loss": 0.935, + "step": 368 + }, + { + "epoch": 0.020309318069238812, + "grad_norm": 1.0355613231658936, + "learning_rate": 9.998640446850699e-06, + "loss": 0.9946, + "step": 369 + }, + { + "epoch": 0.020364356871594474, + "grad_norm": 0.9906355142593384, + "learning_rate": 9.99863032032275e-06, + "loss": 0.9389, + "step": 370 + }, + { + "epoch": 0.020419395673950135, + "grad_norm": 0.9483911395072937, + "learning_rate": 9.99862015622643e-06, + "loss": 0.979, + "step": 371 + }, + { + "epoch": 0.020474434476305797, + "grad_norm": 0.9769986271858215, + "learning_rate": 9.998609954561822e-06, + "loss": 0.8972, + "step": 372 + }, + { + "epoch": 0.020529473278661458, + "grad_norm": 1.1682699918746948, + "learning_rate": 9.998599715329e-06, + "loss": 0.943, + "step": 373 + }, + { + "epoch": 0.020584512081017116, + "grad_norm": 1.007912516593933, + "learning_rate": 9.99858943852804e-06, + "loss": 0.8825, + "step": 374 + }, + { + "epoch": 0.020639550883372777, + "grad_norm": 0.9788785576820374, + "learning_rate": 9.99857912415902e-06, + "loss": 0.9667, + "step": 375 + }, + { + "epoch": 0.02069458968572844, + "grad_norm": 1.0804275274276733, + "learning_rate": 9.998568772222017e-06, + "loss": 1.0026, + "step": 376 + }, + { + "epoch": 0.0207496284880841, + "grad_norm": 1.0859237909317017, + "learning_rate": 9.998558382717109e-06, + "loss": 0.9592, + "step": 377 + }, + { + "epoch": 0.02080466729043976, + "grad_norm": 1.2925337553024292, + "learning_rate": 9.998547955644373e-06, + "loss": 0.9067, + "step": 378 + }, + { + "epoch": 0.02085970609279542, + "grad_norm": 0.9853373765945435, + "learning_rate": 9.99853749100389e-06, + "loss": 0.9538, + "step": 379 + }, + { + "epoch": 0.02091474489515108, + "grad_norm": 1.0461076498031616, + "learning_rate": 9.998526988795738e-06, + "loss": 0.9261, + "step": 380 + }, + { + "epoch": 0.020969783697506742, + "grad_norm": 1.024559497833252, + "learning_rate": 9.998516449019995e-06, + "loss": 0.9117, + "step": 381 + }, + { + "epoch": 0.021024822499862404, + "grad_norm": 1.1474825143814087, + "learning_rate": 9.998505871676739e-06, + "loss": 1.0177, + "step": 382 + }, + { + "epoch": 0.021079861302218065, + "grad_norm": 0.9587596654891968, + "learning_rate": 9.998495256766051e-06, + "loss": 0.8809, + "step": 383 + }, + { + "epoch": 0.021134900104573723, + "grad_norm": 0.9505122303962708, + "learning_rate": 9.998484604288013e-06, + "loss": 0.9266, + "step": 384 + }, + { + "epoch": 0.021189938906929384, + "grad_norm": 0.9625647664070129, + "learning_rate": 9.9984739142427e-06, + "loss": 0.9073, + "step": 385 + }, + { + "epoch": 0.021244977709285046, + "grad_norm": 0.9650934338569641, + "learning_rate": 9.998463186630196e-06, + "loss": 0.9042, + "step": 386 + }, + { + "epoch": 0.021300016511640707, + "grad_norm": 1.0289491415023804, + "learning_rate": 9.99845242145058e-06, + "loss": 0.929, + "step": 387 + }, + { + "epoch": 0.02135505531399637, + "grad_norm": 0.9543869495391846, + "learning_rate": 9.998441618703935e-06, + "loss": 0.9406, + "step": 388 + }, + { + "epoch": 0.02141009411635203, + "grad_norm": 0.9276942610740662, + "learning_rate": 9.99843077839034e-06, + "loss": 0.8982, + "step": 389 + }, + { + "epoch": 0.021465132918707688, + "grad_norm": 0.9264664053916931, + "learning_rate": 9.998419900509877e-06, + "loss": 0.7255, + "step": 390 + }, + { + "epoch": 0.02152017172106335, + "grad_norm": 0.9961187243461609, + "learning_rate": 9.998408985062628e-06, + "loss": 0.9826, + "step": 391 + }, + { + "epoch": 0.02157521052341901, + "grad_norm": 0.966596245765686, + "learning_rate": 9.998398032048676e-06, + "loss": 0.8159, + "step": 392 + }, + { + "epoch": 0.021630249325774672, + "grad_norm": 1.1336095333099365, + "learning_rate": 9.998387041468102e-06, + "loss": 0.9289, + "step": 393 + }, + { + "epoch": 0.021685288128130333, + "grad_norm": 1.0453619956970215, + "learning_rate": 9.998376013320989e-06, + "loss": 0.8816, + "step": 394 + }, + { + "epoch": 0.02174032693048599, + "grad_norm": 0.8961821794509888, + "learning_rate": 9.998364947607419e-06, + "loss": 0.871, + "step": 395 + }, + { + "epoch": 0.021795365732841653, + "grad_norm": 1.3420332670211792, + "learning_rate": 9.998353844327477e-06, + "loss": 0.9338, + "step": 396 + }, + { + "epoch": 0.021850404535197314, + "grad_norm": 0.9635335206985474, + "learning_rate": 9.998342703481246e-06, + "loss": 0.9592, + "step": 397 + }, + { + "epoch": 0.021905443337552975, + "grad_norm": 1.3322341442108154, + "learning_rate": 9.998331525068807e-06, + "loss": 1.0974, + "step": 398 + }, + { + "epoch": 0.021960482139908637, + "grad_norm": 1.017220377922058, + "learning_rate": 9.998320309090247e-06, + "loss": 0.9827, + "step": 399 + }, + { + "epoch": 0.022015520942264295, + "grad_norm": 1.0080329179763794, + "learning_rate": 9.99830905554565e-06, + "loss": 0.877, + "step": 400 + }, + { + "epoch": 0.022070559744619956, + "grad_norm": 0.9883211255073547, + "learning_rate": 9.998297764435101e-06, + "loss": 0.9625, + "step": 401 + }, + { + "epoch": 0.022125598546975617, + "grad_norm": 1.0948412418365479, + "learning_rate": 9.998286435758684e-06, + "loss": 0.9058, + "step": 402 + }, + { + "epoch": 0.02218063734933128, + "grad_norm": 0.9402000308036804, + "learning_rate": 9.998275069516482e-06, + "loss": 0.8882, + "step": 403 + }, + { + "epoch": 0.02223567615168694, + "grad_norm": 0.9858806133270264, + "learning_rate": 9.998263665708583e-06, + "loss": 0.9086, + "step": 404 + }, + { + "epoch": 0.0222907149540426, + "grad_norm": 1.0556131601333618, + "learning_rate": 9.998252224335073e-06, + "loss": 0.9583, + "step": 405 + }, + { + "epoch": 0.02234575375639826, + "grad_norm": 1.092766284942627, + "learning_rate": 9.998240745396037e-06, + "loss": 0.9124, + "step": 406 + }, + { + "epoch": 0.02240079255875392, + "grad_norm": 1.1902250051498413, + "learning_rate": 9.998229228891563e-06, + "loss": 1.0566, + "step": 407 + }, + { + "epoch": 0.022455831361109582, + "grad_norm": 1.067906141281128, + "learning_rate": 9.998217674821734e-06, + "loss": 0.9823, + "step": 408 + }, + { + "epoch": 0.022510870163465244, + "grad_norm": 1.0051710605621338, + "learning_rate": 9.998206083186638e-06, + "loss": 0.9141, + "step": 409 + }, + { + "epoch": 0.022565908965820905, + "grad_norm": 1.046412467956543, + "learning_rate": 9.998194453986367e-06, + "loss": 0.9439, + "step": 410 + }, + { + "epoch": 0.022620947768176563, + "grad_norm": 1.1103553771972656, + "learning_rate": 9.998182787221e-06, + "loss": 0.9494, + "step": 411 + }, + { + "epoch": 0.022675986570532224, + "grad_norm": 1.0508466958999634, + "learning_rate": 9.998171082890632e-06, + "loss": 0.9202, + "step": 412 + }, + { + "epoch": 0.022731025372887886, + "grad_norm": 1.1364226341247559, + "learning_rate": 9.998159340995347e-06, + "loss": 0.9859, + "step": 413 + }, + { + "epoch": 0.022786064175243547, + "grad_norm": 1.2073607444763184, + "learning_rate": 9.998147561535234e-06, + "loss": 0.8883, + "step": 414 + }, + { + "epoch": 0.02284110297759921, + "grad_norm": 1.0657012462615967, + "learning_rate": 9.998135744510384e-06, + "loss": 0.8321, + "step": 415 + }, + { + "epoch": 0.02289614177995487, + "grad_norm": 1.0101548433303833, + "learning_rate": 9.998123889920881e-06, + "loss": 0.9374, + "step": 416 + }, + { + "epoch": 0.022951180582310528, + "grad_norm": 1.057455062866211, + "learning_rate": 9.998111997766817e-06, + "loss": 0.8831, + "step": 417 + }, + { + "epoch": 0.02300621938466619, + "grad_norm": 1.206092357635498, + "learning_rate": 9.998100068048282e-06, + "loss": 0.8812, + "step": 418 + }, + { + "epoch": 0.02306125818702185, + "grad_norm": 1.0709773302078247, + "learning_rate": 9.998088100765366e-06, + "loss": 0.9486, + "step": 419 + }, + { + "epoch": 0.023116296989377512, + "grad_norm": 1.066469669342041, + "learning_rate": 9.998076095918156e-06, + "loss": 1.0229, + "step": 420 + }, + { + "epoch": 0.023171335791733173, + "grad_norm": 1.0443583726882935, + "learning_rate": 9.998064053506744e-06, + "loss": 0.8615, + "step": 421 + }, + { + "epoch": 0.02322637459408883, + "grad_norm": 1.103096842765808, + "learning_rate": 9.99805197353122e-06, + "loss": 0.9909, + "step": 422 + }, + { + "epoch": 0.023281413396444493, + "grad_norm": 0.9804643392562866, + "learning_rate": 9.998039855991677e-06, + "loss": 0.9214, + "step": 423 + }, + { + "epoch": 0.023336452198800154, + "grad_norm": 0.9880676865577698, + "learning_rate": 9.998027700888202e-06, + "loss": 0.9345, + "step": 424 + }, + { + "epoch": 0.023391491001155815, + "grad_norm": 0.9633826017379761, + "learning_rate": 9.99801550822089e-06, + "loss": 0.9897, + "step": 425 + }, + { + "epoch": 0.023446529803511477, + "grad_norm": 1.0159331560134888, + "learning_rate": 9.998003277989831e-06, + "loss": 0.9385, + "step": 426 + }, + { + "epoch": 0.023501568605867135, + "grad_norm": 1.009667158126831, + "learning_rate": 9.99799101019512e-06, + "loss": 0.9013, + "step": 427 + }, + { + "epoch": 0.023556607408222796, + "grad_norm": 0.9478578567504883, + "learning_rate": 9.997978704836842e-06, + "loss": 0.8775, + "step": 428 + }, + { + "epoch": 0.023611646210578457, + "grad_norm": 1.013181447982788, + "learning_rate": 9.997966361915096e-06, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.02366668501293412, + "grad_norm": 1.0337481498718262, + "learning_rate": 9.997953981429974e-06, + "loss": 1.0047, + "step": 430 + }, + { + "epoch": 0.02372172381528978, + "grad_norm": 0.9423721432685852, + "learning_rate": 9.997941563381566e-06, + "loss": 0.8639, + "step": 431 + }, + { + "epoch": 0.02377676261764544, + "grad_norm": 1.100492000579834, + "learning_rate": 9.997929107769968e-06, + "loss": 1.0022, + "step": 432 + }, + { + "epoch": 0.0238318014200011, + "grad_norm": 1.1232364177703857, + "learning_rate": 9.997916614595272e-06, + "loss": 0.9145, + "step": 433 + }, + { + "epoch": 0.02388684022235676, + "grad_norm": 0.9466833472251892, + "learning_rate": 9.997904083857572e-06, + "loss": 0.9397, + "step": 434 + }, + { + "epoch": 0.023941879024712422, + "grad_norm": 0.9514566659927368, + "learning_rate": 9.997891515556963e-06, + "loss": 0.8025, + "step": 435 + }, + { + "epoch": 0.023996917827068084, + "grad_norm": 0.9292222261428833, + "learning_rate": 9.997878909693539e-06, + "loss": 0.7739, + "step": 436 + }, + { + "epoch": 0.024051956629423745, + "grad_norm": 1.1049963235855103, + "learning_rate": 9.997866266267397e-06, + "loss": 0.9439, + "step": 437 + }, + { + "epoch": 0.024106995431779403, + "grad_norm": 1.0938019752502441, + "learning_rate": 9.997853585278627e-06, + "loss": 0.9479, + "step": 438 + }, + { + "epoch": 0.024162034234135064, + "grad_norm": 1.0423611402511597, + "learning_rate": 9.997840866727331e-06, + "loss": 0.9309, + "step": 439 + }, + { + "epoch": 0.024217073036490726, + "grad_norm": 1.0584756135940552, + "learning_rate": 9.997828110613598e-06, + "loss": 1.0218, + "step": 440 + }, + { + "epoch": 0.024272111838846387, + "grad_norm": 0.9986408948898315, + "learning_rate": 9.997815316937527e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.02432715064120205, + "grad_norm": 0.9680983424186707, + "learning_rate": 9.997802485699215e-06, + "loss": 0.9286, + "step": 442 + }, + { + "epoch": 0.024382189443557706, + "grad_norm": 1.2231700420379639, + "learning_rate": 9.997789616898757e-06, + "loss": 0.8083, + "step": 443 + }, + { + "epoch": 0.024437228245913368, + "grad_norm": 1.0064021348953247, + "learning_rate": 9.99777671053625e-06, + "loss": 0.9161, + "step": 444 + }, + { + "epoch": 0.02449226704826903, + "grad_norm": 0.9658541679382324, + "learning_rate": 9.99776376661179e-06, + "loss": 0.8027, + "step": 445 + }, + { + "epoch": 0.02454730585062469, + "grad_norm": 0.9440343379974365, + "learning_rate": 9.997750785125477e-06, + "loss": 0.9124, + "step": 446 + }, + { + "epoch": 0.024602344652980352, + "grad_norm": 0.998792827129364, + "learning_rate": 9.997737766077404e-06, + "loss": 0.8699, + "step": 447 + }, + { + "epoch": 0.024657383455336013, + "grad_norm": 1.430880069732666, + "learning_rate": 9.997724709467676e-06, + "loss": 0.9158, + "step": 448 + }, + { + "epoch": 0.02471242225769167, + "grad_norm": 0.9737820029258728, + "learning_rate": 9.997711615296384e-06, + "loss": 0.9496, + "step": 449 + }, + { + "epoch": 0.024767461060047333, + "grad_norm": 0.9710075855255127, + "learning_rate": 9.997698483563629e-06, + "loss": 0.8714, + "step": 450 + }, + { + "epoch": 0.024822499862402994, + "grad_norm": 1.5286253690719604, + "learning_rate": 9.997685314269511e-06, + "loss": 0.8421, + "step": 451 + }, + { + "epoch": 0.024877538664758655, + "grad_norm": 1.0269445180892944, + "learning_rate": 9.99767210741413e-06, + "loss": 1.0131, + "step": 452 + }, + { + "epoch": 0.024932577467114317, + "grad_norm": 0.9780508279800415, + "learning_rate": 9.99765886299758e-06, + "loss": 0.9897, + "step": 453 + }, + { + "epoch": 0.024987616269469975, + "grad_norm": 0.998332679271698, + "learning_rate": 9.997645581019965e-06, + "loss": 0.9647, + "step": 454 + }, + { + "epoch": 0.025042655071825636, + "grad_norm": 1.7062602043151855, + "learning_rate": 9.997632261481383e-06, + "loss": 1.0729, + "step": 455 + }, + { + "epoch": 0.025097693874181298, + "grad_norm": 0.9793694615364075, + "learning_rate": 9.997618904381936e-06, + "loss": 0.9556, + "step": 456 + }, + { + "epoch": 0.02515273267653696, + "grad_norm": 1.0183895826339722, + "learning_rate": 9.997605509721721e-06, + "loss": 0.9194, + "step": 457 + }, + { + "epoch": 0.02520777147889262, + "grad_norm": 1.0288400650024414, + "learning_rate": 9.997592077500844e-06, + "loss": 0.955, + "step": 458 + }, + { + "epoch": 0.025262810281248282, + "grad_norm": 0.9551253914833069, + "learning_rate": 9.997578607719401e-06, + "loss": 0.8498, + "step": 459 + }, + { + "epoch": 0.02531784908360394, + "grad_norm": 0.9648008942604065, + "learning_rate": 9.997565100377494e-06, + "loss": 0.9306, + "step": 460 + }, + { + "epoch": 0.0253728878859596, + "grad_norm": 0.9206677675247192, + "learning_rate": 9.997551555475225e-06, + "loss": 0.7874, + "step": 461 + }, + { + "epoch": 0.025427926688315262, + "grad_norm": 1.0479545593261719, + "learning_rate": 9.997537973012698e-06, + "loss": 0.9201, + "step": 462 + }, + { + "epoch": 0.025482965490670924, + "grad_norm": 1.0329946279525757, + "learning_rate": 9.997524352990013e-06, + "loss": 0.9577, + "step": 463 + }, + { + "epoch": 0.025538004293026585, + "grad_norm": 1.1177828311920166, + "learning_rate": 9.997510695407273e-06, + "loss": 1.0041, + "step": 464 + }, + { + "epoch": 0.025593043095382243, + "grad_norm": 1.0351577997207642, + "learning_rate": 9.99749700026458e-06, + "loss": 0.9952, + "step": 465 + }, + { + "epoch": 0.025648081897737905, + "grad_norm": 0.905274510383606, + "learning_rate": 9.997483267562035e-06, + "loss": 0.8185, + "step": 466 + }, + { + "epoch": 0.025703120700093566, + "grad_norm": 1.0749776363372803, + "learning_rate": 9.997469497299747e-06, + "loss": 1.0611, + "step": 467 + }, + { + "epoch": 0.025758159502449227, + "grad_norm": 0.8972223401069641, + "learning_rate": 9.997455689477815e-06, + "loss": 0.8994, + "step": 468 + }, + { + "epoch": 0.02581319830480489, + "grad_norm": 1.0669914484024048, + "learning_rate": 9.997441844096342e-06, + "loss": 1.06, + "step": 469 + }, + { + "epoch": 0.025868237107160547, + "grad_norm": 1.0431914329528809, + "learning_rate": 9.997427961155435e-06, + "loss": 0.8657, + "step": 470 + }, + { + "epoch": 0.025923275909516208, + "grad_norm": 0.9609962701797485, + "learning_rate": 9.997414040655198e-06, + "loss": 0.8864, + "step": 471 + }, + { + "epoch": 0.02597831471187187, + "grad_norm": 1.0829721689224243, + "learning_rate": 9.997400082595735e-06, + "loss": 0.9221, + "step": 472 + }, + { + "epoch": 0.02603335351422753, + "grad_norm": 0.992082953453064, + "learning_rate": 9.99738608697715e-06, + "loss": 0.8455, + "step": 473 + }, + { + "epoch": 0.026088392316583192, + "grad_norm": 1.0486301183700562, + "learning_rate": 9.997372053799547e-06, + "loss": 0.8729, + "step": 474 + }, + { + "epoch": 0.026143431118938854, + "grad_norm": 1.0328491926193237, + "learning_rate": 9.997357983063036e-06, + "loss": 0.8788, + "step": 475 + }, + { + "epoch": 0.02619846992129451, + "grad_norm": 0.963333249092102, + "learning_rate": 9.997343874767719e-06, + "loss": 0.892, + "step": 476 + }, + { + "epoch": 0.026253508723650173, + "grad_norm": 1.1606497764587402, + "learning_rate": 9.997329728913704e-06, + "loss": 0.9984, + "step": 477 + }, + { + "epoch": 0.026308547526005834, + "grad_norm": 1.241650104522705, + "learning_rate": 9.997315545501096e-06, + "loss": 0.946, + "step": 478 + }, + { + "epoch": 0.026363586328361496, + "grad_norm": 1.008004069328308, + "learning_rate": 9.99730132453e-06, + "loss": 0.849, + "step": 479 + }, + { + "epoch": 0.026418625130717157, + "grad_norm": 0.9883478879928589, + "learning_rate": 9.997287066000527e-06, + "loss": 0.9478, + "step": 480 + }, + { + "epoch": 0.026473663933072815, + "grad_norm": 1.0224446058273315, + "learning_rate": 9.997272769912783e-06, + "loss": 1.0318, + "step": 481 + }, + { + "epoch": 0.026528702735428476, + "grad_norm": 0.9412569403648376, + "learning_rate": 9.997258436266874e-06, + "loss": 0.9119, + "step": 482 + }, + { + "epoch": 0.026583741537784138, + "grad_norm": 0.9214537739753723, + "learning_rate": 9.997244065062906e-06, + "loss": 0.8785, + "step": 483 + }, + { + "epoch": 0.0266387803401398, + "grad_norm": 1.0015628337860107, + "learning_rate": 9.997229656300991e-06, + "loss": 0.8869, + "step": 484 + }, + { + "epoch": 0.02669381914249546, + "grad_norm": 0.8965190052986145, + "learning_rate": 9.997215209981237e-06, + "loss": 0.7009, + "step": 485 + }, + { + "epoch": 0.02674885794485112, + "grad_norm": 1.1976135969161987, + "learning_rate": 9.997200726103749e-06, + "loss": 0.9795, + "step": 486 + }, + { + "epoch": 0.02680389674720678, + "grad_norm": 0.864780843257904, + "learning_rate": 9.997186204668639e-06, + "loss": 0.7687, + "step": 487 + }, + { + "epoch": 0.02685893554956244, + "grad_norm": 0.9946566820144653, + "learning_rate": 9.997171645676013e-06, + "loss": 0.9672, + "step": 488 + }, + { + "epoch": 0.026913974351918103, + "grad_norm": 1.043835997581482, + "learning_rate": 9.997157049125985e-06, + "loss": 0.862, + "step": 489 + }, + { + "epoch": 0.026969013154273764, + "grad_norm": 0.9697456955909729, + "learning_rate": 9.99714241501866e-06, + "loss": 0.8368, + "step": 490 + }, + { + "epoch": 0.027024051956629425, + "grad_norm": 0.9975618124008179, + "learning_rate": 9.997127743354153e-06, + "loss": 0.8739, + "step": 491 + }, + { + "epoch": 0.027079090758985083, + "grad_norm": 1.0055313110351562, + "learning_rate": 9.99711303413257e-06, + "loss": 0.9227, + "step": 492 + }, + { + "epoch": 0.027134129561340745, + "grad_norm": 1.0418384075164795, + "learning_rate": 9.997098287354024e-06, + "loss": 0.9978, + "step": 493 + }, + { + "epoch": 0.027189168363696406, + "grad_norm": 0.8648970723152161, + "learning_rate": 9.997083503018625e-06, + "loss": 0.8363, + "step": 494 + }, + { + "epoch": 0.027244207166052067, + "grad_norm": 1.13506019115448, + "learning_rate": 9.997068681126483e-06, + "loss": 0.8851, + "step": 495 + }, + { + "epoch": 0.02729924596840773, + "grad_norm": 0.974400520324707, + "learning_rate": 9.997053821677712e-06, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.027354284770763387, + "grad_norm": 1.226507544517517, + "learning_rate": 9.997038924672419e-06, + "loss": 0.8586, + "step": 497 + }, + { + "epoch": 0.027409323573119048, + "grad_norm": 1.004753589630127, + "learning_rate": 9.997023990110721e-06, + "loss": 0.8974, + "step": 498 + }, + { + "epoch": 0.02746436237547471, + "grad_norm": 1.0492571592330933, + "learning_rate": 9.997009017992729e-06, + "loss": 0.8457, + "step": 499 + }, + { + "epoch": 0.02751940117783037, + "grad_norm": 1.0068167448043823, + "learning_rate": 9.996994008318554e-06, + "loss": 0.9608, + "step": 500 + }, + { + "epoch": 0.027574439980186032, + "grad_norm": 0.9686044454574585, + "learning_rate": 9.996978961088311e-06, + "loss": 0.9041, + "step": 501 + }, + { + "epoch": 0.027629478782541694, + "grad_norm": 1.281728744506836, + "learning_rate": 9.99696387630211e-06, + "loss": 0.9739, + "step": 502 + }, + { + "epoch": 0.02768451758489735, + "grad_norm": 0.9069758653640747, + "learning_rate": 9.996948753960065e-06, + "loss": 0.8467, + "step": 503 + }, + { + "epoch": 0.027739556387253013, + "grad_norm": 1.0337222814559937, + "learning_rate": 9.996933594062293e-06, + "loss": 0.9638, + "step": 504 + }, + { + "epoch": 0.027794595189608674, + "grad_norm": 0.9695359468460083, + "learning_rate": 9.996918396608905e-06, + "loss": 0.8986, + "step": 505 + }, + { + "epoch": 0.027849633991964336, + "grad_norm": 0.9120615124702454, + "learning_rate": 9.996903161600016e-06, + "loss": 0.9103, + "step": 506 + }, + { + "epoch": 0.027904672794319997, + "grad_norm": 0.9736546874046326, + "learning_rate": 9.996887889035741e-06, + "loss": 0.9308, + "step": 507 + }, + { + "epoch": 0.027959711596675655, + "grad_norm": 1.0184897184371948, + "learning_rate": 9.996872578916192e-06, + "loss": 0.8978, + "step": 508 + }, + { + "epoch": 0.028014750399031316, + "grad_norm": 0.9791838526725769, + "learning_rate": 9.996857231241489e-06, + "loss": 0.8639, + "step": 509 + }, + { + "epoch": 0.028069789201386978, + "grad_norm": 1.2985681295394897, + "learning_rate": 9.996841846011742e-06, + "loss": 0.9581, + "step": 510 + }, + { + "epoch": 0.02812482800374264, + "grad_norm": 1.0647368431091309, + "learning_rate": 9.996826423227071e-06, + "loss": 1.0565, + "step": 511 + }, + { + "epoch": 0.0281798668060983, + "grad_norm": 1.0336421728134155, + "learning_rate": 9.996810962887591e-06, + "loss": 1.008, + "step": 512 + }, + { + "epoch": 0.02823490560845396, + "grad_norm": 1.1838933229446411, + "learning_rate": 9.996795464993416e-06, + "loss": 0.8359, + "step": 513 + }, + { + "epoch": 0.02828994441080962, + "grad_norm": 0.9898360371589661, + "learning_rate": 9.996779929544663e-06, + "loss": 0.8501, + "step": 514 + }, + { + "epoch": 0.02834498321316528, + "grad_norm": 0.9836066365242004, + "learning_rate": 9.99676435654145e-06, + "loss": 0.8795, + "step": 515 + }, + { + "epoch": 0.028400022015520943, + "grad_norm": 1.0621601343154907, + "learning_rate": 9.996748745983895e-06, + "loss": 0.8746, + "step": 516 + }, + { + "epoch": 0.028455060817876604, + "grad_norm": 1.0082437992095947, + "learning_rate": 9.996733097872113e-06, + "loss": 0.9278, + "step": 517 + }, + { + "epoch": 0.028510099620232265, + "grad_norm": 0.9903931617736816, + "learning_rate": 9.996717412206222e-06, + "loss": 0.8264, + "step": 518 + }, + { + "epoch": 0.028565138422587923, + "grad_norm": 1.0797243118286133, + "learning_rate": 9.996701688986342e-06, + "loss": 1.0077, + "step": 519 + }, + { + "epoch": 0.028620177224943585, + "grad_norm": 1.147133231163025, + "learning_rate": 9.99668592821259e-06, + "loss": 0.9374, + "step": 520 + }, + { + "epoch": 0.028675216027299246, + "grad_norm": 0.9993947744369507, + "learning_rate": 9.996670129885082e-06, + "loss": 0.9562, + "step": 521 + }, + { + "epoch": 0.028730254829654907, + "grad_norm": 0.8580895066261292, + "learning_rate": 9.99665429400394e-06, + "loss": 0.7985, + "step": 522 + }, + { + "epoch": 0.02878529363201057, + "grad_norm": 0.9251388907432556, + "learning_rate": 9.996638420569281e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.028840332434366227, + "grad_norm": 1.0010193586349487, + "learning_rate": 9.996622509581227e-06, + "loss": 0.9316, + "step": 524 + }, + { + "epoch": 0.028895371236721888, + "grad_norm": 0.9822579026222229, + "learning_rate": 9.996606561039894e-06, + "loss": 0.8978, + "step": 525 + }, + { + "epoch": 0.02895041003907755, + "grad_norm": 1.0760595798492432, + "learning_rate": 9.996590574945403e-06, + "loss": 0.9125, + "step": 526 + }, + { + "epoch": 0.02900544884143321, + "grad_norm": 1.138869285583496, + "learning_rate": 9.996574551297876e-06, + "loss": 0.8185, + "step": 527 + }, + { + "epoch": 0.029060487643788872, + "grad_norm": 1.002994179725647, + "learning_rate": 9.996558490097433e-06, + "loss": 0.9404, + "step": 528 + }, + { + "epoch": 0.02911552644614453, + "grad_norm": 0.9550611972808838, + "learning_rate": 9.996542391344194e-06, + "loss": 0.859, + "step": 529 + }, + { + "epoch": 0.02917056524850019, + "grad_norm": 0.9236055612564087, + "learning_rate": 9.996526255038277e-06, + "loss": 0.7758, + "step": 530 + }, + { + "epoch": 0.029225604050855853, + "grad_norm": 1.103966474533081, + "learning_rate": 9.996510081179808e-06, + "loss": 1.0147, + "step": 531 + }, + { + "epoch": 0.029280642853211514, + "grad_norm": 0.9884665012359619, + "learning_rate": 9.996493869768906e-06, + "loss": 0.8784, + "step": 532 + }, + { + "epoch": 0.029335681655567176, + "grad_norm": 0.9173223376274109, + "learning_rate": 9.996477620805694e-06, + "loss": 0.8741, + "step": 533 + }, + { + "epoch": 0.029390720457922837, + "grad_norm": 0.965548574924469, + "learning_rate": 9.996461334290294e-06, + "loss": 0.8989, + "step": 534 + }, + { + "epoch": 0.029445759260278495, + "grad_norm": 0.9939296245574951, + "learning_rate": 9.996445010222828e-06, + "loss": 0.8552, + "step": 535 + }, + { + "epoch": 0.029500798062634156, + "grad_norm": 1.0081578493118286, + "learning_rate": 9.996428648603417e-06, + "loss": 0.9138, + "step": 536 + }, + { + "epoch": 0.029555836864989818, + "grad_norm": 1.0139487981796265, + "learning_rate": 9.996412249432188e-06, + "loss": 0.9452, + "step": 537 + }, + { + "epoch": 0.02961087566734548, + "grad_norm": 0.9463647603988647, + "learning_rate": 9.996395812709262e-06, + "loss": 0.8721, + "step": 538 + }, + { + "epoch": 0.02966591446970114, + "grad_norm": 0.9981473684310913, + "learning_rate": 9.99637933843476e-06, + "loss": 0.7791, + "step": 539 + }, + { + "epoch": 0.0297209532720568, + "grad_norm": 1.1637190580368042, + "learning_rate": 9.996362826608812e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 0.02977599207441246, + "grad_norm": 2.2887051105499268, + "learning_rate": 9.996346277231536e-06, + "loss": 0.9303, + "step": 541 + }, + { + "epoch": 0.02983103087676812, + "grad_norm": 0.9173391461372375, + "learning_rate": 9.99632969030306e-06, + "loss": 0.8627, + "step": 542 + }, + { + "epoch": 0.029886069679123783, + "grad_norm": 1.033355474472046, + "learning_rate": 9.996313065823506e-06, + "loss": 0.9906, + "step": 543 + }, + { + "epoch": 0.029941108481479444, + "grad_norm": 0.9286639094352722, + "learning_rate": 9.996296403793002e-06, + "loss": 0.7043, + "step": 544 + }, + { + "epoch": 0.029996147283835102, + "grad_norm": 0.963238000869751, + "learning_rate": 9.996279704211671e-06, + "loss": 1.0236, + "step": 545 + }, + { + "epoch": 0.030051186086190763, + "grad_norm": 1.0275089740753174, + "learning_rate": 9.99626296707964e-06, + "loss": 0.976, + "step": 546 + }, + { + "epoch": 0.030106224888546425, + "grad_norm": 1.0944674015045166, + "learning_rate": 9.996246192397032e-06, + "loss": 0.9209, + "step": 547 + }, + { + "epoch": 0.030161263690902086, + "grad_norm": 0.9620945453643799, + "learning_rate": 9.996229380163976e-06, + "loss": 0.8973, + "step": 548 + }, + { + "epoch": 0.030216302493257748, + "grad_norm": 1.032549500465393, + "learning_rate": 9.996212530380597e-06, + "loss": 0.892, + "step": 549 + }, + { + "epoch": 0.03027134129561341, + "grad_norm": 1.0433719158172607, + "learning_rate": 9.996195643047023e-06, + "loss": 0.8428, + "step": 550 + }, + { + "epoch": 0.030326380097969067, + "grad_norm": 1.1541085243225098, + "learning_rate": 9.996178718163378e-06, + "loss": 0.9084, + "step": 551 + }, + { + "epoch": 0.03038141890032473, + "grad_norm": 0.9386873245239258, + "learning_rate": 9.996161755729793e-06, + "loss": 0.9246, + "step": 552 + }, + { + "epoch": 0.03043645770268039, + "grad_norm": 1.092236042022705, + "learning_rate": 9.996144755746393e-06, + "loss": 0.8419, + "step": 553 + }, + { + "epoch": 0.03049149650503605, + "grad_norm": 0.9517606496810913, + "learning_rate": 9.996127718213306e-06, + "loss": 0.9002, + "step": 554 + }, + { + "epoch": 0.030546535307391712, + "grad_norm": 0.965972900390625, + "learning_rate": 9.996110643130661e-06, + "loss": 0.9197, + "step": 555 + }, + { + "epoch": 0.03060157410974737, + "grad_norm": 0.9396095275878906, + "learning_rate": 9.996093530498586e-06, + "loss": 0.8686, + "step": 556 + }, + { + "epoch": 0.030656612912103032, + "grad_norm": 1.0154120922088623, + "learning_rate": 9.99607638031721e-06, + "loss": 0.9773, + "step": 557 + }, + { + "epoch": 0.030711651714458693, + "grad_norm": 1.3572301864624023, + "learning_rate": 9.99605919258666e-06, + "loss": 0.911, + "step": 558 + }, + { + "epoch": 0.030766690516814355, + "grad_norm": 0.968278169631958, + "learning_rate": 9.996041967307066e-06, + "loss": 0.7704, + "step": 559 + }, + { + "epoch": 0.030821729319170016, + "grad_norm": 0.9867869019508362, + "learning_rate": 9.99602470447856e-06, + "loss": 0.873, + "step": 560 + }, + { + "epoch": 0.030876768121525677, + "grad_norm": 1.056450605392456, + "learning_rate": 9.996007404101269e-06, + "loss": 0.941, + "step": 561 + }, + { + "epoch": 0.030931806923881335, + "grad_norm": 1.0419799089431763, + "learning_rate": 9.995990066175321e-06, + "loss": 0.957, + "step": 562 + }, + { + "epoch": 0.030986845726236997, + "grad_norm": 0.9789314866065979, + "learning_rate": 9.995972690700852e-06, + "loss": 0.9229, + "step": 563 + }, + { + "epoch": 0.031041884528592658, + "grad_norm": 0.917783796787262, + "learning_rate": 9.995955277677989e-06, + "loss": 0.8186, + "step": 564 + }, + { + "epoch": 0.03109692333094832, + "grad_norm": 1.0231432914733887, + "learning_rate": 9.995937827106863e-06, + "loss": 0.8624, + "step": 565 + }, + { + "epoch": 0.03115196213330398, + "grad_norm": 0.9552083015441895, + "learning_rate": 9.995920338987605e-06, + "loss": 0.7967, + "step": 566 + }, + { + "epoch": 0.03120700093565964, + "grad_norm": 0.9441083669662476, + "learning_rate": 9.995902813320349e-06, + "loss": 0.8471, + "step": 567 + }, + { + "epoch": 0.0312620397380153, + "grad_norm": 1.0025299787521362, + "learning_rate": 9.995885250105223e-06, + "loss": 0.8646, + "step": 568 + }, + { + "epoch": 0.03131707854037096, + "grad_norm": 0.8997280597686768, + "learning_rate": 9.99586764934236e-06, + "loss": 0.8736, + "step": 569 + }, + { + "epoch": 0.03137211734272662, + "grad_norm": 0.9090663194656372, + "learning_rate": 9.995850011031896e-06, + "loss": 0.8548, + "step": 570 + }, + { + "epoch": 0.031427156145082284, + "grad_norm": 0.9641294479370117, + "learning_rate": 9.995832335173959e-06, + "loss": 0.8667, + "step": 571 + }, + { + "epoch": 0.031482194947437946, + "grad_norm": 0.9165804982185364, + "learning_rate": 9.995814621768682e-06, + "loss": 0.803, + "step": 572 + }, + { + "epoch": 0.03153723374979361, + "grad_norm": 0.9672492742538452, + "learning_rate": 9.995796870816202e-06, + "loss": 0.8335, + "step": 573 + }, + { + "epoch": 0.03159227255214927, + "grad_norm": 0.9359404444694519, + "learning_rate": 9.995779082316648e-06, + "loss": 0.8294, + "step": 574 + }, + { + "epoch": 0.03164731135450492, + "grad_norm": 0.926925003528595, + "learning_rate": 9.995761256270157e-06, + "loss": 0.7714, + "step": 575 + }, + { + "epoch": 0.031702350156860584, + "grad_norm": 1.1848629713058472, + "learning_rate": 9.995743392676862e-06, + "loss": 0.8925, + "step": 576 + }, + { + "epoch": 0.031757388959216246, + "grad_norm": 0.9624786972999573, + "learning_rate": 9.995725491536897e-06, + "loss": 0.9292, + "step": 577 + }, + { + "epoch": 0.03181242776157191, + "grad_norm": 0.9479736089706421, + "learning_rate": 9.995707552850396e-06, + "loss": 0.8797, + "step": 578 + }, + { + "epoch": 0.03186746656392757, + "grad_norm": 0.9551546573638916, + "learning_rate": 9.995689576617494e-06, + "loss": 0.8793, + "step": 579 + }, + { + "epoch": 0.03192250536628323, + "grad_norm": 0.9210056662559509, + "learning_rate": 9.995671562838325e-06, + "loss": 0.9714, + "step": 580 + }, + { + "epoch": 0.03197754416863889, + "grad_norm": 1.063117504119873, + "learning_rate": 9.995653511513029e-06, + "loss": 0.9608, + "step": 581 + }, + { + "epoch": 0.03203258297099455, + "grad_norm": 0.9426459670066833, + "learning_rate": 9.995635422641736e-06, + "loss": 0.9102, + "step": 582 + }, + { + "epoch": 0.032087621773350214, + "grad_norm": 1.0176693201065063, + "learning_rate": 9.995617296224584e-06, + "loss": 0.9109, + "step": 583 + }, + { + "epoch": 0.032142660575705875, + "grad_norm": 0.9457042217254639, + "learning_rate": 9.995599132261711e-06, + "loss": 0.9017, + "step": 584 + }, + { + "epoch": 0.03219769937806154, + "grad_norm": 1.5851638317108154, + "learning_rate": 9.995580930753252e-06, + "loss": 0.967, + "step": 585 + }, + { + "epoch": 0.03225273818041719, + "grad_norm": 0.9961487054824829, + "learning_rate": 9.995562691699345e-06, + "loss": 0.9396, + "step": 586 + }, + { + "epoch": 0.03230777698277285, + "grad_norm": 0.9892112016677856, + "learning_rate": 9.995544415100125e-06, + "loss": 0.9058, + "step": 587 + }, + { + "epoch": 0.032362815785128514, + "grad_norm": 0.9052272439002991, + "learning_rate": 9.99552610095573e-06, + "loss": 0.9194, + "step": 588 + }, + { + "epoch": 0.032417854587484175, + "grad_norm": 0.8381399512290955, + "learning_rate": 9.995507749266297e-06, + "loss": 0.7465, + "step": 589 + }, + { + "epoch": 0.03247289338983984, + "grad_norm": 1.018964171409607, + "learning_rate": 9.995489360031969e-06, + "loss": 0.841, + "step": 590 + }, + { + "epoch": 0.0325279321921955, + "grad_norm": 0.908311128616333, + "learning_rate": 9.995470933252876e-06, + "loss": 0.8592, + "step": 591 + }, + { + "epoch": 0.03258297099455116, + "grad_norm": 1.2986040115356445, + "learning_rate": 9.995452468929162e-06, + "loss": 0.8341, + "step": 592 + }, + { + "epoch": 0.03263800979690682, + "grad_norm": 1.6565190553665161, + "learning_rate": 9.995433967060966e-06, + "loss": 0.8681, + "step": 593 + }, + { + "epoch": 0.03269304859926248, + "grad_norm": 0.9725674390792847, + "learning_rate": 9.995415427648423e-06, + "loss": 0.8449, + "step": 594 + }, + { + "epoch": 0.032748087401618144, + "grad_norm": 0.8683852553367615, + "learning_rate": 9.995396850691677e-06, + "loss": 0.8478, + "step": 595 + }, + { + "epoch": 0.0328031262039738, + "grad_norm": 0.9912856817245483, + "learning_rate": 9.995378236190862e-06, + "loss": 0.8912, + "step": 596 + }, + { + "epoch": 0.03285816500632946, + "grad_norm": 0.9396800398826599, + "learning_rate": 9.995359584146125e-06, + "loss": 0.856, + "step": 597 + }, + { + "epoch": 0.03291320380868512, + "grad_norm": 1.385006308555603, + "learning_rate": 9.995340894557601e-06, + "loss": 0.9633, + "step": 598 + }, + { + "epoch": 0.03296824261104078, + "grad_norm": 0.8982875943183899, + "learning_rate": 9.995322167425433e-06, + "loss": 0.9244, + "step": 599 + }, + { + "epoch": 0.033023281413396444, + "grad_norm": 0.8981022834777832, + "learning_rate": 9.995303402749759e-06, + "loss": 0.8854, + "step": 600 + }, + { + "epoch": 0.033078320215752105, + "grad_norm": 0.9917197227478027, + "learning_rate": 9.995284600530724e-06, + "loss": 1.0086, + "step": 601 + }, + { + "epoch": 0.033133359018107766, + "grad_norm": 1.0540626049041748, + "learning_rate": 9.995265760768464e-06, + "loss": 1.0022, + "step": 602 + }, + { + "epoch": 0.03318839782046343, + "grad_norm": 0.9523479342460632, + "learning_rate": 9.995246883463126e-06, + "loss": 0.9893, + "step": 603 + }, + { + "epoch": 0.03324343662281909, + "grad_norm": 0.9824770092964172, + "learning_rate": 9.99522796861485e-06, + "loss": 0.8385, + "step": 604 + }, + { + "epoch": 0.03329847542517475, + "grad_norm": 1.0968893766403198, + "learning_rate": 9.995209016223776e-06, + "loss": 1.0109, + "step": 605 + }, + { + "epoch": 0.03335351422753041, + "grad_norm": 0.9115625023841858, + "learning_rate": 9.995190026290049e-06, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.033408553029886066, + "grad_norm": 0.9795814156532288, + "learning_rate": 9.99517099881381e-06, + "loss": 0.8941, + "step": 607 + }, + { + "epoch": 0.03346359183224173, + "grad_norm": 0.9317291378974915, + "learning_rate": 9.995151933795204e-06, + "loss": 0.7819, + "step": 608 + }, + { + "epoch": 0.03351863063459739, + "grad_norm": 0.9936283230781555, + "learning_rate": 9.995132831234373e-06, + "loss": 0.8674, + "step": 609 + }, + { + "epoch": 0.03357366943695305, + "grad_norm": 0.9872812032699585, + "learning_rate": 9.995113691131462e-06, + "loss": 0.9038, + "step": 610 + }, + { + "epoch": 0.03362870823930871, + "grad_norm": 0.9516895413398743, + "learning_rate": 9.995094513486611e-06, + "loss": 0.9038, + "step": 611 + }, + { + "epoch": 0.03368374704166437, + "grad_norm": 1.090579867362976, + "learning_rate": 9.995075298299968e-06, + "loss": 0.9587, + "step": 612 + }, + { + "epoch": 0.033738785844020035, + "grad_norm": 1.021398663520813, + "learning_rate": 9.995056045571677e-06, + "loss": 0.9569, + "step": 613 + }, + { + "epoch": 0.033793824646375696, + "grad_norm": 1.009657382965088, + "learning_rate": 9.99503675530188e-06, + "loss": 0.8346, + "step": 614 + }, + { + "epoch": 0.03384886344873136, + "grad_norm": 1.0478712320327759, + "learning_rate": 9.995017427490725e-06, + "loss": 1.0566, + "step": 615 + }, + { + "epoch": 0.03390390225108702, + "grad_norm": 1.1391830444335938, + "learning_rate": 9.994998062138355e-06, + "loss": 1.0727, + "step": 616 + }, + { + "epoch": 0.03395894105344268, + "grad_norm": 1.0172302722930908, + "learning_rate": 9.994978659244918e-06, + "loss": 0.7869, + "step": 617 + }, + { + "epoch": 0.034013979855798335, + "grad_norm": 1.0532630681991577, + "learning_rate": 9.994959218810558e-06, + "loss": 0.8626, + "step": 618 + }, + { + "epoch": 0.034069018658153996, + "grad_norm": 0.8300478458404541, + "learning_rate": 9.99493974083542e-06, + "loss": 0.8166, + "step": 619 + }, + { + "epoch": 0.03412405746050966, + "grad_norm": 1.0613664388656616, + "learning_rate": 9.994920225319656e-06, + "loss": 0.8899, + "step": 620 + }, + { + "epoch": 0.03417909626286532, + "grad_norm": 0.9827042818069458, + "learning_rate": 9.994900672263406e-06, + "loss": 0.8243, + "step": 621 + }, + { + "epoch": 0.03423413506522098, + "grad_norm": 0.8790082931518555, + "learning_rate": 9.994881081666818e-06, + "loss": 0.8153, + "step": 622 + }, + { + "epoch": 0.03428917386757664, + "grad_norm": 1.033378005027771, + "learning_rate": 9.994861453530044e-06, + "loss": 0.8916, + "step": 623 + }, + { + "epoch": 0.0343442126699323, + "grad_norm": 0.9547238349914551, + "learning_rate": 9.994841787853227e-06, + "loss": 0.9141, + "step": 624 + }, + { + "epoch": 0.034399251472287964, + "grad_norm": 0.9606438279151917, + "learning_rate": 9.994822084636514e-06, + "loss": 0.9435, + "step": 625 + }, + { + "epoch": 0.034454290274643626, + "grad_norm": 0.8461503982543945, + "learning_rate": 9.994802343880059e-06, + "loss": 0.7914, + "step": 626 + }, + { + "epoch": 0.03450932907699929, + "grad_norm": 1.144538402557373, + "learning_rate": 9.994782565584004e-06, + "loss": 0.8025, + "step": 627 + }, + { + "epoch": 0.03456436787935495, + "grad_norm": 1.0099962949752808, + "learning_rate": 9.994762749748502e-06, + "loss": 0.9607, + "step": 628 + }, + { + "epoch": 0.0346194066817106, + "grad_norm": 0.9822041988372803, + "learning_rate": 9.9947428963737e-06, + "loss": 0.9216, + "step": 629 + }, + { + "epoch": 0.034674445484066264, + "grad_norm": 0.9056866765022278, + "learning_rate": 9.994723005459746e-06, + "loss": 0.7913, + "step": 630 + }, + { + "epoch": 0.034729484286421926, + "grad_norm": 1.0099287033081055, + "learning_rate": 9.994703077006792e-06, + "loss": 0.9937, + "step": 631 + }, + { + "epoch": 0.03478452308877759, + "grad_norm": 0.9559167623519897, + "learning_rate": 9.994683111014984e-06, + "loss": 0.9774, + "step": 632 + }, + { + "epoch": 0.03483956189113325, + "grad_norm": 1.0359059572219849, + "learning_rate": 9.994663107484478e-06, + "loss": 0.9062, + "step": 633 + }, + { + "epoch": 0.03489460069348891, + "grad_norm": 0.8803057074546814, + "learning_rate": 9.99464306641542e-06, + "loss": 0.9638, + "step": 634 + }, + { + "epoch": 0.03494963949584457, + "grad_norm": 1.0926579236984253, + "learning_rate": 9.994622987807962e-06, + "loss": 1.0467, + "step": 635 + }, + { + "epoch": 0.03500467829820023, + "grad_norm": 1.0051401853561401, + "learning_rate": 9.994602871662253e-06, + "loss": 0.8717, + "step": 636 + }, + { + "epoch": 0.035059717100555894, + "grad_norm": 1.2007508277893066, + "learning_rate": 9.994582717978448e-06, + "loss": 0.8004, + "step": 637 + }, + { + "epoch": 0.035114755902911556, + "grad_norm": 0.8826266527175903, + "learning_rate": 9.994562526756695e-06, + "loss": 0.8888, + "step": 638 + }, + { + "epoch": 0.03516979470526721, + "grad_norm": 0.9953717589378357, + "learning_rate": 9.994542297997147e-06, + "loss": 0.8999, + "step": 639 + }, + { + "epoch": 0.03522483350762287, + "grad_norm": 1.0203614234924316, + "learning_rate": 9.994522031699958e-06, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.03527987230997853, + "grad_norm": 0.8760203719139099, + "learning_rate": 9.994501727865276e-06, + "loss": 0.7893, + "step": 641 + }, + { + "epoch": 0.035334911112334194, + "grad_norm": 1.024888277053833, + "learning_rate": 9.994481386493257e-06, + "loss": 0.9865, + "step": 642 + }, + { + "epoch": 0.035389949914689856, + "grad_norm": 0.907454788684845, + "learning_rate": 9.994461007584052e-06, + "loss": 0.891, + "step": 643 + }, + { + "epoch": 0.03544498871704552, + "grad_norm": 1.0400965213775635, + "learning_rate": 9.994440591137816e-06, + "loss": 0.9345, + "step": 644 + }, + { + "epoch": 0.03550002751940118, + "grad_norm": 0.9816616177558899, + "learning_rate": 9.9944201371547e-06, + "loss": 0.91, + "step": 645 + }, + { + "epoch": 0.03555506632175684, + "grad_norm": 1.0528117418289185, + "learning_rate": 9.99439964563486e-06, + "loss": 0.952, + "step": 646 + }, + { + "epoch": 0.0356101051241125, + "grad_norm": 0.9802080988883972, + "learning_rate": 9.99437911657845e-06, + "loss": 0.9392, + "step": 647 + }, + { + "epoch": 0.03566514392646816, + "grad_norm": 0.9580393433570862, + "learning_rate": 9.994358549985623e-06, + "loss": 0.874, + "step": 648 + }, + { + "epoch": 0.035720182728823824, + "grad_norm": 0.8935576677322388, + "learning_rate": 9.994337945856533e-06, + "loss": 0.8435, + "step": 649 + }, + { + "epoch": 0.03577522153117948, + "grad_norm": 1.009699821472168, + "learning_rate": 9.994317304191337e-06, + "loss": 0.9436, + "step": 650 + }, + { + "epoch": 0.03583026033353514, + "grad_norm": 0.9126121401786804, + "learning_rate": 9.994296624990188e-06, + "loss": 0.8424, + "step": 651 + }, + { + "epoch": 0.0358852991358908, + "grad_norm": 0.9555553197860718, + "learning_rate": 9.994275908253243e-06, + "loss": 0.93, + "step": 652 + }, + { + "epoch": 0.03594033793824646, + "grad_norm": 0.8359857797622681, + "learning_rate": 9.994255153980658e-06, + "loss": 0.6326, + "step": 653 + }, + { + "epoch": 0.035995376740602124, + "grad_norm": 0.8918783664703369, + "learning_rate": 9.994234362172587e-06, + "loss": 0.8287, + "step": 654 + }, + { + "epoch": 0.036050415542957785, + "grad_norm": 0.9878549575805664, + "learning_rate": 9.994213532829188e-06, + "loss": 0.8841, + "step": 655 + }, + { + "epoch": 0.03610545434531345, + "grad_norm": 0.9504040479660034, + "learning_rate": 9.994192665950617e-06, + "loss": 1.0182, + "step": 656 + }, + { + "epoch": 0.03616049314766911, + "grad_norm": 0.9531422257423401, + "learning_rate": 9.99417176153703e-06, + "loss": 0.8504, + "step": 657 + }, + { + "epoch": 0.03621553195002477, + "grad_norm": 0.9580292105674744, + "learning_rate": 9.994150819588587e-06, + "loss": 0.8048, + "step": 658 + }, + { + "epoch": 0.03627057075238043, + "grad_norm": 0.9786819815635681, + "learning_rate": 9.99412984010544e-06, + "loss": 0.9124, + "step": 659 + }, + { + "epoch": 0.03632560955473609, + "grad_norm": 0.9733422994613647, + "learning_rate": 9.994108823087751e-06, + "loss": 0.8868, + "step": 660 + }, + { + "epoch": 0.03638064835709175, + "grad_norm": 1.093173623085022, + "learning_rate": 9.994087768535679e-06, + "loss": 0.9428, + "step": 661 + }, + { + "epoch": 0.03643568715944741, + "grad_norm": 0.9067148566246033, + "learning_rate": 9.994066676449378e-06, + "loss": 0.8838, + "step": 662 + }, + { + "epoch": 0.03649072596180307, + "grad_norm": 0.9509521722793579, + "learning_rate": 9.99404554682901e-06, + "loss": 0.9034, + "step": 663 + }, + { + "epoch": 0.03654576476415873, + "grad_norm": 0.9523824453353882, + "learning_rate": 9.994024379674731e-06, + "loss": 0.9623, + "step": 664 + }, + { + "epoch": 0.03660080356651439, + "grad_norm": 0.987276554107666, + "learning_rate": 9.994003174986703e-06, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.036655842368870054, + "grad_norm": 0.9500744342803955, + "learning_rate": 9.993981932765083e-06, + "loss": 0.9742, + "step": 666 + }, + { + "epoch": 0.036710881171225715, + "grad_norm": 0.9420705437660217, + "learning_rate": 9.993960653010034e-06, + "loss": 0.9657, + "step": 667 + }, + { + "epoch": 0.036765919973581376, + "grad_norm": 0.9443248510360718, + "learning_rate": 9.99393933572171e-06, + "loss": 0.8468, + "step": 668 + }, + { + "epoch": 0.03682095877593704, + "grad_norm": 0.9666558504104614, + "learning_rate": 9.993917980900276e-06, + "loss": 0.9871, + "step": 669 + }, + { + "epoch": 0.0368759975782927, + "grad_norm": 1.0236201286315918, + "learning_rate": 9.993896588545892e-06, + "loss": 0.9814, + "step": 670 + }, + { + "epoch": 0.03693103638064836, + "grad_norm": 1.016190528869629, + "learning_rate": 9.993875158658716e-06, + "loss": 1.0156, + "step": 671 + }, + { + "epoch": 0.036986075183004015, + "grad_norm": 0.9296661019325256, + "learning_rate": 9.993853691238913e-06, + "loss": 0.7956, + "step": 672 + }, + { + "epoch": 0.037041113985359676, + "grad_norm": 0.9276684522628784, + "learning_rate": 9.993832186286643e-06, + "loss": 0.9253, + "step": 673 + }, + { + "epoch": 0.03709615278771534, + "grad_norm": 0.8588787913322449, + "learning_rate": 9.993810643802065e-06, + "loss": 0.7878, + "step": 674 + }, + { + "epoch": 0.037151191590071, + "grad_norm": 0.9955212473869324, + "learning_rate": 9.993789063785344e-06, + "loss": 0.8711, + "step": 675 + }, + { + "epoch": 0.03720623039242666, + "grad_norm": 0.925578236579895, + "learning_rate": 9.993767446236642e-06, + "loss": 0.9431, + "step": 676 + }, + { + "epoch": 0.03726126919478232, + "grad_norm": 0.9610552787780762, + "learning_rate": 9.99374579115612e-06, + "loss": 0.887, + "step": 677 + }, + { + "epoch": 0.03731630799713798, + "grad_norm": 1.0052428245544434, + "learning_rate": 9.99372409854394e-06, + "loss": 0.8751, + "step": 678 + }, + { + "epoch": 0.037371346799493645, + "grad_norm": 0.9503066539764404, + "learning_rate": 9.99370236840027e-06, + "loss": 0.8556, + "step": 679 + }, + { + "epoch": 0.037426385601849306, + "grad_norm": 2.426232099533081, + "learning_rate": 9.993680600725266e-06, + "loss": 0.9077, + "step": 680 + }, + { + "epoch": 0.03748142440420497, + "grad_norm": 0.9119723439216614, + "learning_rate": 9.993658795519096e-06, + "loss": 0.8575, + "step": 681 + }, + { + "epoch": 0.03753646320656062, + "grad_norm": 0.9688286781311035, + "learning_rate": 9.993636952781923e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 0.03759150200891628, + "grad_norm": 1.030013084411621, + "learning_rate": 9.993615072513913e-06, + "loss": 0.8622, + "step": 683 + }, + { + "epoch": 0.037646540811271945, + "grad_norm": 1.055187463760376, + "learning_rate": 9.993593154715228e-06, + "loss": 0.9251, + "step": 684 + }, + { + "epoch": 0.037701579613627606, + "grad_norm": 1.0518591403961182, + "learning_rate": 9.993571199386032e-06, + "loss": 0.9575, + "step": 685 + }, + { + "epoch": 0.03775661841598327, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.993549206526495e-06, + "loss": 0.8522, + "step": 686 + }, + { + "epoch": 0.03781165721833893, + "grad_norm": 1.0212332010269165, + "learning_rate": 9.993527176136775e-06, + "loss": 0.9358, + "step": 687 + }, + { + "epoch": 0.03786669602069459, + "grad_norm": 0.9137141108512878, + "learning_rate": 9.993505108217045e-06, + "loss": 0.8561, + "step": 688 + }, + { + "epoch": 0.03792173482305025, + "grad_norm": 1.0069375038146973, + "learning_rate": 9.993483002767465e-06, + "loss": 0.8274, + "step": 689 + }, + { + "epoch": 0.03797677362540591, + "grad_norm": 0.9820672869682312, + "learning_rate": 9.993460859788204e-06, + "loss": 0.907, + "step": 690 + }, + { + "epoch": 0.038031812427761574, + "grad_norm": 1.0042002201080322, + "learning_rate": 9.993438679279428e-06, + "loss": 0.9263, + "step": 691 + }, + { + "epoch": 0.038086851230117236, + "grad_norm": 0.9733695983886719, + "learning_rate": 9.993416461241304e-06, + "loss": 0.8455, + "step": 692 + }, + { + "epoch": 0.03814189003247289, + "grad_norm": 0.9106015563011169, + "learning_rate": 9.993394205673996e-06, + "loss": 0.8469, + "step": 693 + }, + { + "epoch": 0.03819692883482855, + "grad_norm": 0.9802660346031189, + "learning_rate": 9.993371912577677e-06, + "loss": 0.8662, + "step": 694 + }, + { + "epoch": 0.03825196763718421, + "grad_norm": 0.9183964729309082, + "learning_rate": 9.99334958195251e-06, + "loss": 0.8968, + "step": 695 + }, + { + "epoch": 0.038307006439539874, + "grad_norm": 0.9572185277938843, + "learning_rate": 9.993327213798663e-06, + "loss": 0.953, + "step": 696 + }, + { + "epoch": 0.038362045241895536, + "grad_norm": 1.4480071067810059, + "learning_rate": 9.993304808116307e-06, + "loss": 1.1131, + "step": 697 + }, + { + "epoch": 0.0384170840442512, + "grad_norm": 0.9297361969947815, + "learning_rate": 9.993282364905607e-06, + "loss": 0.884, + "step": 698 + }, + { + "epoch": 0.03847212284660686, + "grad_norm": 0.9400073885917664, + "learning_rate": 9.993259884166735e-06, + "loss": 0.932, + "step": 699 + }, + { + "epoch": 0.03852716164896252, + "grad_norm": 0.9231798052787781, + "learning_rate": 9.993237365899858e-06, + "loss": 0.8981, + "step": 700 + }, + { + "epoch": 0.03858220045131818, + "grad_norm": 0.8233712911605835, + "learning_rate": 9.993214810105144e-06, + "loss": 0.8218, + "step": 701 + }, + { + "epoch": 0.03863723925367384, + "grad_norm": 1.0997854471206665, + "learning_rate": 9.993192216782768e-06, + "loss": 0.9298, + "step": 702 + }, + { + "epoch": 0.038692278056029504, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.993169585932893e-06, + "loss": 0.7815, + "step": 703 + }, + { + "epoch": 0.03874731685838516, + "grad_norm": 0.9913730025291443, + "learning_rate": 9.993146917555692e-06, + "loss": 0.9621, + "step": 704 + }, + { + "epoch": 0.03880235566074082, + "grad_norm": 1.088767409324646, + "learning_rate": 9.993124211651334e-06, + "loss": 0.9295, + "step": 705 + }, + { + "epoch": 0.03885739446309648, + "grad_norm": 0.8199124336242676, + "learning_rate": 9.993101468219995e-06, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.03891243326545214, + "grad_norm": 1.112566351890564, + "learning_rate": 9.99307868726184e-06, + "loss": 0.791, + "step": 707 + }, + { + "epoch": 0.038967472067807804, + "grad_norm": 0.9372578859329224, + "learning_rate": 9.99305586877704e-06, + "loss": 0.8567, + "step": 708 + }, + { + "epoch": 0.039022510870163465, + "grad_norm": 1.0167721509933472, + "learning_rate": 9.99303301276577e-06, + "loss": 0.9787, + "step": 709 + }, + { + "epoch": 0.03907754967251913, + "grad_norm": 1.3526856899261475, + "learning_rate": 9.993010119228202e-06, + "loss": 1.2215, + "step": 710 + }, + { + "epoch": 0.03913258847487479, + "grad_norm": 0.8819016814231873, + "learning_rate": 9.992987188164505e-06, + "loss": 0.7736, + "step": 711 + }, + { + "epoch": 0.03918762727723045, + "grad_norm": 1.0033677816390991, + "learning_rate": 9.992964219574852e-06, + "loss": 0.9919, + "step": 712 + }, + { + "epoch": 0.03924266607958611, + "grad_norm": 0.894926130771637, + "learning_rate": 9.992941213459417e-06, + "loss": 0.9058, + "step": 713 + }, + { + "epoch": 0.03929770488194177, + "grad_norm": 0.9481377005577087, + "learning_rate": 9.992918169818373e-06, + "loss": 0.8436, + "step": 714 + }, + { + "epoch": 0.03935274368429743, + "grad_norm": 0.9312933087348938, + "learning_rate": 9.992895088651893e-06, + "loss": 0.8869, + "step": 715 + }, + { + "epoch": 0.03940778248665309, + "grad_norm": 0.9765705466270447, + "learning_rate": 9.99287196996015e-06, + "loss": 0.9512, + "step": 716 + }, + { + "epoch": 0.03946282128900875, + "grad_norm": 0.9610235691070557, + "learning_rate": 9.992848813743317e-06, + "loss": 0.8005, + "step": 717 + }, + { + "epoch": 0.03951786009136441, + "grad_norm": 1.102995753288269, + "learning_rate": 9.99282562000157e-06, + "loss": 0.8017, + "step": 718 + }, + { + "epoch": 0.03957289889372007, + "grad_norm": 1.023317575454712, + "learning_rate": 9.99280238873508e-06, + "loss": 0.911, + "step": 719 + }, + { + "epoch": 0.039627937696075734, + "grad_norm": 1.0531049966812134, + "learning_rate": 9.992779119944025e-06, + "loss": 0.8562, + "step": 720 + }, + { + "epoch": 0.039682976498431395, + "grad_norm": 0.918250322341919, + "learning_rate": 9.992755813628579e-06, + "loss": 0.92, + "step": 721 + }, + { + "epoch": 0.039738015300787057, + "grad_norm": 0.8508251309394836, + "learning_rate": 9.992732469788915e-06, + "loss": 0.7347, + "step": 722 + }, + { + "epoch": 0.03979305410314272, + "grad_norm": 0.9184926152229309, + "learning_rate": 9.992709088425211e-06, + "loss": 0.8732, + "step": 723 + }, + { + "epoch": 0.03984809290549838, + "grad_norm": 1.1613929271697998, + "learning_rate": 9.992685669537643e-06, + "loss": 0.9522, + "step": 724 + }, + { + "epoch": 0.039903131707854034, + "grad_norm": 1.091513752937317, + "learning_rate": 9.992662213126386e-06, + "loss": 0.9646, + "step": 725 + }, + { + "epoch": 0.039958170510209695, + "grad_norm": 1.057803750038147, + "learning_rate": 9.992638719191615e-06, + "loss": 0.7032, + "step": 726 + }, + { + "epoch": 0.040013209312565357, + "grad_norm": 0.8771823644638062, + "learning_rate": 9.992615187733508e-06, + "loss": 0.8577, + "step": 727 + }, + { + "epoch": 0.04006824811492102, + "grad_norm": 0.9471028447151184, + "learning_rate": 9.992591618752244e-06, + "loss": 0.9057, + "step": 728 + }, + { + "epoch": 0.04012328691727668, + "grad_norm": 0.9547705054283142, + "learning_rate": 9.992568012247995e-06, + "loss": 0.9549, + "step": 729 + }, + { + "epoch": 0.04017832571963234, + "grad_norm": 0.8862974047660828, + "learning_rate": 9.992544368220941e-06, + "loss": 0.8593, + "step": 730 + }, + { + "epoch": 0.040233364521988, + "grad_norm": 0.906334400177002, + "learning_rate": 9.992520686671261e-06, + "loss": 0.8832, + "step": 731 + }, + { + "epoch": 0.04028840332434366, + "grad_norm": 1.07270085811615, + "learning_rate": 9.992496967599133e-06, + "loss": 0.9409, + "step": 732 + }, + { + "epoch": 0.040343442126699325, + "grad_norm": 0.9026005268096924, + "learning_rate": 9.992473211004734e-06, + "loss": 0.8326, + "step": 733 + }, + { + "epoch": 0.040398480929054986, + "grad_norm": 0.9762942790985107, + "learning_rate": 9.992449416888241e-06, + "loss": 0.9048, + "step": 734 + }, + { + "epoch": 0.04045351973141065, + "grad_norm": 0.9658033847808838, + "learning_rate": 9.992425585249837e-06, + "loss": 0.9219, + "step": 735 + }, + { + "epoch": 0.0405085585337663, + "grad_norm": 0.8909044861793518, + "learning_rate": 9.992401716089698e-06, + "loss": 0.8564, + "step": 736 + }, + { + "epoch": 0.04056359733612196, + "grad_norm": 1.0387929677963257, + "learning_rate": 9.992377809408001e-06, + "loss": 0.9533, + "step": 737 + }, + { + "epoch": 0.040618636138477625, + "grad_norm": 0.9044275879859924, + "learning_rate": 9.99235386520493e-06, + "loss": 0.8508, + "step": 738 + }, + { + "epoch": 0.040673674940833286, + "grad_norm": 1.019377589225769, + "learning_rate": 9.992329883480667e-06, + "loss": 0.8684, + "step": 739 + }, + { + "epoch": 0.04072871374318895, + "grad_norm": 0.9394627213478088, + "learning_rate": 9.992305864235385e-06, + "loss": 0.7665, + "step": 740 + }, + { + "epoch": 0.04078375254554461, + "grad_norm": 0.8652323484420776, + "learning_rate": 9.99228180746927e-06, + "loss": 0.8576, + "step": 741 + }, + { + "epoch": 0.04083879134790027, + "grad_norm": 0.9347619414329529, + "learning_rate": 9.992257713182502e-06, + "loss": 0.9586, + "step": 742 + }, + { + "epoch": 0.04089383015025593, + "grad_norm": 0.9510203003883362, + "learning_rate": 9.99223358137526e-06, + "loss": 0.9092, + "step": 743 + }, + { + "epoch": 0.04094886895261159, + "grad_norm": 0.8242866396903992, + "learning_rate": 9.992209412047729e-06, + "loss": 0.6997, + "step": 744 + }, + { + "epoch": 0.041003907754967255, + "grad_norm": 0.8842730522155762, + "learning_rate": 9.992185205200087e-06, + "loss": 0.8873, + "step": 745 + }, + { + "epoch": 0.041058946557322916, + "grad_norm": 1.0813730955123901, + "learning_rate": 9.992160960832518e-06, + "loss": 1.0162, + "step": 746 + }, + { + "epoch": 0.04111398535967857, + "grad_norm": 1.1276283264160156, + "learning_rate": 9.9921366789452e-06, + "loss": 1.0004, + "step": 747 + }, + { + "epoch": 0.04116902416203423, + "grad_norm": 0.8810326457023621, + "learning_rate": 9.992112359538323e-06, + "loss": 0.7823, + "step": 748 + }, + { + "epoch": 0.04122406296438989, + "grad_norm": 0.9939407110214233, + "learning_rate": 9.992088002612066e-06, + "loss": 1.0016, + "step": 749 + }, + { + "epoch": 0.041279101766745555, + "grad_norm": 1.0963523387908936, + "learning_rate": 9.99206360816661e-06, + "loss": 0.9252, + "step": 750 + }, + { + "epoch": 0.041334140569101216, + "grad_norm": 1.1346478462219238, + "learning_rate": 9.99203917620214e-06, + "loss": 0.9608, + "step": 751 + }, + { + "epoch": 0.04138917937145688, + "grad_norm": 1.0108580589294434, + "learning_rate": 9.992014706718841e-06, + "loss": 0.9179, + "step": 752 + }, + { + "epoch": 0.04144421817381254, + "grad_norm": 0.897293210029602, + "learning_rate": 9.991990199716894e-06, + "loss": 0.9295, + "step": 753 + }, + { + "epoch": 0.0414992569761682, + "grad_norm": 1.0152363777160645, + "learning_rate": 9.991965655196488e-06, + "loss": 0.8467, + "step": 754 + }, + { + "epoch": 0.04155429577852386, + "grad_norm": 0.8655388355255127, + "learning_rate": 9.9919410731578e-06, + "loss": 0.796, + "step": 755 + }, + { + "epoch": 0.04160933458087952, + "grad_norm": 1.0140331983566284, + "learning_rate": 9.991916453601023e-06, + "loss": 0.8444, + "step": 756 + }, + { + "epoch": 0.041664373383235184, + "grad_norm": 0.9387341141700745, + "learning_rate": 9.991891796526338e-06, + "loss": 0.8669, + "step": 757 + }, + { + "epoch": 0.04171941218559084, + "grad_norm": 0.9395696520805359, + "learning_rate": 9.991867101933928e-06, + "loss": 0.8376, + "step": 758 + }, + { + "epoch": 0.0417744509879465, + "grad_norm": 1.0856634378433228, + "learning_rate": 9.991842369823983e-06, + "loss": 0.9271, + "step": 759 + }, + { + "epoch": 0.04182948979030216, + "grad_norm": 0.8777190446853638, + "learning_rate": 9.991817600196687e-06, + "loss": 0.9197, + "step": 760 + }, + { + "epoch": 0.04188452859265782, + "grad_norm": 0.9639917016029358, + "learning_rate": 9.991792793052225e-06, + "loss": 0.8835, + "step": 761 + }, + { + "epoch": 0.041939567395013484, + "grad_norm": 0.9384773969650269, + "learning_rate": 9.991767948390785e-06, + "loss": 0.8403, + "step": 762 + }, + { + "epoch": 0.041994606197369146, + "grad_norm": 0.8987650275230408, + "learning_rate": 9.991743066212554e-06, + "loss": 0.7948, + "step": 763 + }, + { + "epoch": 0.04204964499972481, + "grad_norm": 1.0545049905776978, + "learning_rate": 9.991718146517717e-06, + "loss": 0.9359, + "step": 764 + }, + { + "epoch": 0.04210468380208047, + "grad_norm": 0.9840022325515747, + "learning_rate": 9.991693189306463e-06, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.04215972260443613, + "grad_norm": 0.8769927620887756, + "learning_rate": 9.991668194578981e-06, + "loss": 0.8647, + "step": 766 + }, + { + "epoch": 0.04221476140679179, + "grad_norm": 0.9268791675567627, + "learning_rate": 9.991643162335455e-06, + "loss": 0.897, + "step": 767 + }, + { + "epoch": 0.042269800209147446, + "grad_norm": 0.9316747784614563, + "learning_rate": 9.991618092576075e-06, + "loss": 0.9341, + "step": 768 + }, + { + "epoch": 0.04232483901150311, + "grad_norm": 0.8348364233970642, + "learning_rate": 9.991592985301031e-06, + "loss": 0.7528, + "step": 769 + }, + { + "epoch": 0.04237987781385877, + "grad_norm": 0.9139068126678467, + "learning_rate": 9.99156784051051e-06, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 0.04243491661621443, + "grad_norm": 0.9403928518295288, + "learning_rate": 9.991542658204701e-06, + "loss": 0.974, + "step": 771 + }, + { + "epoch": 0.04248995541857009, + "grad_norm": 0.993549108505249, + "learning_rate": 9.991517438383793e-06, + "loss": 0.9479, + "step": 772 + }, + { + "epoch": 0.04254499422092575, + "grad_norm": 0.8494916558265686, + "learning_rate": 9.991492181047975e-06, + "loss": 0.9149, + "step": 773 + }, + { + "epoch": 0.042600033023281414, + "grad_norm": 1.0351910591125488, + "learning_rate": 9.991466886197441e-06, + "loss": 0.9552, + "step": 774 + }, + { + "epoch": 0.042655071825637075, + "grad_norm": 0.916829526424408, + "learning_rate": 9.991441553832375e-06, + "loss": 0.8781, + "step": 775 + }, + { + "epoch": 0.04271011062799274, + "grad_norm": 1.113476276397705, + "learning_rate": 9.991416183952972e-06, + "loss": 0.8137, + "step": 776 + }, + { + "epoch": 0.0427651494303484, + "grad_norm": 1.1608171463012695, + "learning_rate": 9.991390776559421e-06, + "loss": 1.0045, + "step": 777 + }, + { + "epoch": 0.04282018823270406, + "grad_norm": 1.0045493841171265, + "learning_rate": 9.991365331651913e-06, + "loss": 0.8813, + "step": 778 + }, + { + "epoch": 0.042875227035059714, + "grad_norm": 0.918820858001709, + "learning_rate": 9.991339849230639e-06, + "loss": 0.9198, + "step": 779 + }, + { + "epoch": 0.042930265837415375, + "grad_norm": 0.9875735640525818, + "learning_rate": 9.991314329295792e-06, + "loss": 0.8665, + "step": 780 + }, + { + "epoch": 0.04298530463977104, + "grad_norm": 0.873768150806427, + "learning_rate": 9.991288771847561e-06, + "loss": 0.8606, + "step": 781 + }, + { + "epoch": 0.0430403434421267, + "grad_norm": 0.8892746567726135, + "learning_rate": 9.991263176886139e-06, + "loss": 0.9011, + "step": 782 + }, + { + "epoch": 0.04309538224448236, + "grad_norm": 1.097734808921814, + "learning_rate": 9.99123754441172e-06, + "loss": 1.009, + "step": 783 + }, + { + "epoch": 0.04315042104683802, + "grad_norm": 1.0065964460372925, + "learning_rate": 9.991211874424497e-06, + "loss": 0.9492, + "step": 784 + }, + { + "epoch": 0.04320545984919368, + "grad_norm": 1.0791678428649902, + "learning_rate": 9.99118616692466e-06, + "loss": 1.0142, + "step": 785 + }, + { + "epoch": 0.043260498651549344, + "grad_norm": 0.9454777836799622, + "learning_rate": 9.991160421912404e-06, + "loss": 0.8058, + "step": 786 + }, + { + "epoch": 0.043315537453905005, + "grad_norm": 0.9448156952857971, + "learning_rate": 9.991134639387922e-06, + "loss": 0.8184, + "step": 787 + }, + { + "epoch": 0.043370576256260666, + "grad_norm": 0.9636550545692444, + "learning_rate": 9.99110881935141e-06, + "loss": 0.8606, + "step": 788 + }, + { + "epoch": 0.04342561505861633, + "grad_norm": 0.9933613538742065, + "learning_rate": 9.991082961803058e-06, + "loss": 0.9449, + "step": 789 + }, + { + "epoch": 0.04348065386097198, + "grad_norm": 0.8906797170639038, + "learning_rate": 9.991057066743065e-06, + "loss": 0.8053, + "step": 790 + }, + { + "epoch": 0.043535692663327644, + "grad_norm": 1.0393906831741333, + "learning_rate": 9.991031134171621e-06, + "loss": 0.8487, + "step": 791 + }, + { + "epoch": 0.043590731465683305, + "grad_norm": 1.0618231296539307, + "learning_rate": 9.991005164088923e-06, + "loss": 0.9847, + "step": 792 + }, + { + "epoch": 0.043645770268038966, + "grad_norm": 0.9525149464607239, + "learning_rate": 9.990979156495167e-06, + "loss": 0.9318, + "step": 793 + }, + { + "epoch": 0.04370080907039463, + "grad_norm": 0.9430851936340332, + "learning_rate": 9.990953111390546e-06, + "loss": 0.8483, + "step": 794 + }, + { + "epoch": 0.04375584787275029, + "grad_norm": 0.9259672164916992, + "learning_rate": 9.99092702877526e-06, + "loss": 0.9365, + "step": 795 + }, + { + "epoch": 0.04381088667510595, + "grad_norm": 0.942609965801239, + "learning_rate": 9.9909009086495e-06, + "loss": 0.8408, + "step": 796 + }, + { + "epoch": 0.04386592547746161, + "grad_norm": 0.939255952835083, + "learning_rate": 9.990874751013467e-06, + "loss": 0.8749, + "step": 797 + }, + { + "epoch": 0.04392096427981727, + "grad_norm": 1.1701711416244507, + "learning_rate": 9.990848555867353e-06, + "loss": 0.9312, + "step": 798 + }, + { + "epoch": 0.043976003082172935, + "grad_norm": 1.0441124439239502, + "learning_rate": 9.990822323211358e-06, + "loss": 0.8618, + "step": 799 + }, + { + "epoch": 0.04403104188452859, + "grad_norm": 0.9601489305496216, + "learning_rate": 9.990796053045679e-06, + "loss": 0.9569, + "step": 800 + }, + { + "epoch": 0.04408608068688425, + "grad_norm": 0.9394032955169678, + "learning_rate": 9.990769745370513e-06, + "loss": 0.846, + "step": 801 + }, + { + "epoch": 0.04414111948923991, + "grad_norm": 0.9631348252296448, + "learning_rate": 9.990743400186056e-06, + "loss": 0.8754, + "step": 802 + }, + { + "epoch": 0.04419615829159557, + "grad_norm": 0.9234963059425354, + "learning_rate": 9.990717017492508e-06, + "loss": 0.8613, + "step": 803 + }, + { + "epoch": 0.044251197093951235, + "grad_norm": 0.9169090390205383, + "learning_rate": 9.990690597290069e-06, + "loss": 0.8867, + "step": 804 + }, + { + "epoch": 0.044306235896306896, + "grad_norm": 1.0194867849349976, + "learning_rate": 9.990664139578933e-06, + "loss": 0.8675, + "step": 805 + }, + { + "epoch": 0.04436127469866256, + "grad_norm": 1.3226114511489868, + "learning_rate": 9.990637644359302e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.04441631350101822, + "grad_norm": 0.8904317617416382, + "learning_rate": 9.990611111631374e-06, + "loss": 0.7274, + "step": 807 + }, + { + "epoch": 0.04447135230337388, + "grad_norm": 0.8909007906913757, + "learning_rate": 9.99058454139535e-06, + "loss": 0.8141, + "step": 808 + }, + { + "epoch": 0.04452639110572954, + "grad_norm": 1.004015564918518, + "learning_rate": 9.990557933651429e-06, + "loss": 0.9883, + "step": 809 + }, + { + "epoch": 0.0445814299080852, + "grad_norm": 1.1215732097625732, + "learning_rate": 9.990531288399807e-06, + "loss": 0.9355, + "step": 810 + }, + { + "epoch": 0.04463646871044086, + "grad_norm": 1.0545012950897217, + "learning_rate": 9.99050460564069e-06, + "loss": 0.9532, + "step": 811 + }, + { + "epoch": 0.04469150751279652, + "grad_norm": 0.9608867168426514, + "learning_rate": 9.990477885374277e-06, + "loss": 0.9363, + "step": 812 + }, + { + "epoch": 0.04474654631515218, + "grad_norm": 0.8750461935997009, + "learning_rate": 9.990451127600766e-06, + "loss": 0.7343, + "step": 813 + }, + { + "epoch": 0.04480158511750784, + "grad_norm": 0.891740620136261, + "learning_rate": 9.99042433232036e-06, + "loss": 0.8541, + "step": 814 + }, + { + "epoch": 0.0448566239198635, + "grad_norm": 1.1520029306411743, + "learning_rate": 9.990397499533264e-06, + "loss": 0.7696, + "step": 815 + }, + { + "epoch": 0.044911662722219164, + "grad_norm": 0.9526278972625732, + "learning_rate": 9.990370629239673e-06, + "loss": 0.8953, + "step": 816 + }, + { + "epoch": 0.044966701524574826, + "grad_norm": 0.9218434691429138, + "learning_rate": 9.990343721439795e-06, + "loss": 0.8198, + "step": 817 + }, + { + "epoch": 0.04502174032693049, + "grad_norm": 0.8502745628356934, + "learning_rate": 9.990316776133827e-06, + "loss": 0.8035, + "step": 818 + }, + { + "epoch": 0.04507677912928615, + "grad_norm": 0.8861565589904785, + "learning_rate": 9.990289793321975e-06, + "loss": 0.8626, + "step": 819 + }, + { + "epoch": 0.04513181793164181, + "grad_norm": 1.1113256216049194, + "learning_rate": 9.99026277300444e-06, + "loss": 0.9363, + "step": 820 + }, + { + "epoch": 0.04518685673399747, + "grad_norm": 0.9984708428382874, + "learning_rate": 9.990235715181426e-06, + "loss": 1.0376, + "step": 821 + }, + { + "epoch": 0.045241895536353126, + "grad_norm": 0.9026711583137512, + "learning_rate": 9.990208619853137e-06, + "loss": 0.9079, + "step": 822 + }, + { + "epoch": 0.04529693433870879, + "grad_norm": 0.8724965453147888, + "learning_rate": 9.990181487019775e-06, + "loss": 0.8665, + "step": 823 + }, + { + "epoch": 0.04535197314106445, + "grad_norm": 0.8923047780990601, + "learning_rate": 9.990154316681543e-06, + "loss": 0.7779, + "step": 824 + }, + { + "epoch": 0.04540701194342011, + "grad_norm": 0.9024640321731567, + "learning_rate": 9.99012710883865e-06, + "loss": 0.8859, + "step": 825 + }, + { + "epoch": 0.04546205074577577, + "grad_norm": 0.9245888590812683, + "learning_rate": 9.990099863491296e-06, + "loss": 0.8501, + "step": 826 + }, + { + "epoch": 0.04551708954813143, + "grad_norm": 0.9257050156593323, + "learning_rate": 9.990072580639687e-06, + "loss": 0.9561, + "step": 827 + }, + { + "epoch": 0.045572128350487094, + "grad_norm": 0.995610773563385, + "learning_rate": 9.99004526028403e-06, + "loss": 0.917, + "step": 828 + }, + { + "epoch": 0.045627167152842756, + "grad_norm": 0.9524009823799133, + "learning_rate": 9.990017902424525e-06, + "loss": 0.9184, + "step": 829 + }, + { + "epoch": 0.04568220595519842, + "grad_norm": 0.9264503121376038, + "learning_rate": 9.989990507061385e-06, + "loss": 0.8615, + "step": 830 + }, + { + "epoch": 0.04573724475755408, + "grad_norm": 1.0068570375442505, + "learning_rate": 9.989963074194809e-06, + "loss": 0.8331, + "step": 831 + }, + { + "epoch": 0.04579228355990974, + "grad_norm": 0.9295952320098877, + "learning_rate": 9.989935603825009e-06, + "loss": 0.8387, + "step": 832 + }, + { + "epoch": 0.045847322362265394, + "grad_norm": 1.0408827066421509, + "learning_rate": 9.989908095952186e-06, + "loss": 0.9686, + "step": 833 + }, + { + "epoch": 0.045902361164621056, + "grad_norm": 0.8874136209487915, + "learning_rate": 9.989880550576551e-06, + "loss": 0.815, + "step": 834 + }, + { + "epoch": 0.04595739996697672, + "grad_norm": 0.9898836016654968, + "learning_rate": 9.989852967698311e-06, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 0.04601243876933238, + "grad_norm": 0.9828970432281494, + "learning_rate": 9.989825347317668e-06, + "loss": 0.7922, + "step": 836 + }, + { + "epoch": 0.04606747757168804, + "grad_norm": 1.025447964668274, + "learning_rate": 9.989797689434836e-06, + "loss": 0.9349, + "step": 837 + }, + { + "epoch": 0.0461225163740437, + "grad_norm": 0.8623831272125244, + "learning_rate": 9.98976999405002e-06, + "loss": 0.8786, + "step": 838 + }, + { + "epoch": 0.04617755517639936, + "grad_norm": 0.9614997506141663, + "learning_rate": 9.98974226116343e-06, + "loss": 0.7885, + "step": 839 + }, + { + "epoch": 0.046232593978755024, + "grad_norm": 1.0207616090774536, + "learning_rate": 9.989714490775269e-06, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.046287632781110685, + "grad_norm": 0.8509595990180969, + "learning_rate": 9.98968668288575e-06, + "loss": 0.7312, + "step": 841 + }, + { + "epoch": 0.04634267158346635, + "grad_norm": 0.9822607040405273, + "learning_rate": 9.989658837495084e-06, + "loss": 0.952, + "step": 842 + }, + { + "epoch": 0.046397710385822, + "grad_norm": 1.0058252811431885, + "learning_rate": 9.989630954603477e-06, + "loss": 0.8811, + "step": 843 + }, + { + "epoch": 0.04645274918817766, + "grad_norm": 1.0146985054016113, + "learning_rate": 9.989603034211139e-06, + "loss": 0.9051, + "step": 844 + }, + { + "epoch": 0.046507787990533324, + "grad_norm": 0.8976503610610962, + "learning_rate": 9.98957507631828e-06, + "loss": 0.879, + "step": 845 + }, + { + "epoch": 0.046562826792888985, + "grad_norm": 0.8791939616203308, + "learning_rate": 9.989547080925111e-06, + "loss": 0.8944, + "step": 846 + }, + { + "epoch": 0.04661786559524465, + "grad_norm": 0.8530884981155396, + "learning_rate": 9.989519048031842e-06, + "loss": 0.9029, + "step": 847 + }, + { + "epoch": 0.04667290439760031, + "grad_norm": 0.9621617197990417, + "learning_rate": 9.989490977638683e-06, + "loss": 0.8374, + "step": 848 + }, + { + "epoch": 0.04672794319995597, + "grad_norm": 0.9629075527191162, + "learning_rate": 9.989462869745845e-06, + "loss": 0.9032, + "step": 849 + }, + { + "epoch": 0.04678298200231163, + "grad_norm": 1.3256126642227173, + "learning_rate": 9.989434724353541e-06, + "loss": 0.9748, + "step": 850 + }, + { + "epoch": 0.04683802080466729, + "grad_norm": 1.0230494737625122, + "learning_rate": 9.989406541461979e-06, + "loss": 0.9752, + "step": 851 + }, + { + "epoch": 0.046893059607022954, + "grad_norm": 0.8454533219337463, + "learning_rate": 9.989378321071375e-06, + "loss": 0.8426, + "step": 852 + }, + { + "epoch": 0.046948098409378615, + "grad_norm": 0.9995863437652588, + "learning_rate": 9.989350063181939e-06, + "loss": 0.9955, + "step": 853 + }, + { + "epoch": 0.04700313721173427, + "grad_norm": 0.8956604599952698, + "learning_rate": 9.989321767793883e-06, + "loss": 0.9024, + "step": 854 + }, + { + "epoch": 0.04705817601408993, + "grad_norm": 1.0123292207717896, + "learning_rate": 9.989293434907419e-06, + "loss": 0.7856, + "step": 855 + }, + { + "epoch": 0.04711321481644559, + "grad_norm": 0.814577043056488, + "learning_rate": 9.989265064522762e-06, + "loss": 0.8377, + "step": 856 + }, + { + "epoch": 0.047168253618801254, + "grad_norm": 1.1571552753448486, + "learning_rate": 9.989236656640125e-06, + "loss": 0.8562, + "step": 857 + }, + { + "epoch": 0.047223292421156915, + "grad_norm": 0.9681577682495117, + "learning_rate": 9.98920821125972e-06, + "loss": 0.8473, + "step": 858 + }, + { + "epoch": 0.047278331223512576, + "grad_norm": 0.9680121541023254, + "learning_rate": 9.989179728381761e-06, + "loss": 0.9811, + "step": 859 + }, + { + "epoch": 0.04733337002586824, + "grad_norm": 0.985477089881897, + "learning_rate": 9.989151208006464e-06, + "loss": 0.6994, + "step": 860 + }, + { + "epoch": 0.0473884088282239, + "grad_norm": 0.8612962365150452, + "learning_rate": 9.98912265013404e-06, + "loss": 0.7667, + "step": 861 + }, + { + "epoch": 0.04744344763057956, + "grad_norm": 0.8884604573249817, + "learning_rate": 9.989094054764708e-06, + "loss": 0.8382, + "step": 862 + }, + { + "epoch": 0.04749848643293522, + "grad_norm": 1.036881923675537, + "learning_rate": 9.989065421898681e-06, + "loss": 0.8748, + "step": 863 + }, + { + "epoch": 0.04755352523529088, + "grad_norm": 0.9954493045806885, + "learning_rate": 9.989036751536171e-06, + "loss": 0.9174, + "step": 864 + }, + { + "epoch": 0.04760856403764654, + "grad_norm": 0.9984694123268127, + "learning_rate": 9.989008043677399e-06, + "loss": 0.7636, + "step": 865 + }, + { + "epoch": 0.0476636028400022, + "grad_norm": 1.0412588119506836, + "learning_rate": 9.988979298322576e-06, + "loss": 0.773, + "step": 866 + }, + { + "epoch": 0.04771864164235786, + "grad_norm": 0.8034874796867371, + "learning_rate": 9.98895051547192e-06, + "loss": 0.7914, + "step": 867 + }, + { + "epoch": 0.04777368044471352, + "grad_norm": 0.8983979225158691, + "learning_rate": 9.988921695125648e-06, + "loss": 0.7292, + "step": 868 + }, + { + "epoch": 0.04782871924706918, + "grad_norm": 0.9445077776908875, + "learning_rate": 9.988892837283976e-06, + "loss": 0.8263, + "step": 869 + }, + { + "epoch": 0.047883758049424845, + "grad_norm": 1.0753306150436401, + "learning_rate": 9.988863941947121e-06, + "loss": 1.1122, + "step": 870 + }, + { + "epoch": 0.047938796851780506, + "grad_norm": 1.0091484785079956, + "learning_rate": 9.9888350091153e-06, + "loss": 0.9276, + "step": 871 + }, + { + "epoch": 0.04799383565413617, + "grad_norm": 1.0977306365966797, + "learning_rate": 9.988806038788732e-06, + "loss": 0.854, + "step": 872 + }, + { + "epoch": 0.04804887445649183, + "grad_norm": 1.0285007953643799, + "learning_rate": 9.988777030967632e-06, + "loss": 0.9441, + "step": 873 + }, + { + "epoch": 0.04810391325884749, + "grad_norm": 0.8973976373672485, + "learning_rate": 9.988747985652218e-06, + "loss": 0.786, + "step": 874 + }, + { + "epoch": 0.04815895206120315, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.98871890284271e-06, + "loss": 0.9042, + "step": 875 + }, + { + "epoch": 0.048213990863558806, + "grad_norm": 0.8514279723167419, + "learning_rate": 9.988689782539326e-06, + "loss": 0.7874, + "step": 876 + }, + { + "epoch": 0.04826902966591447, + "grad_norm": 0.8299674391746521, + "learning_rate": 9.988660624742286e-06, + "loss": 0.8704, + "step": 877 + }, + { + "epoch": 0.04832406846827013, + "grad_norm": 0.9862462282180786, + "learning_rate": 9.988631429451809e-06, + "loss": 0.9963, + "step": 878 + }, + { + "epoch": 0.04837910727062579, + "grad_norm": 0.9041131734848022, + "learning_rate": 9.988602196668111e-06, + "loss": 0.9207, + "step": 879 + }, + { + "epoch": 0.04843414607298145, + "grad_norm": 0.8597276210784912, + "learning_rate": 9.988572926391416e-06, + "loss": 0.8226, + "step": 880 + }, + { + "epoch": 0.04848918487533711, + "grad_norm": 0.9494329690933228, + "learning_rate": 9.988543618621941e-06, + "loss": 0.8834, + "step": 881 + }, + { + "epoch": 0.048544223677692774, + "grad_norm": 0.9129118323326111, + "learning_rate": 9.98851427335991e-06, + "loss": 0.7819, + "step": 882 + }, + { + "epoch": 0.048599262480048436, + "grad_norm": 0.9145999550819397, + "learning_rate": 9.988484890605539e-06, + "loss": 0.885, + "step": 883 + }, + { + "epoch": 0.0486543012824041, + "grad_norm": 1.0115307569503784, + "learning_rate": 9.98845547035905e-06, + "loss": 0.8347, + "step": 884 + }, + { + "epoch": 0.04870934008475976, + "grad_norm": 1.1372706890106201, + "learning_rate": 9.988426012620667e-06, + "loss": 0.944, + "step": 885 + }, + { + "epoch": 0.04876437888711541, + "grad_norm": 0.9502811431884766, + "learning_rate": 9.98839651739061e-06, + "loss": 0.9054, + "step": 886 + }, + { + "epoch": 0.048819417689471074, + "grad_norm": 0.9612823128700256, + "learning_rate": 9.988366984669097e-06, + "loss": 0.8796, + "step": 887 + }, + { + "epoch": 0.048874456491826736, + "grad_norm": 0.9551461935043335, + "learning_rate": 9.988337414456355e-06, + "loss": 0.8769, + "step": 888 + }, + { + "epoch": 0.0489294952941824, + "grad_norm": 0.8554086089134216, + "learning_rate": 9.988307806752603e-06, + "loss": 0.892, + "step": 889 + }, + { + "epoch": 0.04898453409653806, + "grad_norm": 0.8418886661529541, + "learning_rate": 9.988278161558067e-06, + "loss": 0.7568, + "step": 890 + }, + { + "epoch": 0.04903957289889372, + "grad_norm": 1.4780360460281372, + "learning_rate": 9.988248478872967e-06, + "loss": 0.9126, + "step": 891 + }, + { + "epoch": 0.04909461170124938, + "grad_norm": 0.8236714005470276, + "learning_rate": 9.988218758697526e-06, + "loss": 0.7317, + "step": 892 + }, + { + "epoch": 0.04914965050360504, + "grad_norm": 0.8777141571044922, + "learning_rate": 9.988189001031968e-06, + "loss": 0.7989, + "step": 893 + }, + { + "epoch": 0.049204689305960704, + "grad_norm": 1.0235031843185425, + "learning_rate": 9.988159205876516e-06, + "loss": 0.8335, + "step": 894 + }, + { + "epoch": 0.049259728108316365, + "grad_norm": 0.9340357184410095, + "learning_rate": 9.988129373231395e-06, + "loss": 0.8129, + "step": 895 + }, + { + "epoch": 0.04931476691067203, + "grad_norm": 1.7686667442321777, + "learning_rate": 9.98809950309683e-06, + "loss": 0.9792, + "step": 896 + }, + { + "epoch": 0.04936980571302768, + "grad_norm": 0.9252369403839111, + "learning_rate": 9.988069595473044e-06, + "loss": 0.8671, + "step": 897 + }, + { + "epoch": 0.04942484451538334, + "grad_norm": 0.9989960789680481, + "learning_rate": 9.988039650360262e-06, + "loss": 0.9245, + "step": 898 + }, + { + "epoch": 0.049479883317739004, + "grad_norm": 1.062912106513977, + "learning_rate": 9.98800966775871e-06, + "loss": 0.9146, + "step": 899 + }, + { + "epoch": 0.049534922120094665, + "grad_norm": 0.8698169589042664, + "learning_rate": 9.98797964766861e-06, + "loss": 0.8606, + "step": 900 + }, + { + "epoch": 0.04958996092245033, + "grad_norm": 1.6754224300384521, + "learning_rate": 9.98794959009019e-06, + "loss": 0.9236, + "step": 901 + }, + { + "epoch": 0.04964499972480599, + "grad_norm": 1.084174394607544, + "learning_rate": 9.98791949502368e-06, + "loss": 0.9252, + "step": 902 + }, + { + "epoch": 0.04970003852716165, + "grad_norm": 0.9866724610328674, + "learning_rate": 9.987889362469301e-06, + "loss": 0.9096, + "step": 903 + }, + { + "epoch": 0.04975507732951731, + "grad_norm": 0.8814040422439575, + "learning_rate": 9.987859192427279e-06, + "loss": 0.8475, + "step": 904 + }, + { + "epoch": 0.04981011613187297, + "grad_norm": 0.8796457052230835, + "learning_rate": 9.987828984897843e-06, + "loss": 0.8478, + "step": 905 + }, + { + "epoch": 0.049865154934228634, + "grad_norm": 1.0541884899139404, + "learning_rate": 9.98779873988122e-06, + "loss": 0.9799, + "step": 906 + }, + { + "epoch": 0.049920193736584295, + "grad_norm": 0.91409832239151, + "learning_rate": 9.987768457377636e-06, + "loss": 0.8701, + "step": 907 + }, + { + "epoch": 0.04997523253893995, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.98773813738732e-06, + "loss": 0.8417, + "step": 908 + }, + { + "epoch": 0.05003027134129561, + "grad_norm": 1.7744206190109253, + "learning_rate": 9.987707779910499e-06, + "loss": 0.9263, + "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + } + ], + "logging_steps": 1, + "max_steps": 36338, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 909, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3412573570609971e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4545/training_args.bin b/checkpoint-4545/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..006183302820d451c9ca29db41c5d8a020225b2a --- /dev/null +++ b/checkpoint-4545/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc35e326b781438eda71a1f881999ddc9f323429e8f60e362a744617b4ee255 +size 7928 diff --git a/checkpoint-4545/zero_to_fp32.py b/checkpoint-4545/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-4545/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-5454/config.json b/checkpoint-5454/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0153f8ee396146a87c398da9234b3dce005be --- /dev/null +++ b/checkpoint-5454/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "./meta-llama_Llama-3.1-8B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.1", + "use_cache": false, + "vocab_size": 128259 +} diff --git a/checkpoint-5454/generation_config.json b/checkpoint-5454/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eab5082496e8b01f9c606a306676cbfabe0cce9d --- /dev/null +++ b/checkpoint-5454/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.46.1" +} diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2724b71a463c119039c4b759368f8d6e771fda3 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca798506d41ee04863d83c2c9ec540cab088aaa0ba017a3420d14b626f055c71 +size 12045435328 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfdc861a0185a9eca375257c1ecb3b7a1d930c87 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d30e2389914fe73e044a543af00178997a2d2a04ce676f4229b771b470f105 +size 12045436096 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2aac37937eadafe91a63f6c73e3c30ee26bc4b3 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc14a5ff7294f527eb4d7d8f5c070a13b87bf8aa8472d93e940b1b2e732df3e7 +size 12045436352 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36daf8070dc9812259c7c1c84f79076cb052923f --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6333ec0eb7c032eb8b0b00e3d4672c2d516ea170a79d552bb44ef5c0bd97cbb2 +size 12045436096 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66bc3b2bc2906c99b0cc799ba1584cae9d21438f --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f543281b9044d5406a7db61f8621247992c1547cfd78393fbb11aca6a4fe6fa1 +size 12045436352 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d119bf5554aa3a072fb9bb9cd72004204c7f0aa5 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f25450cb49fa62de770b9844ec698a8938941cee018e4618ad100dc833b79b8 +size 12045436416 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30ba3dc806e3ab0ae1d389875db8dba161ccd689 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ea6c94f3c6a8905cf0078e6801d30320750f7ce6bc22cf824733a09efa3634 +size 12045436096 diff --git a/checkpoint-5454/global_step5454/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4f073abfddbd17b78a2a5d19f204417129cd839 --- /dev/null +++ b/checkpoint-5454/global_step5454/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de943ff2dbe55aa6f506d6d8e62740c1b5daaf0e83129dde02f7263029419f73 +size 12045435008 diff --git a/checkpoint-5454/global_step5454/mp_rank_00_model_states.pt b/checkpoint-5454/global_step5454/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27b43babc2670b6ec364364f5c074266c9126ca4 --- /dev/null +++ b/checkpoint-5454/global_step5454/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc87139f3f8c36be1c225169c489c5fc20ca904daebd3b0f83ec0b9018838c9d +size 16060659704 diff --git a/checkpoint-5454/latest b/checkpoint-5454/latest new file mode 100644 index 0000000000000000000000000000000000000000..56712c654ac69115e570f3d5303e6d0b43c2da0a --- /dev/null +++ b/checkpoint-5454/latest @@ -0,0 +1 @@ +global_step5454 \ No newline at end of file diff --git a/checkpoint-5454/model-00001-of-00004.safetensors b/checkpoint-5454/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d256798516a60007f29a4079af28b11e37d8cc81 --- /dev/null +++ b/checkpoint-5454/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b61b0b1150cca0c72f15a9430ba2331799a7de657aa0bd8dc5d49af3d0723cf +size 4976723248 diff --git a/checkpoint-5454/model-00002-of-00004.safetensors b/checkpoint-5454/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..59a89a18e9ee266f983791d50ee95fe3c9e96fec --- /dev/null +++ b/checkpoint-5454/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62978fb9259dd5a757af22e7b2db00dcadef277bae1339f4c08a283987a67a0 +size 4999802720 diff --git a/checkpoint-5454/model-00003-of-00004.safetensors b/checkpoint-5454/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5bc196730f44c4ade290deb47efb783ed6e5cc89 --- /dev/null +++ b/checkpoint-5454/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94b534b79d348d7c6d111a0d89a11f8444b150b492f88ec7d8d514cd5d8374d9 +size 4915916176 diff --git a/checkpoint-5454/model-00004-of-00004.safetensors b/checkpoint-5454/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..225fffeb97e79e4820278d63b8dd5425872a97d1 --- /dev/null +++ b/checkpoint-5454/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5336f7a57a32a2dd2bfbda61de642eb1ad71beabb002c734802e4a9f69e1fdb9 +size 1168163384 diff --git a/checkpoint-5454/model.safetensors.index.json b/checkpoint-5454/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..e734f8f9bcabe95e936a11f19b77148f54640122 --- /dev/null +++ b/checkpoint-5454/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060571648 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/checkpoint-5454/rng_state_0.pth b/checkpoint-5454/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1 --- /dev/null +++ b/checkpoint-5454/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2 +size 15984 diff --git a/checkpoint-5454/rng_state_1.pth b/checkpoint-5454/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8506e00431b6ac7067699c0ea4f59adb6fa0ba20 --- /dev/null +++ b/checkpoint-5454/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0728b56dab7abb5ef8a0d4bae3519c5767c97467bdd886d26bf19cc8599d0312 +size 15984 diff --git a/checkpoint-5454/rng_state_2.pth b/checkpoint-5454/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea499e285c97cca07fedd34662c3d4ab44ff6f47 --- /dev/null +++ b/checkpoint-5454/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e481d4ef1546694da7337f6bb6c658b866dcb79b85deeb477da0d27ebe851e +size 15984 diff --git a/checkpoint-5454/rng_state_3.pth b/checkpoint-5454/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..aeb38f92f106ac3f08bae4f82179a8a12243bccb --- /dev/null +++ b/checkpoint-5454/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353c60be37ea56fc992fca446598ceca5d1fd002aa3bd6dbb9ad740e6f47ebb3 +size 15984 diff --git a/checkpoint-5454/rng_state_4.pth b/checkpoint-5454/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d5856cb7a3f15092fa5593507022316916f648e --- /dev/null +++ b/checkpoint-5454/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9107fe964ba7205e354084b85210e5a5ea1c98cfd4d38adb9cd3926945dcae4 +size 15984 diff --git a/checkpoint-5454/rng_state_5.pth b/checkpoint-5454/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b824ee24d256695aad4a69a62d8e7125f51a17f2 --- /dev/null +++ b/checkpoint-5454/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d1bb1abee38b92e53f3f23549b642ce0f1edcdccf7b6129847ac61636e96d5 +size 15984 diff --git a/checkpoint-5454/rng_state_6.pth b/checkpoint-5454/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9fd0364bb8f1a8e91eca45be5e1b6672b4d9afd --- /dev/null +++ b/checkpoint-5454/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afd5516048e20f36959601574e29e40106085a7d3cdc7bf425ce5e84633490e6 +size 15984 diff --git a/checkpoint-5454/rng_state_7.pth b/checkpoint-5454/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e80125fd18efcb1097384319888b699f4dce7e7 --- /dev/null +++ b/checkpoint-5454/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2c46927fc06939b4c976a01e4b95dec1f8b98ceaea86d31a5d756fc30ff006 +size 15984 diff --git a/checkpoint-5454/scheduler.pt b/checkpoint-5454/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..95e0ee240f95f4401f2fc470254ee7fc2833f6e1 --- /dev/null +++ b/checkpoint-5454/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5619406f3e9dd1500b33a55885d312273c9e90079468e8c44c83d794c30870e7 +size 1064 diff --git a/checkpoint-5454/special_tokens_map.json b/checkpoint-5454/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-5454/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5454/tokenizer.json b/checkpoint-5454/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..9d29771c68b37af9541b4c450532cb095b564ca5 --- /dev/null +++ b/checkpoint-5454/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a36f231bc2882e8c2e1859bc27098f73c95ea211ccb73ad0cdb441a16f49c6 +size 17210280 diff --git a/checkpoint-5454/tokenizer_config.json b/checkpoint-5454/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a695c457b54a00f10768564f6c25b0142ccc840 --- /dev/null +++ b/checkpoint-5454/tokenizer_config.json @@ -0,0 +1,2087 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|im_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|end_title|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|im_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|end_op|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|im_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|end_date|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|begin_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|end_of_post|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|im_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|end_khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|im_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|end_pseudo|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|autheur|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|khey|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|sujet|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128258": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|><|khey|><|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-5454/trainer_state.json b/checkpoint-5454/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e8ecf6e957309889225a82167441230acfd87a5 --- /dev/null +++ b/checkpoint-5454/trainer_state.json @@ -0,0 +1,38211 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3001816280477737, + "eval_steps": 500, + "global_step": 5454, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.503880235566074e-05, + "grad_norm": 459.8753356933594, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.303, + "step": 1 + }, + { + "epoch": 0.00011007760471132149, + "grad_norm": 314.2561950683594, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.8226, + "step": 2 + }, + { + "epoch": 0.0001651164070669822, + "grad_norm": 314.1292419433594, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.8517, + "step": 3 + }, + { + "epoch": 0.00022015520942264297, + "grad_norm": 312.4049072265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6248, + "step": 4 + }, + { + "epoch": 0.0002751940117783037, + "grad_norm": 353.7213134765625, + "learning_rate": 5.000000000000001e-07, + "loss": 2.7883, + "step": 5 + }, + { + "epoch": 0.0003302328141339644, + "grad_norm": 278.41668701171875, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5468, + "step": 6 + }, + { + "epoch": 0.0003852716164896252, + "grad_norm": 336.14532470703125, + "learning_rate": 7.000000000000001e-07, + "loss": 2.7721, + "step": 7 + }, + { + "epoch": 0.00044031041884528595, + "grad_norm": 201.19374084472656, + "learning_rate": 8.000000000000001e-07, + "loss": 2.4873, + "step": 8 + }, + { + "epoch": 0.0004953492212009466, + "grad_norm": 184.7027587890625, + "learning_rate": 9.000000000000001e-07, + "loss": 2.6647, + "step": 9 + }, + { + "epoch": 0.0005503880235566074, + "grad_norm": 154.597412109375, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.602, + "step": 10 + }, + { + "epoch": 0.0006054268259122681, + "grad_norm": 40.47785568237305, + "learning_rate": 1.1e-06, + "loss": 2.6716, + "step": 11 + }, + { + "epoch": 0.0006604656282679288, + "grad_norm": 25.338607788085938, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.2631, + "step": 12 + }, + { + "epoch": 0.0007155044306235897, + "grad_norm": 24.976919174194336, + "learning_rate": 1.3e-06, + "loss": 2.3564, + "step": 13 + }, + { + "epoch": 0.0007705432329792504, + "grad_norm": 15.239912033081055, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3295, + "step": 14 + }, + { + "epoch": 0.0008255820353349112, + "grad_norm": 14.125042915344238, + "learning_rate": 1.5e-06, + "loss": 2.307, + "step": 15 + }, + { + "epoch": 0.0008806208376905719, + "grad_norm": 13.163726806640625, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1493, + "step": 16 + }, + { + "epoch": 0.0009356596400462326, + "grad_norm": 8.726515769958496, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.0333, + "step": 17 + }, + { + "epoch": 0.0009906984424018933, + "grad_norm": 9.072502136230469, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.2046, + "step": 18 + }, + { + "epoch": 0.001045737244757554, + "grad_norm": 9.412588119506836, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.2001, + "step": 19 + }, + { + "epoch": 0.0011007760471132147, + "grad_norm": 8.67534065246582, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.7679, + "step": 20 + }, + { + "epoch": 0.0011558148494688755, + "grad_norm": 14.015918731689453, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.9566, + "step": 21 + }, + { + "epoch": 0.0012108536518245362, + "grad_norm": 7.9474687576293945, + "learning_rate": 2.2e-06, + "loss": 1.9085, + "step": 22 + }, + { + "epoch": 0.001265892454180197, + "grad_norm": 6.806368350982666, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7918, + "step": 23 + }, + { + "epoch": 0.0013209312565358577, + "grad_norm": 5.3452582359313965, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.8321, + "step": 24 + }, + { + "epoch": 0.0013759700588915184, + "grad_norm": 8.744244575500488, + "learning_rate": 2.5e-06, + "loss": 1.6317, + "step": 25 + }, + { + "epoch": 0.0014310088612471794, + "grad_norm": 5.304683685302734, + "learning_rate": 2.6e-06, + "loss": 1.6846, + "step": 26 + }, + { + "epoch": 0.00148604766360284, + "grad_norm": 5.650127410888672, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7449, + "step": 27 + }, + { + "epoch": 0.0015410864659585008, + "grad_norm": 5.479269504547119, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8158, + "step": 28 + }, + { + "epoch": 0.0015961252683141616, + "grad_norm": 4.873537063598633, + "learning_rate": 2.9e-06, + "loss": 1.8015, + "step": 29 + }, + { + "epoch": 0.0016511640706698223, + "grad_norm": 4.971101760864258, + "learning_rate": 3e-06, + "loss": 1.9034, + "step": 30 + }, + { + "epoch": 0.001706202873025483, + "grad_norm": 4.407571315765381, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.9037, + "step": 31 + }, + { + "epoch": 0.0017612416753811438, + "grad_norm": 4.429073810577393, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6812, + "step": 32 + }, + { + "epoch": 0.0018162804777368045, + "grad_norm": 5.16085147857666, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7627, + "step": 33 + }, + { + "epoch": 0.0018713192800924653, + "grad_norm": 4.0805768966674805, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.6799, + "step": 34 + }, + { + "epoch": 0.001926358082448126, + "grad_norm": 4.548702239990234, + "learning_rate": 3.5e-06, + "loss": 1.7799, + "step": 35 + }, + { + "epoch": 0.0019813968848037865, + "grad_norm": 5.181888580322266, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.8235, + "step": 36 + }, + { + "epoch": 0.0020364356871594475, + "grad_norm": 3.9876129627227783, + "learning_rate": 3.7e-06, + "loss": 1.5999, + "step": 37 + }, + { + "epoch": 0.002091474489515108, + "grad_norm": 6.325051307678223, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.7499, + "step": 38 + }, + { + "epoch": 0.002146513291870769, + "grad_norm": 6.199049949645996, + "learning_rate": 3.900000000000001e-06, + "loss": 1.784, + "step": 39 + }, + { + "epoch": 0.0022015520942264295, + "grad_norm": 4.83912992477417, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8895, + "step": 40 + }, + { + "epoch": 0.0022565908965820904, + "grad_norm": 4.515626907348633, + "learning_rate": 4.1e-06, + "loss": 1.4887, + "step": 41 + }, + { + "epoch": 0.002311629698937751, + "grad_norm": 5.032265663146973, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.7324, + "step": 42 + }, + { + "epoch": 0.002366668501293412, + "grad_norm": 4.1879048347473145, + "learning_rate": 4.3e-06, + "loss": 1.4912, + "step": 43 + }, + { + "epoch": 0.0024217073036490724, + "grad_norm": 4.128026485443115, + "learning_rate": 4.4e-06, + "loss": 1.554, + "step": 44 + }, + { + "epoch": 0.0024767461060047334, + "grad_norm": 4.527958393096924, + "learning_rate": 4.5e-06, + "loss": 1.652, + "step": 45 + }, + { + "epoch": 0.002531784908360394, + "grad_norm": 4.8388190269470215, + "learning_rate": 4.600000000000001e-06, + "loss": 1.6696, + "step": 46 + }, + { + "epoch": 0.002586823710716055, + "grad_norm": 4.2088541984558105, + "learning_rate": 4.7e-06, + "loss": 1.568, + "step": 47 + }, + { + "epoch": 0.0026418625130717154, + "grad_norm": 4.789997577667236, + "learning_rate": 4.800000000000001e-06, + "loss": 1.642, + "step": 48 + }, + { + "epoch": 0.0026969013154273763, + "grad_norm": 4.408346652984619, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5181, + "step": 49 + }, + { + "epoch": 0.002751940117783037, + "grad_norm": 4.572340488433838, + "learning_rate": 5e-06, + "loss": 1.6698, + "step": 50 + }, + { + "epoch": 0.0028069789201386978, + "grad_norm": 4.728564739227295, + "learning_rate": 5.1e-06, + "loss": 1.5785, + "step": 51 + }, + { + "epoch": 0.0028620177224943587, + "grad_norm": 4.449855327606201, + "learning_rate": 5.2e-06, + "loss": 1.4624, + "step": 52 + }, + { + "epoch": 0.0029170565248500193, + "grad_norm": 4.127189636230469, + "learning_rate": 5.300000000000001e-06, + "loss": 1.6061, + "step": 53 + }, + { + "epoch": 0.00297209532720568, + "grad_norm": 4.244532108306885, + "learning_rate": 5.400000000000001e-06, + "loss": 1.491, + "step": 54 + }, + { + "epoch": 0.0030271341295613407, + "grad_norm": 3.437682628631592, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1967, + "step": 55 + }, + { + "epoch": 0.0030821729319170017, + "grad_norm": 3.83516788482666, + "learning_rate": 5.600000000000001e-06, + "loss": 1.4731, + "step": 56 + }, + { + "epoch": 0.003137211734272662, + "grad_norm": 3.9108972549438477, + "learning_rate": 5.7e-06, + "loss": 1.4393, + "step": 57 + }, + { + "epoch": 0.003192250536628323, + "grad_norm": 3.5258419513702393, + "learning_rate": 5.8e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.0032472893389839837, + "grad_norm": 4.124903678894043, + "learning_rate": 5.9e-06, + "loss": 1.4747, + "step": 59 + }, + { + "epoch": 0.0033023281413396446, + "grad_norm": 4.055769920349121, + "learning_rate": 6e-06, + "loss": 1.4655, + "step": 60 + }, + { + "epoch": 0.003357366943695305, + "grad_norm": 3.904837131500244, + "learning_rate": 6.1e-06, + "loss": 1.5125, + "step": 61 + }, + { + "epoch": 0.003412405746050966, + "grad_norm": 3.2904794216156006, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4596, + "step": 62 + }, + { + "epoch": 0.0034674445484066266, + "grad_norm": 3.24053692817688, + "learning_rate": 6.300000000000001e-06, + "loss": 1.3851, + "step": 63 + }, + { + "epoch": 0.0035224833507622876, + "grad_norm": 3.457639217376709, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4019, + "step": 64 + }, + { + "epoch": 0.003577522153117948, + "grad_norm": 3.073054790496826, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.2872, + "step": 65 + }, + { + "epoch": 0.003632560955473609, + "grad_norm": 2.6726694107055664, + "learning_rate": 6.600000000000001e-06, + "loss": 1.2361, + "step": 66 + }, + { + "epoch": 0.0036875997578292696, + "grad_norm": 2.9378459453582764, + "learning_rate": 6.700000000000001e-06, + "loss": 1.4452, + "step": 67 + }, + { + "epoch": 0.0037426385601849305, + "grad_norm": 2.81107234954834, + "learning_rate": 6.800000000000001e-06, + "loss": 1.4804, + "step": 68 + }, + { + "epoch": 0.003797677362540591, + "grad_norm": 2.60062313079834, + "learning_rate": 6.9e-06, + "loss": 1.3263, + "step": 69 + }, + { + "epoch": 0.003852716164896252, + "grad_norm": 2.5642921924591064, + "learning_rate": 7e-06, + "loss": 1.2751, + "step": 70 + }, + { + "epoch": 0.0039077549672519125, + "grad_norm": 2.3608031272888184, + "learning_rate": 7.100000000000001e-06, + "loss": 1.2614, + "step": 71 + }, + { + "epoch": 0.003962793769607573, + "grad_norm": 2.7201738357543945, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5018, + "step": 72 + }, + { + "epoch": 0.004017832571963234, + "grad_norm": 2.584726095199585, + "learning_rate": 7.3e-06, + "loss": 1.3519, + "step": 73 + }, + { + "epoch": 0.004072871374318895, + "grad_norm": 1.9693044424057007, + "learning_rate": 7.4e-06, + "loss": 1.0934, + "step": 74 + }, + { + "epoch": 0.0041279101766745555, + "grad_norm": 2.220736503601074, + "learning_rate": 7.500000000000001e-06, + "loss": 1.4687, + "step": 75 + }, + { + "epoch": 0.004182948979030216, + "grad_norm": 2.2629456520080566, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3328, + "step": 76 + }, + { + "epoch": 0.004237987781385877, + "grad_norm": 2.051820993423462, + "learning_rate": 7.7e-06, + "loss": 1.3058, + "step": 77 + }, + { + "epoch": 0.004293026583741538, + "grad_norm": 2.2451820373535156, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3556, + "step": 78 + }, + { + "epoch": 0.004348065386097198, + "grad_norm": 3.13584303855896, + "learning_rate": 7.9e-06, + "loss": 1.3262, + "step": 79 + }, + { + "epoch": 0.004403104188452859, + "grad_norm": 5.024479866027832, + "learning_rate": 8.000000000000001e-06, + "loss": 1.2103, + "step": 80 + }, + { + "epoch": 0.00445814299080852, + "grad_norm": 2.070889711380005, + "learning_rate": 8.1e-06, + "loss": 1.1994, + "step": 81 + }, + { + "epoch": 0.004513181793164181, + "grad_norm": 2.797286033630371, + "learning_rate": 8.2e-06, + "loss": 1.3075, + "step": 82 + }, + { + "epoch": 0.004568220595519841, + "grad_norm": 2.11370849609375, + "learning_rate": 8.3e-06, + "loss": 1.36, + "step": 83 + }, + { + "epoch": 0.004623259397875502, + "grad_norm": 2.5416152477264404, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3484, + "step": 84 + }, + { + "epoch": 0.004678298200231163, + "grad_norm": 2.4702343940734863, + "learning_rate": 8.5e-06, + "loss": 1.3677, + "step": 85 + }, + { + "epoch": 0.004733337002586824, + "grad_norm": 3.670365333557129, + "learning_rate": 8.6e-06, + "loss": 1.2192, + "step": 86 + }, + { + "epoch": 0.004788375804942484, + "grad_norm": 2.282954692840576, + "learning_rate": 8.700000000000001e-06, + "loss": 1.2982, + "step": 87 + }, + { + "epoch": 0.004843414607298145, + "grad_norm": 2.3659238815307617, + "learning_rate": 8.8e-06, + "loss": 1.3206, + "step": 88 + }, + { + "epoch": 0.004898453409653806, + "grad_norm": 4.939981460571289, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4328, + "step": 89 + }, + { + "epoch": 0.004953492212009467, + "grad_norm": 2.335858106613159, + "learning_rate": 9e-06, + "loss": 1.2603, + "step": 90 + }, + { + "epoch": 0.005008531014365127, + "grad_norm": 2.2165043354034424, + "learning_rate": 9.100000000000001e-06, + "loss": 1.3141, + "step": 91 + }, + { + "epoch": 0.005063569816720788, + "grad_norm": 2.7872185707092285, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3314, + "step": 92 + }, + { + "epoch": 0.005118608619076449, + "grad_norm": 2.6353912353515625, + "learning_rate": 9.3e-06, + "loss": 1.2027, + "step": 93 + }, + { + "epoch": 0.00517364742143211, + "grad_norm": 3.2509102821350098, + "learning_rate": 9.4e-06, + "loss": 1.2316, + "step": 94 + }, + { + "epoch": 0.00522868622378777, + "grad_norm": 2.4560611248016357, + "learning_rate": 9.5e-06, + "loss": 1.1848, + "step": 95 + }, + { + "epoch": 0.005283725026143431, + "grad_norm": 2.338151216506958, + "learning_rate": 9.600000000000001e-06, + "loss": 1.2392, + "step": 96 + }, + { + "epoch": 0.005338763828499092, + "grad_norm": 2.231065034866333, + "learning_rate": 9.7e-06, + "loss": 1.2089, + "step": 97 + }, + { + "epoch": 0.005393802630854753, + "grad_norm": 2.278428077697754, + "learning_rate": 9.800000000000001e-06, + "loss": 1.2267, + "step": 98 + }, + { + "epoch": 0.005448841433210413, + "grad_norm": 2.4422810077667236, + "learning_rate": 9.9e-06, + "loss": 1.2041, + "step": 99 + }, + { + "epoch": 0.005503880235566074, + "grad_norm": 2.216248035430908, + "learning_rate": 1e-05, + "loss": 1.0798, + "step": 100 + }, + { + "epoch": 0.005558919037921735, + "grad_norm": 2.3301615715026855, + "learning_rate": 9.99999998121067e-06, + "loss": 1.3069, + "step": 101 + }, + { + "epoch": 0.0056139578402773956, + "grad_norm": 2.315436363220215, + "learning_rate": 9.999999924842678e-06, + "loss": 1.1589, + "step": 102 + }, + { + "epoch": 0.005668996642633056, + "grad_norm": 2.3522140979766846, + "learning_rate": 9.999999830896024e-06, + "loss": 1.0978, + "step": 103 + }, + { + "epoch": 0.0057240354449887175, + "grad_norm": 2.5798308849334717, + "learning_rate": 9.99999969937071e-06, + "loss": 1.0599, + "step": 104 + }, + { + "epoch": 0.005779074247344378, + "grad_norm": 2.456644058227539, + "learning_rate": 9.999999530266738e-06, + "loss": 1.1682, + "step": 105 + }, + { + "epoch": 0.0058341130497000385, + "grad_norm": 2.1559031009674072, + "learning_rate": 9.999999323584106e-06, + "loss": 1.0631, + "step": 106 + }, + { + "epoch": 0.005889151852055699, + "grad_norm": 2.2985048294067383, + "learning_rate": 9.99999907932282e-06, + "loss": 1.1455, + "step": 107 + }, + { + "epoch": 0.00594419065441136, + "grad_norm": 2.596167802810669, + "learning_rate": 9.999998797482877e-06, + "loss": 1.1686, + "step": 108 + }, + { + "epoch": 0.005999229456767021, + "grad_norm": 2.378618001937866, + "learning_rate": 9.999998478064283e-06, + "loss": 1.2226, + "step": 109 + }, + { + "epoch": 0.0060542682591226814, + "grad_norm": 2.228116750717163, + "learning_rate": 9.999998121067038e-06, + "loss": 1.1396, + "step": 110 + }, + { + "epoch": 0.006109307061478342, + "grad_norm": 2.4419472217559814, + "learning_rate": 9.999997726491146e-06, + "loss": 1.1401, + "step": 111 + }, + { + "epoch": 0.006164345863834003, + "grad_norm": 2.0695526599884033, + "learning_rate": 9.999997294336608e-06, + "loss": 1.1868, + "step": 112 + }, + { + "epoch": 0.006219384666189664, + "grad_norm": 2.3170363903045654, + "learning_rate": 9.99999682460343e-06, + "loss": 1.1172, + "step": 113 + }, + { + "epoch": 0.006274423468545324, + "grad_norm": 2.670466184616089, + "learning_rate": 9.999996317291615e-06, + "loss": 1.2481, + "step": 114 + }, + { + "epoch": 0.006329462270900985, + "grad_norm": 2.1214540004730225, + "learning_rate": 9.999995772401166e-06, + "loss": 0.9994, + "step": 115 + }, + { + "epoch": 0.006384501073256646, + "grad_norm": 1.9283969402313232, + "learning_rate": 9.999995189932085e-06, + "loss": 1.0692, + "step": 116 + }, + { + "epoch": 0.006439539875612307, + "grad_norm": 2.2620882987976074, + "learning_rate": 9.99999456988438e-06, + "loss": 1.0725, + "step": 117 + }, + { + "epoch": 0.006494578677967967, + "grad_norm": 2.2121341228485107, + "learning_rate": 9.999993912258055e-06, + "loss": 1.1328, + "step": 118 + }, + { + "epoch": 0.006549617480323628, + "grad_norm": 2.298126220703125, + "learning_rate": 9.999993217053113e-06, + "loss": 1.1272, + "step": 119 + }, + { + "epoch": 0.006604656282679289, + "grad_norm": 1.81593656539917, + "learning_rate": 9.99999248426956e-06, + "loss": 1.017, + "step": 120 + }, + { + "epoch": 0.00665969508503495, + "grad_norm": 2.1174378395080566, + "learning_rate": 9.999991713907403e-06, + "loss": 1.0557, + "step": 121 + }, + { + "epoch": 0.00671473388739061, + "grad_norm": 1.9061017036437988, + "learning_rate": 9.999990905966647e-06, + "loss": 1.0379, + "step": 122 + }, + { + "epoch": 0.006769772689746271, + "grad_norm": 1.912500023841858, + "learning_rate": 9.999990060447297e-06, + "loss": 1.104, + "step": 123 + }, + { + "epoch": 0.006824811492101932, + "grad_norm": 1.9249529838562012, + "learning_rate": 9.99998917734936e-06, + "loss": 1.0136, + "step": 124 + }, + { + "epoch": 0.006879850294457593, + "grad_norm": 1.8504948616027832, + "learning_rate": 9.999988256672843e-06, + "loss": 0.99, + "step": 125 + }, + { + "epoch": 0.006934889096813253, + "grad_norm": 1.720042109489441, + "learning_rate": 9.999987298417753e-06, + "loss": 1.0666, + "step": 126 + }, + { + "epoch": 0.006989927899168914, + "grad_norm": 1.778251051902771, + "learning_rate": 9.999986302584097e-06, + "loss": 1.0424, + "step": 127 + }, + { + "epoch": 0.007044966701524575, + "grad_norm": 1.9485961198806763, + "learning_rate": 9.999985269171881e-06, + "loss": 1.105, + "step": 128 + }, + { + "epoch": 0.007100005503880236, + "grad_norm": 3.0802104473114014, + "learning_rate": 9.999984198181114e-06, + "loss": 1.1081, + "step": 129 + }, + { + "epoch": 0.007155044306235896, + "grad_norm": 1.7476954460144043, + "learning_rate": 9.999983089611806e-06, + "loss": 0.9677, + "step": 130 + }, + { + "epoch": 0.007210083108591557, + "grad_norm": 1.6127299070358276, + "learning_rate": 9.999981943463963e-06, + "loss": 0.9937, + "step": 131 + }, + { + "epoch": 0.007265121910947218, + "grad_norm": 2.1477208137512207, + "learning_rate": 9.999980759737594e-06, + "loss": 1.0319, + "step": 132 + }, + { + "epoch": 0.007320160713302879, + "grad_norm": 1.531163215637207, + "learning_rate": 9.999979538432707e-06, + "loss": 0.8696, + "step": 133 + }, + { + "epoch": 0.007375199515658539, + "grad_norm": 1.8226820230484009, + "learning_rate": 9.999978279549313e-06, + "loss": 1.2061, + "step": 134 + }, + { + "epoch": 0.0074302383180142, + "grad_norm": 1.481895923614502, + "learning_rate": 9.99997698308742e-06, + "loss": 0.949, + "step": 135 + }, + { + "epoch": 0.007485277120369861, + "grad_norm": 1.6715927124023438, + "learning_rate": 9.99997564904704e-06, + "loss": 1.1579, + "step": 136 + }, + { + "epoch": 0.0075403159227255215, + "grad_norm": 1.4235272407531738, + "learning_rate": 9.999974277428179e-06, + "loss": 1.064, + "step": 137 + }, + { + "epoch": 0.007595354725081182, + "grad_norm": 1.3524872064590454, + "learning_rate": 9.999972868230852e-06, + "loss": 0.9141, + "step": 138 + }, + { + "epoch": 0.007650393527436843, + "grad_norm": 1.3741765022277832, + "learning_rate": 9.999971421455066e-06, + "loss": 1.0256, + "step": 139 + }, + { + "epoch": 0.007705432329792504, + "grad_norm": 1.9869598150253296, + "learning_rate": 9.999969937100835e-06, + "loss": 0.9489, + "step": 140 + }, + { + "epoch": 0.0077604711321481645, + "grad_norm": 1.4785465002059937, + "learning_rate": 9.999968415168166e-06, + "loss": 0.9243, + "step": 141 + }, + { + "epoch": 0.007815509934503825, + "grad_norm": 1.5476176738739014, + "learning_rate": 9.999966855657074e-06, + "loss": 1.178, + "step": 142 + }, + { + "epoch": 0.007870548736859486, + "grad_norm": 1.500401258468628, + "learning_rate": 9.99996525856757e-06, + "loss": 0.9837, + "step": 143 + }, + { + "epoch": 0.007925587539215146, + "grad_norm": 1.3777157068252563, + "learning_rate": 9.999963623899664e-06, + "loss": 1.0732, + "step": 144 + }, + { + "epoch": 0.007980626341570807, + "grad_norm": 1.4466841220855713, + "learning_rate": 9.99996195165337e-06, + "loss": 0.9779, + "step": 145 + }, + { + "epoch": 0.008035665143926469, + "grad_norm": 1.5304051637649536, + "learning_rate": 9.9999602418287e-06, + "loss": 1.196, + "step": 146 + }, + { + "epoch": 0.008090703946282128, + "grad_norm": 1.9012362957000732, + "learning_rate": 9.99995849442567e-06, + "loss": 0.9797, + "step": 147 + }, + { + "epoch": 0.00814574274863779, + "grad_norm": 1.430679202079773, + "learning_rate": 9.999956709444289e-06, + "loss": 0.9869, + "step": 148 + }, + { + "epoch": 0.00820078155099345, + "grad_norm": 1.3489817380905151, + "learning_rate": 9.99995488688457e-06, + "loss": 1.0137, + "step": 149 + }, + { + "epoch": 0.008255820353349111, + "grad_norm": 1.1878125667572021, + "learning_rate": 9.999953026746531e-06, + "loss": 0.9355, + "step": 150 + }, + { + "epoch": 0.008310859155704772, + "grad_norm": 1.3481942415237427, + "learning_rate": 9.999951129030182e-06, + "loss": 1.1235, + "step": 151 + }, + { + "epoch": 0.008365897958060432, + "grad_norm": 1.7335314750671387, + "learning_rate": 9.999949193735539e-06, + "loss": 0.9382, + "step": 152 + }, + { + "epoch": 0.008420936760416093, + "grad_norm": 1.2029480934143066, + "learning_rate": 9.999947220862615e-06, + "loss": 0.9419, + "step": 153 + }, + { + "epoch": 0.008475975562771755, + "grad_norm": 1.2104203701019287, + "learning_rate": 9.999945210411428e-06, + "loss": 0.9196, + "step": 154 + }, + { + "epoch": 0.008531014365127414, + "grad_norm": 1.1857126951217651, + "learning_rate": 9.999943162381991e-06, + "loss": 0.9421, + "step": 155 + }, + { + "epoch": 0.008586053167483076, + "grad_norm": 1.115027904510498, + "learning_rate": 9.999941076774319e-06, + "loss": 0.9634, + "step": 156 + }, + { + "epoch": 0.008641091969838737, + "grad_norm": 1.4227553606033325, + "learning_rate": 9.999938953588428e-06, + "loss": 1.0036, + "step": 157 + }, + { + "epoch": 0.008696130772194397, + "grad_norm": 1.2913776636123657, + "learning_rate": 9.999936792824334e-06, + "loss": 0.9232, + "step": 158 + }, + { + "epoch": 0.008751169574550058, + "grad_norm": 1.2817318439483643, + "learning_rate": 9.999934594482055e-06, + "loss": 0.9691, + "step": 159 + }, + { + "epoch": 0.008806208376905718, + "grad_norm": 1.5647841691970825, + "learning_rate": 9.999932358561604e-06, + "loss": 1.1842, + "step": 160 + }, + { + "epoch": 0.00886124717926138, + "grad_norm": 1.368135929107666, + "learning_rate": 9.999930085063002e-06, + "loss": 1.0873, + "step": 161 + }, + { + "epoch": 0.00891628598161704, + "grad_norm": 1.2297240495681763, + "learning_rate": 9.999927773986262e-06, + "loss": 1.0778, + "step": 162 + }, + { + "epoch": 0.0089713247839727, + "grad_norm": 1.0658279657363892, + "learning_rate": 9.999925425331405e-06, + "loss": 0.9008, + "step": 163 + }, + { + "epoch": 0.009026363586328362, + "grad_norm": 1.3484326601028442, + "learning_rate": 9.999923039098445e-06, + "loss": 1.0664, + "step": 164 + }, + { + "epoch": 0.009081402388684023, + "grad_norm": 1.1839075088500977, + "learning_rate": 9.999920615287401e-06, + "loss": 0.9257, + "step": 165 + }, + { + "epoch": 0.009136441191039683, + "grad_norm": 1.2757254838943481, + "learning_rate": 9.999918153898295e-06, + "loss": 0.9473, + "step": 166 + }, + { + "epoch": 0.009191479993395344, + "grad_norm": 1.2414579391479492, + "learning_rate": 9.99991565493114e-06, + "loss": 1.1091, + "step": 167 + }, + { + "epoch": 0.009246518795751004, + "grad_norm": 1.2802611589431763, + "learning_rate": 9.999913118385959e-06, + "loss": 1.063, + "step": 168 + }, + { + "epoch": 0.009301557598106665, + "grad_norm": 1.2055327892303467, + "learning_rate": 9.99991054426277e-06, + "loss": 0.8, + "step": 169 + }, + { + "epoch": 0.009356596400462327, + "grad_norm": 1.0391098260879517, + "learning_rate": 9.99990793256159e-06, + "loss": 0.8672, + "step": 170 + }, + { + "epoch": 0.009411635202817986, + "grad_norm": 1.131536602973938, + "learning_rate": 9.99990528328244e-06, + "loss": 0.9569, + "step": 171 + }, + { + "epoch": 0.009466674005173648, + "grad_norm": 1.164307951927185, + "learning_rate": 9.999902596425342e-06, + "loss": 0.9999, + "step": 172 + }, + { + "epoch": 0.009521712807529309, + "grad_norm": 1.2099504470825195, + "learning_rate": 9.999899871990313e-06, + "loss": 0.9994, + "step": 173 + }, + { + "epoch": 0.009576751609884969, + "grad_norm": 1.7294539213180542, + "learning_rate": 9.999897109977376e-06, + "loss": 1.0265, + "step": 174 + }, + { + "epoch": 0.00963179041224063, + "grad_norm": 1.3009883165359497, + "learning_rate": 9.99989431038655e-06, + "loss": 0.9022, + "step": 175 + }, + { + "epoch": 0.00968682921459629, + "grad_norm": 1.1014611721038818, + "learning_rate": 9.999891473217857e-06, + "loss": 0.8476, + "step": 176 + }, + { + "epoch": 0.009741868016951951, + "grad_norm": 1.2410900592803955, + "learning_rate": 9.99988859847132e-06, + "loss": 1.0272, + "step": 177 + }, + { + "epoch": 0.009796906819307612, + "grad_norm": 1.336348295211792, + "learning_rate": 9.999885686146957e-06, + "loss": 0.9456, + "step": 178 + }, + { + "epoch": 0.009851945621663272, + "grad_norm": 1.2931095361709595, + "learning_rate": 9.99988273624479e-06, + "loss": 0.9554, + "step": 179 + }, + { + "epoch": 0.009906984424018933, + "grad_norm": 1.2647838592529297, + "learning_rate": 9.999879748764845e-06, + "loss": 1.0394, + "step": 180 + }, + { + "epoch": 0.009962023226374595, + "grad_norm": 1.3485127687454224, + "learning_rate": 9.99987672370714e-06, + "loss": 1.1016, + "step": 181 + }, + { + "epoch": 0.010017062028730254, + "grad_norm": 1.110187292098999, + "learning_rate": 9.999873661071702e-06, + "loss": 0.946, + "step": 182 + }, + { + "epoch": 0.010072100831085916, + "grad_norm": 1.0991623401641846, + "learning_rate": 9.999870560858551e-06, + "loss": 1.0084, + "step": 183 + }, + { + "epoch": 0.010127139633441576, + "grad_norm": 1.049804449081421, + "learning_rate": 9.999867423067713e-06, + "loss": 0.8264, + "step": 184 + }, + { + "epoch": 0.010182178435797237, + "grad_norm": 1.0947058200836182, + "learning_rate": 9.999864247699207e-06, + "loss": 0.8884, + "step": 185 + }, + { + "epoch": 0.010237217238152898, + "grad_norm": 1.1147902011871338, + "learning_rate": 9.999861034753061e-06, + "loss": 0.9657, + "step": 186 + }, + { + "epoch": 0.010292256040508558, + "grad_norm": 1.260027527809143, + "learning_rate": 9.999857784229298e-06, + "loss": 1.0102, + "step": 187 + }, + { + "epoch": 0.01034729484286422, + "grad_norm": 1.1275582313537598, + "learning_rate": 9.999854496127942e-06, + "loss": 1.028, + "step": 188 + }, + { + "epoch": 0.01040233364521988, + "grad_norm": 1.1377174854278564, + "learning_rate": 9.999851170449018e-06, + "loss": 1.032, + "step": 189 + }, + { + "epoch": 0.01045737244757554, + "grad_norm": 1.1734225749969482, + "learning_rate": 9.999847807192552e-06, + "loss": 1.0009, + "step": 190 + }, + { + "epoch": 0.010512411249931202, + "grad_norm": 1.1934596300125122, + "learning_rate": 9.999844406358565e-06, + "loss": 1.0432, + "step": 191 + }, + { + "epoch": 0.010567450052286861, + "grad_norm": 1.0638024806976318, + "learning_rate": 9.99984096794709e-06, + "loss": 0.8651, + "step": 192 + }, + { + "epoch": 0.010622488854642523, + "grad_norm": 1.2381829023361206, + "learning_rate": 9.999837491958147e-06, + "loss": 1.0088, + "step": 193 + }, + { + "epoch": 0.010677527656998184, + "grad_norm": 1.030246615409851, + "learning_rate": 9.999833978391763e-06, + "loss": 0.9488, + "step": 194 + }, + { + "epoch": 0.010732566459353844, + "grad_norm": 1.1640657186508179, + "learning_rate": 9.999830427247965e-06, + "loss": 1.0588, + "step": 195 + }, + { + "epoch": 0.010787605261709505, + "grad_norm": 1.0431616306304932, + "learning_rate": 9.99982683852678e-06, + "loss": 0.8728, + "step": 196 + }, + { + "epoch": 0.010842644064065167, + "grad_norm": 1.032263159751892, + "learning_rate": 9.999823212228235e-06, + "loss": 0.9498, + "step": 197 + }, + { + "epoch": 0.010897682866420826, + "grad_norm": 1.1383745670318604, + "learning_rate": 9.999819548352358e-06, + "loss": 0.9498, + "step": 198 + }, + { + "epoch": 0.010952721668776488, + "grad_norm": 1.1324639320373535, + "learning_rate": 9.999815846899175e-06, + "loss": 1.0432, + "step": 199 + }, + { + "epoch": 0.011007760471132147, + "grad_norm": 1.188672661781311, + "learning_rate": 9.999812107868714e-06, + "loss": 0.982, + "step": 200 + }, + { + "epoch": 0.011062799273487809, + "grad_norm": 1.1011098623275757, + "learning_rate": 9.999808331261005e-06, + "loss": 0.9587, + "step": 201 + }, + { + "epoch": 0.01111783807584347, + "grad_norm": 1.1782938241958618, + "learning_rate": 9.999804517076073e-06, + "loss": 1.0659, + "step": 202 + }, + { + "epoch": 0.01117287687819913, + "grad_norm": 1.0520117282867432, + "learning_rate": 9.99980066531395e-06, + "loss": 1.0056, + "step": 203 + }, + { + "epoch": 0.011227915680554791, + "grad_norm": 1.1584919691085815, + "learning_rate": 9.999796775974663e-06, + "loss": 0.9435, + "step": 204 + }, + { + "epoch": 0.011282954482910452, + "grad_norm": 1.2201849222183228, + "learning_rate": 9.999792849058242e-06, + "loss": 1.0562, + "step": 205 + }, + { + "epoch": 0.011337993285266112, + "grad_norm": 1.2985976934432983, + "learning_rate": 9.999788884564715e-06, + "loss": 1.0126, + "step": 206 + }, + { + "epoch": 0.011393032087621774, + "grad_norm": 0.9926307201385498, + "learning_rate": 9.999784882494115e-06, + "loss": 0.7875, + "step": 207 + }, + { + "epoch": 0.011448070889977435, + "grad_norm": 1.103365182876587, + "learning_rate": 9.99978084284647e-06, + "loss": 0.9833, + "step": 208 + }, + { + "epoch": 0.011503109692333095, + "grad_norm": 1.1798462867736816, + "learning_rate": 9.99977676562181e-06, + "loss": 0.8479, + "step": 209 + }, + { + "epoch": 0.011558148494688756, + "grad_norm": 1.2887194156646729, + "learning_rate": 9.999772650820168e-06, + "loss": 0.9606, + "step": 210 + }, + { + "epoch": 0.011613187297044416, + "grad_norm": 1.1120634078979492, + "learning_rate": 9.99976849844157e-06, + "loss": 0.9604, + "step": 211 + }, + { + "epoch": 0.011668226099400077, + "grad_norm": 1.1248979568481445, + "learning_rate": 9.999764308486052e-06, + "loss": 0.9428, + "step": 212 + }, + { + "epoch": 0.011723264901755738, + "grad_norm": 1.274610161781311, + "learning_rate": 9.999760080953643e-06, + "loss": 0.9044, + "step": 213 + }, + { + "epoch": 0.011778303704111398, + "grad_norm": 1.1746865510940552, + "learning_rate": 9.999755815844377e-06, + "loss": 0.9114, + "step": 214 + }, + { + "epoch": 0.01183334250646706, + "grad_norm": 1.2531086206436157, + "learning_rate": 9.999751513158282e-06, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.01188838130882272, + "grad_norm": 1.0789539813995361, + "learning_rate": 9.999747172895395e-06, + "loss": 0.9794, + "step": 216 + }, + { + "epoch": 0.01194342011117838, + "grad_norm": 1.1805329322814941, + "learning_rate": 9.999742795055746e-06, + "loss": 0.9602, + "step": 217 + }, + { + "epoch": 0.011998458913534042, + "grad_norm": 2.309329032897949, + "learning_rate": 9.99973837963937e-06, + "loss": 0.9482, + "step": 218 + }, + { + "epoch": 0.012053497715889702, + "grad_norm": 1.2379088401794434, + "learning_rate": 9.999733926646296e-06, + "loss": 1.0237, + "step": 219 + }, + { + "epoch": 0.012108536518245363, + "grad_norm": 1.1581377983093262, + "learning_rate": 9.999729436076562e-06, + "loss": 1.0583, + "step": 220 + }, + { + "epoch": 0.012163575320601024, + "grad_norm": 1.3006727695465088, + "learning_rate": 9.999724907930199e-06, + "loss": 0.9581, + "step": 221 + }, + { + "epoch": 0.012218614122956684, + "grad_norm": 1.3215982913970947, + "learning_rate": 9.999720342207243e-06, + "loss": 0.9438, + "step": 222 + }, + { + "epoch": 0.012273652925312345, + "grad_norm": 1.1107337474822998, + "learning_rate": 9.999715738907727e-06, + "loss": 0.9987, + "step": 223 + }, + { + "epoch": 0.012328691727668007, + "grad_norm": 1.0745457410812378, + "learning_rate": 9.999711098031685e-06, + "loss": 0.9637, + "step": 224 + }, + { + "epoch": 0.012383730530023666, + "grad_norm": 1.110861897468567, + "learning_rate": 9.999706419579154e-06, + "loss": 1.0225, + "step": 225 + }, + { + "epoch": 0.012438769332379328, + "grad_norm": 1.0755527019500732, + "learning_rate": 9.999701703550167e-06, + "loss": 1.0204, + "step": 226 + }, + { + "epoch": 0.012493808134734987, + "grad_norm": 1.1694976091384888, + "learning_rate": 9.99969694994476e-06, + "loss": 1.0566, + "step": 227 + }, + { + "epoch": 0.012548846937090649, + "grad_norm": 1.455856442451477, + "learning_rate": 9.99969215876297e-06, + "loss": 0.9397, + "step": 228 + }, + { + "epoch": 0.01260388573944631, + "grad_norm": 1.0707073211669922, + "learning_rate": 9.99968733000483e-06, + "loss": 0.8286, + "step": 229 + }, + { + "epoch": 0.01265892454180197, + "grad_norm": 1.189548134803772, + "learning_rate": 9.99968246367038e-06, + "loss": 0.8762, + "step": 230 + }, + { + "epoch": 0.012713963344157631, + "grad_norm": 1.1439214944839478, + "learning_rate": 9.999677559759655e-06, + "loss": 0.9187, + "step": 231 + }, + { + "epoch": 0.012769002146513293, + "grad_norm": 1.2329761981964111, + "learning_rate": 9.999672618272691e-06, + "loss": 1.0374, + "step": 232 + }, + { + "epoch": 0.012824040948868952, + "grad_norm": 1.1545134782791138, + "learning_rate": 9.999667639209527e-06, + "loss": 0.9343, + "step": 233 + }, + { + "epoch": 0.012879079751224614, + "grad_norm": 1.0946775674819946, + "learning_rate": 9.999662622570198e-06, + "loss": 0.9568, + "step": 234 + }, + { + "epoch": 0.012934118553580273, + "grad_norm": 1.2099589109420776, + "learning_rate": 9.999657568354743e-06, + "loss": 1.0364, + "step": 235 + }, + { + "epoch": 0.012989157355935935, + "grad_norm": 1.09062922000885, + "learning_rate": 9.999652476563202e-06, + "loss": 1.0289, + "step": 236 + }, + { + "epoch": 0.013044196158291596, + "grad_norm": 1.154557228088379, + "learning_rate": 9.999647347195612e-06, + "loss": 0.9925, + "step": 237 + }, + { + "epoch": 0.013099234960647256, + "grad_norm": 1.025374174118042, + "learning_rate": 9.999642180252008e-06, + "loss": 0.9346, + "step": 238 + }, + { + "epoch": 0.013154273763002917, + "grad_norm": 1.1473641395568848, + "learning_rate": 9.999636975732433e-06, + "loss": 1.0244, + "step": 239 + }, + { + "epoch": 0.013209312565358578, + "grad_norm": 1.0421240329742432, + "learning_rate": 9.999631733636923e-06, + "loss": 0.9368, + "step": 240 + }, + { + "epoch": 0.013264351367714238, + "grad_norm": 1.1076610088348389, + "learning_rate": 9.99962645396552e-06, + "loss": 1.0276, + "step": 241 + }, + { + "epoch": 0.0133193901700699, + "grad_norm": 1.143559455871582, + "learning_rate": 9.999621136718266e-06, + "loss": 0.9626, + "step": 242 + }, + { + "epoch": 0.01337442897242556, + "grad_norm": 1.0958378314971924, + "learning_rate": 9.999615781895195e-06, + "loss": 1.0254, + "step": 243 + }, + { + "epoch": 0.01342946777478122, + "grad_norm": 1.117688536643982, + "learning_rate": 9.99961038949635e-06, + "loss": 0.9685, + "step": 244 + }, + { + "epoch": 0.013484506577136882, + "grad_norm": 1.1645647287368774, + "learning_rate": 9.999604959521771e-06, + "loss": 1.0666, + "step": 245 + }, + { + "epoch": 0.013539545379492542, + "grad_norm": 1.1238516569137573, + "learning_rate": 9.999599491971502e-06, + "loss": 1.0252, + "step": 246 + }, + { + "epoch": 0.013594584181848203, + "grad_norm": 1.0196914672851562, + "learning_rate": 9.999593986845579e-06, + "loss": 0.9389, + "step": 247 + }, + { + "epoch": 0.013649622984203864, + "grad_norm": 1.0231372117996216, + "learning_rate": 9.999588444144049e-06, + "loss": 0.8786, + "step": 248 + }, + { + "epoch": 0.013704661786559524, + "grad_norm": 1.2504147291183472, + "learning_rate": 9.999582863866947e-06, + "loss": 1.0969, + "step": 249 + }, + { + "epoch": 0.013759700588915185, + "grad_norm": 1.1123549938201904, + "learning_rate": 9.99957724601432e-06, + "loss": 0.8833, + "step": 250 + }, + { + "epoch": 0.013814739391270847, + "grad_norm": 1.1068202257156372, + "learning_rate": 9.999571590586208e-06, + "loss": 0.9709, + "step": 251 + }, + { + "epoch": 0.013869778193626506, + "grad_norm": 0.9891651272773743, + "learning_rate": 9.999565897582655e-06, + "loss": 0.8598, + "step": 252 + }, + { + "epoch": 0.013924816995982168, + "grad_norm": 0.9866491556167603, + "learning_rate": 9.999560167003703e-06, + "loss": 0.8101, + "step": 253 + }, + { + "epoch": 0.013979855798337828, + "grad_norm": 1.0862594842910767, + "learning_rate": 9.999554398849396e-06, + "loss": 0.9411, + "step": 254 + }, + { + "epoch": 0.014034894600693489, + "grad_norm": 1.1898949146270752, + "learning_rate": 9.999548593119774e-06, + "loss": 0.9548, + "step": 255 + }, + { + "epoch": 0.01408993340304915, + "grad_norm": 1.2167880535125732, + "learning_rate": 9.999542749814886e-06, + "loss": 1.0302, + "step": 256 + }, + { + "epoch": 0.01414497220540481, + "grad_norm": 1.0784146785736084, + "learning_rate": 9.999536868934771e-06, + "loss": 0.8875, + "step": 257 + }, + { + "epoch": 0.014200011007760471, + "grad_norm": 1.1128027439117432, + "learning_rate": 9.999530950479475e-06, + "loss": 0.9498, + "step": 258 + }, + { + "epoch": 0.014255049810116133, + "grad_norm": 1.1311595439910889, + "learning_rate": 9.999524994449044e-06, + "loss": 0.9035, + "step": 259 + }, + { + "epoch": 0.014310088612471792, + "grad_norm": 1.225615382194519, + "learning_rate": 9.999519000843521e-06, + "loss": 1.0104, + "step": 260 + }, + { + "epoch": 0.014365127414827454, + "grad_norm": 1.2347793579101562, + "learning_rate": 9.99951296966295e-06, + "loss": 1.0288, + "step": 261 + }, + { + "epoch": 0.014420166217183113, + "grad_norm": 1.1837103366851807, + "learning_rate": 9.99950690090738e-06, + "loss": 0.9553, + "step": 262 + }, + { + "epoch": 0.014475205019538775, + "grad_norm": 1.1985397338867188, + "learning_rate": 9.999500794576852e-06, + "loss": 0.9561, + "step": 263 + }, + { + "epoch": 0.014530243821894436, + "grad_norm": 1.036928415298462, + "learning_rate": 9.999494650671418e-06, + "loss": 0.8906, + "step": 264 + }, + { + "epoch": 0.014585282624250096, + "grad_norm": 1.0797842741012573, + "learning_rate": 9.999488469191116e-06, + "loss": 0.8975, + "step": 265 + }, + { + "epoch": 0.014640321426605757, + "grad_norm": 1.0571156740188599, + "learning_rate": 9.999482250136e-06, + "loss": 0.9334, + "step": 266 + }, + { + "epoch": 0.014695360228961419, + "grad_norm": 1.2065023183822632, + "learning_rate": 9.999475993506114e-06, + "loss": 0.8986, + "step": 267 + }, + { + "epoch": 0.014750399031317078, + "grad_norm": 1.201586127281189, + "learning_rate": 9.999469699301502e-06, + "loss": 0.9192, + "step": 268 + }, + { + "epoch": 0.01480543783367274, + "grad_norm": 1.0470168590545654, + "learning_rate": 9.999463367522216e-06, + "loss": 0.8604, + "step": 269 + }, + { + "epoch": 0.0148604766360284, + "grad_norm": 1.1142147779464722, + "learning_rate": 9.9994569981683e-06, + "loss": 0.9847, + "step": 270 + }, + { + "epoch": 0.01491551543838406, + "grad_norm": 1.0352061986923218, + "learning_rate": 9.999450591239805e-06, + "loss": 0.8927, + "step": 271 + }, + { + "epoch": 0.014970554240739722, + "grad_norm": 1.0353184938430786, + "learning_rate": 9.999444146736779e-06, + "loss": 0.8435, + "step": 272 + }, + { + "epoch": 0.015025593043095382, + "grad_norm": 1.2091951370239258, + "learning_rate": 9.999437664659267e-06, + "loss": 0.8959, + "step": 273 + }, + { + "epoch": 0.015080631845451043, + "grad_norm": 1.006361722946167, + "learning_rate": 9.999431145007319e-06, + "loss": 0.8579, + "step": 274 + }, + { + "epoch": 0.015135670647806704, + "grad_norm": 1.1265509128570557, + "learning_rate": 9.999424587780985e-06, + "loss": 0.8808, + "step": 275 + }, + { + "epoch": 0.015190709450162364, + "grad_norm": 1.060882568359375, + "learning_rate": 9.999417992980317e-06, + "loss": 1.044, + "step": 276 + }, + { + "epoch": 0.015245748252518026, + "grad_norm": 1.0216747522354126, + "learning_rate": 9.999411360605358e-06, + "loss": 0.7773, + "step": 277 + }, + { + "epoch": 0.015300787054873685, + "grad_norm": 1.1382462978363037, + "learning_rate": 9.999404690656163e-06, + "loss": 0.8954, + "step": 278 + }, + { + "epoch": 0.015355825857229347, + "grad_norm": 1.113815188407898, + "learning_rate": 9.99939798313278e-06, + "loss": 0.8143, + "step": 279 + }, + { + "epoch": 0.015410864659585008, + "grad_norm": 1.123530387878418, + "learning_rate": 9.99939123803526e-06, + "loss": 0.8872, + "step": 280 + }, + { + "epoch": 0.015465903461940668, + "grad_norm": 1.0873669385910034, + "learning_rate": 9.999384455363656e-06, + "loss": 1.008, + "step": 281 + }, + { + "epoch": 0.015520942264296329, + "grad_norm": 1.5956637859344482, + "learning_rate": 9.999377635118014e-06, + "loss": 0.9456, + "step": 282 + }, + { + "epoch": 0.01557598106665199, + "grad_norm": 1.1471425294876099, + "learning_rate": 9.999370777298389e-06, + "loss": 0.9897, + "step": 283 + }, + { + "epoch": 0.01563101986900765, + "grad_norm": 0.9960193634033203, + "learning_rate": 9.999363881904831e-06, + "loss": 0.8196, + "step": 284 + }, + { + "epoch": 0.01568605867136331, + "grad_norm": 1.1033951044082642, + "learning_rate": 9.999356948937393e-06, + "loss": 0.879, + "step": 285 + }, + { + "epoch": 0.015741097473718973, + "grad_norm": 1.157765507698059, + "learning_rate": 9.999349978396126e-06, + "loss": 1.0116, + "step": 286 + }, + { + "epoch": 0.015796136276074634, + "grad_norm": 1.0472352504730225, + "learning_rate": 9.999342970281084e-06, + "loss": 0.8657, + "step": 287 + }, + { + "epoch": 0.015851175078430292, + "grad_norm": 1.1346659660339355, + "learning_rate": 9.999335924592315e-06, + "loss": 0.8482, + "step": 288 + }, + { + "epoch": 0.015906213880785953, + "grad_norm": 1.1164487600326538, + "learning_rate": 9.999328841329879e-06, + "loss": 1.0542, + "step": 289 + }, + { + "epoch": 0.015961252683141615, + "grad_norm": 1.1890591382980347, + "learning_rate": 9.999321720493825e-06, + "loss": 0.9598, + "step": 290 + }, + { + "epoch": 0.016016291485497276, + "grad_norm": 1.0419867038726807, + "learning_rate": 9.999314562084205e-06, + "loss": 0.9548, + "step": 291 + }, + { + "epoch": 0.016071330287852938, + "grad_norm": 1.0652042627334595, + "learning_rate": 9.999307366101077e-06, + "loss": 0.9359, + "step": 292 + }, + { + "epoch": 0.016126369090208596, + "grad_norm": 1.0166404247283936, + "learning_rate": 9.999300132544492e-06, + "loss": 0.9276, + "step": 293 + }, + { + "epoch": 0.016181407892564257, + "grad_norm": 1.1638866662979126, + "learning_rate": 9.999292861414507e-06, + "loss": 0.957, + "step": 294 + }, + { + "epoch": 0.01623644669491992, + "grad_norm": 1.5505993366241455, + "learning_rate": 9.999285552711173e-06, + "loss": 0.9878, + "step": 295 + }, + { + "epoch": 0.01629148549727558, + "grad_norm": 1.177262783050537, + "learning_rate": 9.999278206434549e-06, + "loss": 0.8631, + "step": 296 + }, + { + "epoch": 0.01634652429963124, + "grad_norm": 1.8578168153762817, + "learning_rate": 9.999270822584687e-06, + "loss": 0.9684, + "step": 297 + }, + { + "epoch": 0.0164015631019869, + "grad_norm": 1.2617360353469849, + "learning_rate": 9.999263401161643e-06, + "loss": 1.014, + "step": 298 + }, + { + "epoch": 0.01645660190434256, + "grad_norm": 0.9740132689476013, + "learning_rate": 9.999255942165475e-06, + "loss": 0.8606, + "step": 299 + }, + { + "epoch": 0.016511640706698222, + "grad_norm": 0.9821745753288269, + "learning_rate": 9.999248445596238e-06, + "loss": 0.8241, + "step": 300 + }, + { + "epoch": 0.016566679509053883, + "grad_norm": 1.0200445652008057, + "learning_rate": 9.999240911453986e-06, + "loss": 0.8256, + "step": 301 + }, + { + "epoch": 0.016621718311409545, + "grad_norm": 1.4100390672683716, + "learning_rate": 9.999233339738779e-06, + "loss": 0.9057, + "step": 302 + }, + { + "epoch": 0.016676757113765206, + "grad_norm": 1.056544303894043, + "learning_rate": 9.99922573045067e-06, + "loss": 1.0808, + "step": 303 + }, + { + "epoch": 0.016731795916120864, + "grad_norm": 0.9271026253700256, + "learning_rate": 9.99921808358972e-06, + "loss": 0.878, + "step": 304 + }, + { + "epoch": 0.016786834718476525, + "grad_norm": 0.9864157438278198, + "learning_rate": 9.999210399155987e-06, + "loss": 0.9198, + "step": 305 + }, + { + "epoch": 0.016841873520832187, + "grad_norm": 1.093995451927185, + "learning_rate": 9.999202677149525e-06, + "loss": 0.9794, + "step": 306 + }, + { + "epoch": 0.016896912323187848, + "grad_norm": 0.9717912077903748, + "learning_rate": 9.999194917570395e-06, + "loss": 0.8764, + "step": 307 + }, + { + "epoch": 0.01695195112554351, + "grad_norm": 1.0026428699493408, + "learning_rate": 9.999187120418653e-06, + "loss": 0.8526, + "step": 308 + }, + { + "epoch": 0.017006989927899167, + "grad_norm": 1.122870922088623, + "learning_rate": 9.999179285694359e-06, + "loss": 0.9773, + "step": 309 + }, + { + "epoch": 0.01706202873025483, + "grad_norm": 1.0522836446762085, + "learning_rate": 9.999171413397572e-06, + "loss": 1.0183, + "step": 310 + }, + { + "epoch": 0.01711706753261049, + "grad_norm": 0.9303658604621887, + "learning_rate": 9.99916350352835e-06, + "loss": 0.8402, + "step": 311 + }, + { + "epoch": 0.01717210633496615, + "grad_norm": 0.9606096148490906, + "learning_rate": 9.999155556086755e-06, + "loss": 0.9692, + "step": 312 + }, + { + "epoch": 0.017227145137321813, + "grad_norm": 1.176992416381836, + "learning_rate": 9.999147571072844e-06, + "loss": 0.8172, + "step": 313 + }, + { + "epoch": 0.017282183939677474, + "grad_norm": 1.1948801279067993, + "learning_rate": 9.999139548486678e-06, + "loss": 1.0205, + "step": 314 + }, + { + "epoch": 0.017337222742033132, + "grad_norm": 1.0064897537231445, + "learning_rate": 9.999131488328318e-06, + "loss": 0.9479, + "step": 315 + }, + { + "epoch": 0.017392261544388794, + "grad_norm": 1.048242449760437, + "learning_rate": 9.999123390597822e-06, + "loss": 0.9862, + "step": 316 + }, + { + "epoch": 0.017447300346744455, + "grad_norm": 1.12875497341156, + "learning_rate": 9.999115255295256e-06, + "loss": 0.9743, + "step": 317 + }, + { + "epoch": 0.017502339149100116, + "grad_norm": 1.0607460737228394, + "learning_rate": 9.999107082420674e-06, + "loss": 0.8878, + "step": 318 + }, + { + "epoch": 0.017557377951455778, + "grad_norm": 1.1480191946029663, + "learning_rate": 9.999098871974144e-06, + "loss": 0.8769, + "step": 319 + }, + { + "epoch": 0.017612416753811436, + "grad_norm": 1.1150004863739014, + "learning_rate": 9.999090623955724e-06, + "loss": 0.8615, + "step": 320 + }, + { + "epoch": 0.017667455556167097, + "grad_norm": 1.137839913368225, + "learning_rate": 9.999082338365478e-06, + "loss": 0.9703, + "step": 321 + }, + { + "epoch": 0.01772249435852276, + "grad_norm": 1.0883489847183228, + "learning_rate": 9.999074015203467e-06, + "loss": 0.9273, + "step": 322 + }, + { + "epoch": 0.01777753316087842, + "grad_norm": 1.0999557971954346, + "learning_rate": 9.999065654469752e-06, + "loss": 0.9605, + "step": 323 + }, + { + "epoch": 0.01783257196323408, + "grad_norm": 0.9911689758300781, + "learning_rate": 9.999057256164401e-06, + "loss": 0.9117, + "step": 324 + }, + { + "epoch": 0.01788761076558974, + "grad_norm": 1.040933609008789, + "learning_rate": 9.999048820287472e-06, + "loss": 0.9229, + "step": 325 + }, + { + "epoch": 0.0179426495679454, + "grad_norm": 1.4341392517089844, + "learning_rate": 9.999040346839031e-06, + "loss": 1.0718, + "step": 326 + }, + { + "epoch": 0.017997688370301062, + "grad_norm": 1.0246332883834839, + "learning_rate": 9.99903183581914e-06, + "loss": 0.9617, + "step": 327 + }, + { + "epoch": 0.018052727172656723, + "grad_norm": 10.162322998046875, + "learning_rate": 9.999023287227863e-06, + "loss": 1.0391, + "step": 328 + }, + { + "epoch": 0.018107765975012385, + "grad_norm": 1.3370027542114258, + "learning_rate": 9.999014701065266e-06, + "loss": 1.0211, + "step": 329 + }, + { + "epoch": 0.018162804777368046, + "grad_norm": 1.0146219730377197, + "learning_rate": 9.999006077331413e-06, + "loss": 0.8611, + "step": 330 + }, + { + "epoch": 0.018217843579723704, + "grad_norm": 1.0899269580841064, + "learning_rate": 9.998997416026368e-06, + "loss": 0.9209, + "step": 331 + }, + { + "epoch": 0.018272882382079365, + "grad_norm": 1.1343204975128174, + "learning_rate": 9.998988717150198e-06, + "loss": 0.9405, + "step": 332 + }, + { + "epoch": 0.018327921184435027, + "grad_norm": 1.2308380603790283, + "learning_rate": 9.998979980702965e-06, + "loss": 0.9579, + "step": 333 + }, + { + "epoch": 0.018382959986790688, + "grad_norm": 1.1433519124984741, + "learning_rate": 9.998971206684737e-06, + "loss": 1.0045, + "step": 334 + }, + { + "epoch": 0.01843799878914635, + "grad_norm": 1.0585781335830688, + "learning_rate": 9.99896239509558e-06, + "loss": 0.9171, + "step": 335 + }, + { + "epoch": 0.018493037591502007, + "grad_norm": 1.2735164165496826, + "learning_rate": 9.99895354593556e-06, + "loss": 1.1001, + "step": 336 + }, + { + "epoch": 0.01854807639385767, + "grad_norm": 1.2905755043029785, + "learning_rate": 9.998944659204744e-06, + "loss": 1.0294, + "step": 337 + }, + { + "epoch": 0.01860311519621333, + "grad_norm": 1.1442075967788696, + "learning_rate": 9.998935734903198e-06, + "loss": 0.9385, + "step": 338 + }, + { + "epoch": 0.01865815399856899, + "grad_norm": 1.1005232334136963, + "learning_rate": 9.998926773030987e-06, + "loss": 1.026, + "step": 339 + }, + { + "epoch": 0.018713192800924653, + "grad_norm": 1.2770785093307495, + "learning_rate": 9.998917773588182e-06, + "loss": 1.0015, + "step": 340 + }, + { + "epoch": 0.01876823160328031, + "grad_norm": 1.0963070392608643, + "learning_rate": 9.998908736574849e-06, + "loss": 0.9347, + "step": 341 + }, + { + "epoch": 0.018823270405635972, + "grad_norm": 1.10364830493927, + "learning_rate": 9.998899661991055e-06, + "loss": 0.869, + "step": 342 + }, + { + "epoch": 0.018878309207991634, + "grad_norm": 1.0364975929260254, + "learning_rate": 9.99889054983687e-06, + "loss": 0.9855, + "step": 343 + }, + { + "epoch": 0.018933348010347295, + "grad_norm": 1.104702115058899, + "learning_rate": 9.998881400112362e-06, + "loss": 0.9555, + "step": 344 + }, + { + "epoch": 0.018988386812702956, + "grad_norm": 0.9957441687583923, + "learning_rate": 9.998872212817599e-06, + "loss": 0.9634, + "step": 345 + }, + { + "epoch": 0.019043425615058618, + "grad_norm": 1.262271523475647, + "learning_rate": 9.998862987952651e-06, + "loss": 1.0133, + "step": 346 + }, + { + "epoch": 0.019098464417414276, + "grad_norm": 1.2075226306915283, + "learning_rate": 9.998853725517587e-06, + "loss": 1.0588, + "step": 347 + }, + { + "epoch": 0.019153503219769937, + "grad_norm": 1.0609898567199707, + "learning_rate": 9.998844425512477e-06, + "loss": 0.9952, + "step": 348 + }, + { + "epoch": 0.0192085420221256, + "grad_norm": 1.1930195093154907, + "learning_rate": 9.998835087937389e-06, + "loss": 0.9617, + "step": 349 + }, + { + "epoch": 0.01926358082448126, + "grad_norm": 1.2359932661056519, + "learning_rate": 9.998825712792396e-06, + "loss": 0.8768, + "step": 350 + }, + { + "epoch": 0.01931861962683692, + "grad_norm": 0.9984115362167358, + "learning_rate": 9.998816300077566e-06, + "loss": 0.8205, + "step": 351 + }, + { + "epoch": 0.01937365842919258, + "grad_norm": 1.6853677034378052, + "learning_rate": 9.998806849792972e-06, + "loss": 0.9066, + "step": 352 + }, + { + "epoch": 0.01942869723154824, + "grad_norm": 1.2869856357574463, + "learning_rate": 9.998797361938683e-06, + "loss": 1.0054, + "step": 353 + }, + { + "epoch": 0.019483736033903902, + "grad_norm": 1.2791584730148315, + "learning_rate": 9.99878783651477e-06, + "loss": 0.7627, + "step": 354 + }, + { + "epoch": 0.019538774836259563, + "grad_norm": 1.0795867443084717, + "learning_rate": 9.998778273521307e-06, + "loss": 0.9343, + "step": 355 + }, + { + "epoch": 0.019593813638615225, + "grad_norm": 1.0926088094711304, + "learning_rate": 9.998768672958365e-06, + "loss": 0.943, + "step": 356 + }, + { + "epoch": 0.019648852440970886, + "grad_norm": 1.0530847311019897, + "learning_rate": 9.998759034826015e-06, + "loss": 0.9656, + "step": 357 + }, + { + "epoch": 0.019703891243326544, + "grad_norm": 1.1793400049209595, + "learning_rate": 9.99874935912433e-06, + "loss": 0.9799, + "step": 358 + }, + { + "epoch": 0.019758930045682205, + "grad_norm": 1.0726191997528076, + "learning_rate": 9.998739645853383e-06, + "loss": 0.8739, + "step": 359 + }, + { + "epoch": 0.019813968848037867, + "grad_norm": 1.0488981008529663, + "learning_rate": 9.998729895013246e-06, + "loss": 0.8986, + "step": 360 + }, + { + "epoch": 0.019869007650393528, + "grad_norm": 1.8267477750778198, + "learning_rate": 9.998720106603993e-06, + "loss": 0.9175, + "step": 361 + }, + { + "epoch": 0.01992404645274919, + "grad_norm": 0.9868306517601013, + "learning_rate": 9.9987102806257e-06, + "loss": 0.9609, + "step": 362 + }, + { + "epoch": 0.019979085255104848, + "grad_norm": 1.0171183347702026, + "learning_rate": 9.998700417078438e-06, + "loss": 0.8904, + "step": 363 + }, + { + "epoch": 0.02003412405746051, + "grad_norm": 0.9800812602043152, + "learning_rate": 9.998690515962282e-06, + "loss": 0.8344, + "step": 364 + }, + { + "epoch": 0.02008916285981617, + "grad_norm": 1.024707317352295, + "learning_rate": 9.998680577277304e-06, + "loss": 0.9026, + "step": 365 + }, + { + "epoch": 0.02014420166217183, + "grad_norm": 1.1056619882583618, + "learning_rate": 9.998670601023584e-06, + "loss": 1.017, + "step": 366 + }, + { + "epoch": 0.020199240464527493, + "grad_norm": 1.0555908679962158, + "learning_rate": 9.998660587201191e-06, + "loss": 0.9627, + "step": 367 + }, + { + "epoch": 0.02025427926688315, + "grad_norm": 0.9502031803131104, + "learning_rate": 9.998650535810204e-06, + "loss": 0.935, + "step": 368 + }, + { + "epoch": 0.020309318069238812, + "grad_norm": 1.0355613231658936, + "learning_rate": 9.998640446850699e-06, + "loss": 0.9946, + "step": 369 + }, + { + "epoch": 0.020364356871594474, + "grad_norm": 0.9906355142593384, + "learning_rate": 9.99863032032275e-06, + "loss": 0.9389, + "step": 370 + }, + { + "epoch": 0.020419395673950135, + "grad_norm": 0.9483911395072937, + "learning_rate": 9.99862015622643e-06, + "loss": 0.979, + "step": 371 + }, + { + "epoch": 0.020474434476305797, + "grad_norm": 0.9769986271858215, + "learning_rate": 9.998609954561822e-06, + "loss": 0.8972, + "step": 372 + }, + { + "epoch": 0.020529473278661458, + "grad_norm": 1.1682699918746948, + "learning_rate": 9.998599715329e-06, + "loss": 0.943, + "step": 373 + }, + { + "epoch": 0.020584512081017116, + "grad_norm": 1.007912516593933, + "learning_rate": 9.99858943852804e-06, + "loss": 0.8825, + "step": 374 + }, + { + "epoch": 0.020639550883372777, + "grad_norm": 0.9788785576820374, + "learning_rate": 9.99857912415902e-06, + "loss": 0.9667, + "step": 375 + }, + { + "epoch": 0.02069458968572844, + "grad_norm": 1.0804275274276733, + "learning_rate": 9.998568772222017e-06, + "loss": 1.0026, + "step": 376 + }, + { + "epoch": 0.0207496284880841, + "grad_norm": 1.0859237909317017, + "learning_rate": 9.998558382717109e-06, + "loss": 0.9592, + "step": 377 + }, + { + "epoch": 0.02080466729043976, + "grad_norm": 1.2925337553024292, + "learning_rate": 9.998547955644373e-06, + "loss": 0.9067, + "step": 378 + }, + { + "epoch": 0.02085970609279542, + "grad_norm": 0.9853373765945435, + "learning_rate": 9.99853749100389e-06, + "loss": 0.9538, + "step": 379 + }, + { + "epoch": 0.02091474489515108, + "grad_norm": 1.0461076498031616, + "learning_rate": 9.998526988795738e-06, + "loss": 0.9261, + "step": 380 + }, + { + "epoch": 0.020969783697506742, + "grad_norm": 1.024559497833252, + "learning_rate": 9.998516449019995e-06, + "loss": 0.9117, + "step": 381 + }, + { + "epoch": 0.021024822499862404, + "grad_norm": 1.1474825143814087, + "learning_rate": 9.998505871676739e-06, + "loss": 1.0177, + "step": 382 + }, + { + "epoch": 0.021079861302218065, + "grad_norm": 0.9587596654891968, + "learning_rate": 9.998495256766051e-06, + "loss": 0.8809, + "step": 383 + }, + { + "epoch": 0.021134900104573723, + "grad_norm": 0.9505122303962708, + "learning_rate": 9.998484604288013e-06, + "loss": 0.9266, + "step": 384 + }, + { + "epoch": 0.021189938906929384, + "grad_norm": 0.9625647664070129, + "learning_rate": 9.9984739142427e-06, + "loss": 0.9073, + "step": 385 + }, + { + "epoch": 0.021244977709285046, + "grad_norm": 0.9650934338569641, + "learning_rate": 9.998463186630196e-06, + "loss": 0.9042, + "step": 386 + }, + { + "epoch": 0.021300016511640707, + "grad_norm": 1.0289491415023804, + "learning_rate": 9.99845242145058e-06, + "loss": 0.929, + "step": 387 + }, + { + "epoch": 0.02135505531399637, + "grad_norm": 0.9543869495391846, + "learning_rate": 9.998441618703935e-06, + "loss": 0.9406, + "step": 388 + }, + { + "epoch": 0.02141009411635203, + "grad_norm": 0.9276942610740662, + "learning_rate": 9.99843077839034e-06, + "loss": 0.8982, + "step": 389 + }, + { + "epoch": 0.021465132918707688, + "grad_norm": 0.9264664053916931, + "learning_rate": 9.998419900509877e-06, + "loss": 0.7255, + "step": 390 + }, + { + "epoch": 0.02152017172106335, + "grad_norm": 0.9961187243461609, + "learning_rate": 9.998408985062628e-06, + "loss": 0.9826, + "step": 391 + }, + { + "epoch": 0.02157521052341901, + "grad_norm": 0.966596245765686, + "learning_rate": 9.998398032048676e-06, + "loss": 0.8159, + "step": 392 + }, + { + "epoch": 0.021630249325774672, + "grad_norm": 1.1336095333099365, + "learning_rate": 9.998387041468102e-06, + "loss": 0.9289, + "step": 393 + }, + { + "epoch": 0.021685288128130333, + "grad_norm": 1.0453619956970215, + "learning_rate": 9.998376013320989e-06, + "loss": 0.8816, + "step": 394 + }, + { + "epoch": 0.02174032693048599, + "grad_norm": 0.8961821794509888, + "learning_rate": 9.998364947607419e-06, + "loss": 0.871, + "step": 395 + }, + { + "epoch": 0.021795365732841653, + "grad_norm": 1.3420332670211792, + "learning_rate": 9.998353844327477e-06, + "loss": 0.9338, + "step": 396 + }, + { + "epoch": 0.021850404535197314, + "grad_norm": 0.9635335206985474, + "learning_rate": 9.998342703481246e-06, + "loss": 0.9592, + "step": 397 + }, + { + "epoch": 0.021905443337552975, + "grad_norm": 1.3322341442108154, + "learning_rate": 9.998331525068807e-06, + "loss": 1.0974, + "step": 398 + }, + { + "epoch": 0.021960482139908637, + "grad_norm": 1.017220377922058, + "learning_rate": 9.998320309090247e-06, + "loss": 0.9827, + "step": 399 + }, + { + "epoch": 0.022015520942264295, + "grad_norm": 1.0080329179763794, + "learning_rate": 9.99830905554565e-06, + "loss": 0.877, + "step": 400 + }, + { + "epoch": 0.022070559744619956, + "grad_norm": 0.9883211255073547, + "learning_rate": 9.998297764435101e-06, + "loss": 0.9625, + "step": 401 + }, + { + "epoch": 0.022125598546975617, + "grad_norm": 1.0948412418365479, + "learning_rate": 9.998286435758684e-06, + "loss": 0.9058, + "step": 402 + }, + { + "epoch": 0.02218063734933128, + "grad_norm": 0.9402000308036804, + "learning_rate": 9.998275069516482e-06, + "loss": 0.8882, + "step": 403 + }, + { + "epoch": 0.02223567615168694, + "grad_norm": 0.9858806133270264, + "learning_rate": 9.998263665708583e-06, + "loss": 0.9086, + "step": 404 + }, + { + "epoch": 0.0222907149540426, + "grad_norm": 1.0556131601333618, + "learning_rate": 9.998252224335073e-06, + "loss": 0.9583, + "step": 405 + }, + { + "epoch": 0.02234575375639826, + "grad_norm": 1.092766284942627, + "learning_rate": 9.998240745396037e-06, + "loss": 0.9124, + "step": 406 + }, + { + "epoch": 0.02240079255875392, + "grad_norm": 1.1902250051498413, + "learning_rate": 9.998229228891563e-06, + "loss": 1.0566, + "step": 407 + }, + { + "epoch": 0.022455831361109582, + "grad_norm": 1.067906141281128, + "learning_rate": 9.998217674821734e-06, + "loss": 0.9823, + "step": 408 + }, + { + "epoch": 0.022510870163465244, + "grad_norm": 1.0051710605621338, + "learning_rate": 9.998206083186638e-06, + "loss": 0.9141, + "step": 409 + }, + { + "epoch": 0.022565908965820905, + "grad_norm": 1.046412467956543, + "learning_rate": 9.998194453986367e-06, + "loss": 0.9439, + "step": 410 + }, + { + "epoch": 0.022620947768176563, + "grad_norm": 1.1103553771972656, + "learning_rate": 9.998182787221e-06, + "loss": 0.9494, + "step": 411 + }, + { + "epoch": 0.022675986570532224, + "grad_norm": 1.0508466958999634, + "learning_rate": 9.998171082890632e-06, + "loss": 0.9202, + "step": 412 + }, + { + "epoch": 0.022731025372887886, + "grad_norm": 1.1364226341247559, + "learning_rate": 9.998159340995347e-06, + "loss": 0.9859, + "step": 413 + }, + { + "epoch": 0.022786064175243547, + "grad_norm": 1.2073607444763184, + "learning_rate": 9.998147561535234e-06, + "loss": 0.8883, + "step": 414 + }, + { + "epoch": 0.02284110297759921, + "grad_norm": 1.0657012462615967, + "learning_rate": 9.998135744510384e-06, + "loss": 0.8321, + "step": 415 + }, + { + "epoch": 0.02289614177995487, + "grad_norm": 1.0101548433303833, + "learning_rate": 9.998123889920881e-06, + "loss": 0.9374, + "step": 416 + }, + { + "epoch": 0.022951180582310528, + "grad_norm": 1.057455062866211, + "learning_rate": 9.998111997766817e-06, + "loss": 0.8831, + "step": 417 + }, + { + "epoch": 0.02300621938466619, + "grad_norm": 1.206092357635498, + "learning_rate": 9.998100068048282e-06, + "loss": 0.8812, + "step": 418 + }, + { + "epoch": 0.02306125818702185, + "grad_norm": 1.0709773302078247, + "learning_rate": 9.998088100765366e-06, + "loss": 0.9486, + "step": 419 + }, + { + "epoch": 0.023116296989377512, + "grad_norm": 1.066469669342041, + "learning_rate": 9.998076095918156e-06, + "loss": 1.0229, + "step": 420 + }, + { + "epoch": 0.023171335791733173, + "grad_norm": 1.0443583726882935, + "learning_rate": 9.998064053506744e-06, + "loss": 0.8615, + "step": 421 + }, + { + "epoch": 0.02322637459408883, + "grad_norm": 1.103096842765808, + "learning_rate": 9.99805197353122e-06, + "loss": 0.9909, + "step": 422 + }, + { + "epoch": 0.023281413396444493, + "grad_norm": 0.9804643392562866, + "learning_rate": 9.998039855991677e-06, + "loss": 0.9214, + "step": 423 + }, + { + "epoch": 0.023336452198800154, + "grad_norm": 0.9880676865577698, + "learning_rate": 9.998027700888202e-06, + "loss": 0.9345, + "step": 424 + }, + { + "epoch": 0.023391491001155815, + "grad_norm": 0.9633826017379761, + "learning_rate": 9.99801550822089e-06, + "loss": 0.9897, + "step": 425 + }, + { + "epoch": 0.023446529803511477, + "grad_norm": 1.0159331560134888, + "learning_rate": 9.998003277989831e-06, + "loss": 0.9385, + "step": 426 + }, + { + "epoch": 0.023501568605867135, + "grad_norm": 1.009667158126831, + "learning_rate": 9.99799101019512e-06, + "loss": 0.9013, + "step": 427 + }, + { + "epoch": 0.023556607408222796, + "grad_norm": 0.9478578567504883, + "learning_rate": 9.997978704836842e-06, + "loss": 0.8775, + "step": 428 + }, + { + "epoch": 0.023611646210578457, + "grad_norm": 1.013181447982788, + "learning_rate": 9.997966361915096e-06, + "loss": 0.8797, + "step": 429 + }, + { + "epoch": 0.02366668501293412, + "grad_norm": 1.0337481498718262, + "learning_rate": 9.997953981429974e-06, + "loss": 1.0047, + "step": 430 + }, + { + "epoch": 0.02372172381528978, + "grad_norm": 0.9423721432685852, + "learning_rate": 9.997941563381566e-06, + "loss": 0.8639, + "step": 431 + }, + { + "epoch": 0.02377676261764544, + "grad_norm": 1.100492000579834, + "learning_rate": 9.997929107769968e-06, + "loss": 1.0022, + "step": 432 + }, + { + "epoch": 0.0238318014200011, + "grad_norm": 1.1232364177703857, + "learning_rate": 9.997916614595272e-06, + "loss": 0.9145, + "step": 433 + }, + { + "epoch": 0.02388684022235676, + "grad_norm": 0.9466833472251892, + "learning_rate": 9.997904083857572e-06, + "loss": 0.9397, + "step": 434 + }, + { + "epoch": 0.023941879024712422, + "grad_norm": 0.9514566659927368, + "learning_rate": 9.997891515556963e-06, + "loss": 0.8025, + "step": 435 + }, + { + "epoch": 0.023996917827068084, + "grad_norm": 0.9292222261428833, + "learning_rate": 9.997878909693539e-06, + "loss": 0.7739, + "step": 436 + }, + { + "epoch": 0.024051956629423745, + "grad_norm": 1.1049963235855103, + "learning_rate": 9.997866266267397e-06, + "loss": 0.9439, + "step": 437 + }, + { + "epoch": 0.024106995431779403, + "grad_norm": 1.0938019752502441, + "learning_rate": 9.997853585278627e-06, + "loss": 0.9479, + "step": 438 + }, + { + "epoch": 0.024162034234135064, + "grad_norm": 1.0423611402511597, + "learning_rate": 9.997840866727331e-06, + "loss": 0.9309, + "step": 439 + }, + { + "epoch": 0.024217073036490726, + "grad_norm": 1.0584756135940552, + "learning_rate": 9.997828110613598e-06, + "loss": 1.0218, + "step": 440 + }, + { + "epoch": 0.024272111838846387, + "grad_norm": 0.9986408948898315, + "learning_rate": 9.997815316937527e-06, + "loss": 0.9734, + "step": 441 + }, + { + "epoch": 0.02432715064120205, + "grad_norm": 0.9680983424186707, + "learning_rate": 9.997802485699215e-06, + "loss": 0.9286, + "step": 442 + }, + { + "epoch": 0.024382189443557706, + "grad_norm": 1.2231700420379639, + "learning_rate": 9.997789616898757e-06, + "loss": 0.8083, + "step": 443 + }, + { + "epoch": 0.024437228245913368, + "grad_norm": 1.0064021348953247, + "learning_rate": 9.99777671053625e-06, + "loss": 0.9161, + "step": 444 + }, + { + "epoch": 0.02449226704826903, + "grad_norm": 0.9658541679382324, + "learning_rate": 9.99776376661179e-06, + "loss": 0.8027, + "step": 445 + }, + { + "epoch": 0.02454730585062469, + "grad_norm": 0.9440343379974365, + "learning_rate": 9.997750785125477e-06, + "loss": 0.9124, + "step": 446 + }, + { + "epoch": 0.024602344652980352, + "grad_norm": 0.998792827129364, + "learning_rate": 9.997737766077404e-06, + "loss": 0.8699, + "step": 447 + }, + { + "epoch": 0.024657383455336013, + "grad_norm": 1.430880069732666, + "learning_rate": 9.997724709467676e-06, + "loss": 0.9158, + "step": 448 + }, + { + "epoch": 0.02471242225769167, + "grad_norm": 0.9737820029258728, + "learning_rate": 9.997711615296384e-06, + "loss": 0.9496, + "step": 449 + }, + { + "epoch": 0.024767461060047333, + "grad_norm": 0.9710075855255127, + "learning_rate": 9.997698483563629e-06, + "loss": 0.8714, + "step": 450 + }, + { + "epoch": 0.024822499862402994, + "grad_norm": 1.5286253690719604, + "learning_rate": 9.997685314269511e-06, + "loss": 0.8421, + "step": 451 + }, + { + "epoch": 0.024877538664758655, + "grad_norm": 1.0269445180892944, + "learning_rate": 9.99767210741413e-06, + "loss": 1.0131, + "step": 452 + }, + { + "epoch": 0.024932577467114317, + "grad_norm": 0.9780508279800415, + "learning_rate": 9.99765886299758e-06, + "loss": 0.9897, + "step": 453 + }, + { + "epoch": 0.024987616269469975, + "grad_norm": 0.998332679271698, + "learning_rate": 9.997645581019965e-06, + "loss": 0.9647, + "step": 454 + }, + { + "epoch": 0.025042655071825636, + "grad_norm": 1.7062602043151855, + "learning_rate": 9.997632261481383e-06, + "loss": 1.0729, + "step": 455 + }, + { + "epoch": 0.025097693874181298, + "grad_norm": 0.9793694615364075, + "learning_rate": 9.997618904381936e-06, + "loss": 0.9556, + "step": 456 + }, + { + "epoch": 0.02515273267653696, + "grad_norm": 1.0183895826339722, + "learning_rate": 9.997605509721721e-06, + "loss": 0.9194, + "step": 457 + }, + { + "epoch": 0.02520777147889262, + "grad_norm": 1.0288400650024414, + "learning_rate": 9.997592077500844e-06, + "loss": 0.955, + "step": 458 + }, + { + "epoch": 0.025262810281248282, + "grad_norm": 0.9551253914833069, + "learning_rate": 9.997578607719401e-06, + "loss": 0.8498, + "step": 459 + }, + { + "epoch": 0.02531784908360394, + "grad_norm": 0.9648008942604065, + "learning_rate": 9.997565100377494e-06, + "loss": 0.9306, + "step": 460 + }, + { + "epoch": 0.0253728878859596, + "grad_norm": 0.9206677675247192, + "learning_rate": 9.997551555475225e-06, + "loss": 0.7874, + "step": 461 + }, + { + "epoch": 0.025427926688315262, + "grad_norm": 1.0479545593261719, + "learning_rate": 9.997537973012698e-06, + "loss": 0.9201, + "step": 462 + }, + { + "epoch": 0.025482965490670924, + "grad_norm": 1.0329946279525757, + "learning_rate": 9.997524352990013e-06, + "loss": 0.9577, + "step": 463 + }, + { + "epoch": 0.025538004293026585, + "grad_norm": 1.1177828311920166, + "learning_rate": 9.997510695407273e-06, + "loss": 1.0041, + "step": 464 + }, + { + "epoch": 0.025593043095382243, + "grad_norm": 1.0351577997207642, + "learning_rate": 9.99749700026458e-06, + "loss": 0.9952, + "step": 465 + }, + { + "epoch": 0.025648081897737905, + "grad_norm": 0.905274510383606, + "learning_rate": 9.997483267562035e-06, + "loss": 0.8185, + "step": 466 + }, + { + "epoch": 0.025703120700093566, + "grad_norm": 1.0749776363372803, + "learning_rate": 9.997469497299747e-06, + "loss": 1.0611, + "step": 467 + }, + { + "epoch": 0.025758159502449227, + "grad_norm": 0.8972223401069641, + "learning_rate": 9.997455689477815e-06, + "loss": 0.8994, + "step": 468 + }, + { + "epoch": 0.02581319830480489, + "grad_norm": 1.0669914484024048, + "learning_rate": 9.997441844096342e-06, + "loss": 1.06, + "step": 469 + }, + { + "epoch": 0.025868237107160547, + "grad_norm": 1.0431914329528809, + "learning_rate": 9.997427961155435e-06, + "loss": 0.8657, + "step": 470 + }, + { + "epoch": 0.025923275909516208, + "grad_norm": 0.9609962701797485, + "learning_rate": 9.997414040655198e-06, + "loss": 0.8864, + "step": 471 + }, + { + "epoch": 0.02597831471187187, + "grad_norm": 1.0829721689224243, + "learning_rate": 9.997400082595735e-06, + "loss": 0.9221, + "step": 472 + }, + { + "epoch": 0.02603335351422753, + "grad_norm": 0.992082953453064, + "learning_rate": 9.99738608697715e-06, + "loss": 0.8455, + "step": 473 + }, + { + "epoch": 0.026088392316583192, + "grad_norm": 1.0486301183700562, + "learning_rate": 9.997372053799547e-06, + "loss": 0.8729, + "step": 474 + }, + { + "epoch": 0.026143431118938854, + "grad_norm": 1.0328491926193237, + "learning_rate": 9.997357983063036e-06, + "loss": 0.8788, + "step": 475 + }, + { + "epoch": 0.02619846992129451, + "grad_norm": 0.963333249092102, + "learning_rate": 9.997343874767719e-06, + "loss": 0.892, + "step": 476 + }, + { + "epoch": 0.026253508723650173, + "grad_norm": 1.1606497764587402, + "learning_rate": 9.997329728913704e-06, + "loss": 0.9984, + "step": 477 + }, + { + "epoch": 0.026308547526005834, + "grad_norm": 1.241650104522705, + "learning_rate": 9.997315545501096e-06, + "loss": 0.946, + "step": 478 + }, + { + "epoch": 0.026363586328361496, + "grad_norm": 1.008004069328308, + "learning_rate": 9.99730132453e-06, + "loss": 0.849, + "step": 479 + }, + { + "epoch": 0.026418625130717157, + "grad_norm": 0.9883478879928589, + "learning_rate": 9.997287066000527e-06, + "loss": 0.9478, + "step": 480 + }, + { + "epoch": 0.026473663933072815, + "grad_norm": 1.0224446058273315, + "learning_rate": 9.997272769912783e-06, + "loss": 1.0318, + "step": 481 + }, + { + "epoch": 0.026528702735428476, + "grad_norm": 0.9412569403648376, + "learning_rate": 9.997258436266874e-06, + "loss": 0.9119, + "step": 482 + }, + { + "epoch": 0.026583741537784138, + "grad_norm": 0.9214537739753723, + "learning_rate": 9.997244065062906e-06, + "loss": 0.8785, + "step": 483 + }, + { + "epoch": 0.0266387803401398, + "grad_norm": 1.0015628337860107, + "learning_rate": 9.997229656300991e-06, + "loss": 0.8869, + "step": 484 + }, + { + "epoch": 0.02669381914249546, + "grad_norm": 0.8965190052986145, + "learning_rate": 9.997215209981237e-06, + "loss": 0.7009, + "step": 485 + }, + { + "epoch": 0.02674885794485112, + "grad_norm": 1.1976135969161987, + "learning_rate": 9.997200726103749e-06, + "loss": 0.9795, + "step": 486 + }, + { + "epoch": 0.02680389674720678, + "grad_norm": 0.864780843257904, + "learning_rate": 9.997186204668639e-06, + "loss": 0.7687, + "step": 487 + }, + { + "epoch": 0.02685893554956244, + "grad_norm": 0.9946566820144653, + "learning_rate": 9.997171645676013e-06, + "loss": 0.9672, + "step": 488 + }, + { + "epoch": 0.026913974351918103, + "grad_norm": 1.043835997581482, + "learning_rate": 9.997157049125985e-06, + "loss": 0.862, + "step": 489 + }, + { + "epoch": 0.026969013154273764, + "grad_norm": 0.9697456955909729, + "learning_rate": 9.99714241501866e-06, + "loss": 0.8368, + "step": 490 + }, + { + "epoch": 0.027024051956629425, + "grad_norm": 0.9975618124008179, + "learning_rate": 9.997127743354153e-06, + "loss": 0.8739, + "step": 491 + }, + { + "epoch": 0.027079090758985083, + "grad_norm": 1.0055313110351562, + "learning_rate": 9.99711303413257e-06, + "loss": 0.9227, + "step": 492 + }, + { + "epoch": 0.027134129561340745, + "grad_norm": 1.0418384075164795, + "learning_rate": 9.997098287354024e-06, + "loss": 0.9978, + "step": 493 + }, + { + "epoch": 0.027189168363696406, + "grad_norm": 0.8648970723152161, + "learning_rate": 9.997083503018625e-06, + "loss": 0.8363, + "step": 494 + }, + { + "epoch": 0.027244207166052067, + "grad_norm": 1.13506019115448, + "learning_rate": 9.997068681126483e-06, + "loss": 0.8851, + "step": 495 + }, + { + "epoch": 0.02729924596840773, + "grad_norm": 0.974400520324707, + "learning_rate": 9.997053821677712e-06, + "loss": 0.8533, + "step": 496 + }, + { + "epoch": 0.027354284770763387, + "grad_norm": 1.226507544517517, + "learning_rate": 9.997038924672419e-06, + "loss": 0.8586, + "step": 497 + }, + { + "epoch": 0.027409323573119048, + "grad_norm": 1.004753589630127, + "learning_rate": 9.997023990110721e-06, + "loss": 0.8974, + "step": 498 + }, + { + "epoch": 0.02746436237547471, + "grad_norm": 1.0492571592330933, + "learning_rate": 9.997009017992729e-06, + "loss": 0.8457, + "step": 499 + }, + { + "epoch": 0.02751940117783037, + "grad_norm": 1.0068167448043823, + "learning_rate": 9.996994008318554e-06, + "loss": 0.9608, + "step": 500 + }, + { + "epoch": 0.027574439980186032, + "grad_norm": 0.9686044454574585, + "learning_rate": 9.996978961088311e-06, + "loss": 0.9041, + "step": 501 + }, + { + "epoch": 0.027629478782541694, + "grad_norm": 1.281728744506836, + "learning_rate": 9.99696387630211e-06, + "loss": 0.9739, + "step": 502 + }, + { + "epoch": 0.02768451758489735, + "grad_norm": 0.9069758653640747, + "learning_rate": 9.996948753960065e-06, + "loss": 0.8467, + "step": 503 + }, + { + "epoch": 0.027739556387253013, + "grad_norm": 1.0337222814559937, + "learning_rate": 9.996933594062293e-06, + "loss": 0.9638, + "step": 504 + }, + { + "epoch": 0.027794595189608674, + "grad_norm": 0.9695359468460083, + "learning_rate": 9.996918396608905e-06, + "loss": 0.8986, + "step": 505 + }, + { + "epoch": 0.027849633991964336, + "grad_norm": 0.9120615124702454, + "learning_rate": 9.996903161600016e-06, + "loss": 0.9103, + "step": 506 + }, + { + "epoch": 0.027904672794319997, + "grad_norm": 0.9736546874046326, + "learning_rate": 9.996887889035741e-06, + "loss": 0.9308, + "step": 507 + }, + { + "epoch": 0.027959711596675655, + "grad_norm": 1.0184897184371948, + "learning_rate": 9.996872578916192e-06, + "loss": 0.8978, + "step": 508 + }, + { + "epoch": 0.028014750399031316, + "grad_norm": 0.9791838526725769, + "learning_rate": 9.996857231241489e-06, + "loss": 0.8639, + "step": 509 + }, + { + "epoch": 0.028069789201386978, + "grad_norm": 1.2985681295394897, + "learning_rate": 9.996841846011742e-06, + "loss": 0.9581, + "step": 510 + }, + { + "epoch": 0.02812482800374264, + "grad_norm": 1.0647368431091309, + "learning_rate": 9.996826423227071e-06, + "loss": 1.0565, + "step": 511 + }, + { + "epoch": 0.0281798668060983, + "grad_norm": 1.0336421728134155, + "learning_rate": 9.996810962887591e-06, + "loss": 1.008, + "step": 512 + }, + { + "epoch": 0.02823490560845396, + "grad_norm": 1.1838933229446411, + "learning_rate": 9.996795464993416e-06, + "loss": 0.8359, + "step": 513 + }, + { + "epoch": 0.02828994441080962, + "grad_norm": 0.9898360371589661, + "learning_rate": 9.996779929544663e-06, + "loss": 0.8501, + "step": 514 + }, + { + "epoch": 0.02834498321316528, + "grad_norm": 0.9836066365242004, + "learning_rate": 9.99676435654145e-06, + "loss": 0.8795, + "step": 515 + }, + { + "epoch": 0.028400022015520943, + "grad_norm": 1.0621601343154907, + "learning_rate": 9.996748745983895e-06, + "loss": 0.8746, + "step": 516 + }, + { + "epoch": 0.028455060817876604, + "grad_norm": 1.0082437992095947, + "learning_rate": 9.996733097872113e-06, + "loss": 0.9278, + "step": 517 + }, + { + "epoch": 0.028510099620232265, + "grad_norm": 0.9903931617736816, + "learning_rate": 9.996717412206222e-06, + "loss": 0.8264, + "step": 518 + }, + { + "epoch": 0.028565138422587923, + "grad_norm": 1.0797243118286133, + "learning_rate": 9.996701688986342e-06, + "loss": 1.0077, + "step": 519 + }, + { + "epoch": 0.028620177224943585, + "grad_norm": 1.147133231163025, + "learning_rate": 9.99668592821259e-06, + "loss": 0.9374, + "step": 520 + }, + { + "epoch": 0.028675216027299246, + "grad_norm": 0.9993947744369507, + "learning_rate": 9.996670129885082e-06, + "loss": 0.9562, + "step": 521 + }, + { + "epoch": 0.028730254829654907, + "grad_norm": 0.8580895066261292, + "learning_rate": 9.99665429400394e-06, + "loss": 0.7985, + "step": 522 + }, + { + "epoch": 0.02878529363201057, + "grad_norm": 0.9251388907432556, + "learning_rate": 9.996638420569281e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.028840332434366227, + "grad_norm": 1.0010193586349487, + "learning_rate": 9.996622509581227e-06, + "loss": 0.9316, + "step": 524 + }, + { + "epoch": 0.028895371236721888, + "grad_norm": 0.9822579026222229, + "learning_rate": 9.996606561039894e-06, + "loss": 0.8978, + "step": 525 + }, + { + "epoch": 0.02895041003907755, + "grad_norm": 1.0760595798492432, + "learning_rate": 9.996590574945403e-06, + "loss": 0.9125, + "step": 526 + }, + { + "epoch": 0.02900544884143321, + "grad_norm": 1.138869285583496, + "learning_rate": 9.996574551297876e-06, + "loss": 0.8185, + "step": 527 + }, + { + "epoch": 0.029060487643788872, + "grad_norm": 1.002994179725647, + "learning_rate": 9.996558490097433e-06, + "loss": 0.9404, + "step": 528 + }, + { + "epoch": 0.02911552644614453, + "grad_norm": 0.9550611972808838, + "learning_rate": 9.996542391344194e-06, + "loss": 0.859, + "step": 529 + }, + { + "epoch": 0.02917056524850019, + "grad_norm": 0.9236055612564087, + "learning_rate": 9.996526255038277e-06, + "loss": 0.7758, + "step": 530 + }, + { + "epoch": 0.029225604050855853, + "grad_norm": 1.103966474533081, + "learning_rate": 9.996510081179808e-06, + "loss": 1.0147, + "step": 531 + }, + { + "epoch": 0.029280642853211514, + "grad_norm": 0.9884665012359619, + "learning_rate": 9.996493869768906e-06, + "loss": 0.8784, + "step": 532 + }, + { + "epoch": 0.029335681655567176, + "grad_norm": 0.9173223376274109, + "learning_rate": 9.996477620805694e-06, + "loss": 0.8741, + "step": 533 + }, + { + "epoch": 0.029390720457922837, + "grad_norm": 0.965548574924469, + "learning_rate": 9.996461334290294e-06, + "loss": 0.8989, + "step": 534 + }, + { + "epoch": 0.029445759260278495, + "grad_norm": 0.9939296245574951, + "learning_rate": 9.996445010222828e-06, + "loss": 0.8552, + "step": 535 + }, + { + "epoch": 0.029500798062634156, + "grad_norm": 1.0081578493118286, + "learning_rate": 9.996428648603417e-06, + "loss": 0.9138, + "step": 536 + }, + { + "epoch": 0.029555836864989818, + "grad_norm": 1.0139487981796265, + "learning_rate": 9.996412249432188e-06, + "loss": 0.9452, + "step": 537 + }, + { + "epoch": 0.02961087566734548, + "grad_norm": 0.9463647603988647, + "learning_rate": 9.996395812709262e-06, + "loss": 0.8721, + "step": 538 + }, + { + "epoch": 0.02966591446970114, + "grad_norm": 0.9981473684310913, + "learning_rate": 9.99637933843476e-06, + "loss": 0.7791, + "step": 539 + }, + { + "epoch": 0.0297209532720568, + "grad_norm": 1.1637190580368042, + "learning_rate": 9.996362826608812e-06, + "loss": 0.8798, + "step": 540 + }, + { + "epoch": 0.02977599207441246, + "grad_norm": 2.2887051105499268, + "learning_rate": 9.996346277231536e-06, + "loss": 0.9303, + "step": 541 + }, + { + "epoch": 0.02983103087676812, + "grad_norm": 0.9173391461372375, + "learning_rate": 9.99632969030306e-06, + "loss": 0.8627, + "step": 542 + }, + { + "epoch": 0.029886069679123783, + "grad_norm": 1.033355474472046, + "learning_rate": 9.996313065823506e-06, + "loss": 0.9906, + "step": 543 + }, + { + "epoch": 0.029941108481479444, + "grad_norm": 0.9286639094352722, + "learning_rate": 9.996296403793002e-06, + "loss": 0.7043, + "step": 544 + }, + { + "epoch": 0.029996147283835102, + "grad_norm": 0.963238000869751, + "learning_rate": 9.996279704211671e-06, + "loss": 1.0236, + "step": 545 + }, + { + "epoch": 0.030051186086190763, + "grad_norm": 1.0275089740753174, + "learning_rate": 9.99626296707964e-06, + "loss": 0.976, + "step": 546 + }, + { + "epoch": 0.030106224888546425, + "grad_norm": 1.0944674015045166, + "learning_rate": 9.996246192397032e-06, + "loss": 0.9209, + "step": 547 + }, + { + "epoch": 0.030161263690902086, + "grad_norm": 0.9620945453643799, + "learning_rate": 9.996229380163976e-06, + "loss": 0.8973, + "step": 548 + }, + { + "epoch": 0.030216302493257748, + "grad_norm": 1.032549500465393, + "learning_rate": 9.996212530380597e-06, + "loss": 0.892, + "step": 549 + }, + { + "epoch": 0.03027134129561341, + "grad_norm": 1.0433719158172607, + "learning_rate": 9.996195643047023e-06, + "loss": 0.8428, + "step": 550 + }, + { + "epoch": 0.030326380097969067, + "grad_norm": 1.1541085243225098, + "learning_rate": 9.996178718163378e-06, + "loss": 0.9084, + "step": 551 + }, + { + "epoch": 0.03038141890032473, + "grad_norm": 0.9386873245239258, + "learning_rate": 9.996161755729793e-06, + "loss": 0.9246, + "step": 552 + }, + { + "epoch": 0.03043645770268039, + "grad_norm": 1.092236042022705, + "learning_rate": 9.996144755746393e-06, + "loss": 0.8419, + "step": 553 + }, + { + "epoch": 0.03049149650503605, + "grad_norm": 0.9517606496810913, + "learning_rate": 9.996127718213306e-06, + "loss": 0.9002, + "step": 554 + }, + { + "epoch": 0.030546535307391712, + "grad_norm": 0.965972900390625, + "learning_rate": 9.996110643130661e-06, + "loss": 0.9197, + "step": 555 + }, + { + "epoch": 0.03060157410974737, + "grad_norm": 0.9396095275878906, + "learning_rate": 9.996093530498586e-06, + "loss": 0.8686, + "step": 556 + }, + { + "epoch": 0.030656612912103032, + "grad_norm": 1.0154120922088623, + "learning_rate": 9.99607638031721e-06, + "loss": 0.9773, + "step": 557 + }, + { + "epoch": 0.030711651714458693, + "grad_norm": 1.3572301864624023, + "learning_rate": 9.99605919258666e-06, + "loss": 0.911, + "step": 558 + }, + { + "epoch": 0.030766690516814355, + "grad_norm": 0.968278169631958, + "learning_rate": 9.996041967307066e-06, + "loss": 0.7704, + "step": 559 + }, + { + "epoch": 0.030821729319170016, + "grad_norm": 0.9867869019508362, + "learning_rate": 9.99602470447856e-06, + "loss": 0.873, + "step": 560 + }, + { + "epoch": 0.030876768121525677, + "grad_norm": 1.056450605392456, + "learning_rate": 9.996007404101269e-06, + "loss": 0.941, + "step": 561 + }, + { + "epoch": 0.030931806923881335, + "grad_norm": 1.0419799089431763, + "learning_rate": 9.995990066175321e-06, + "loss": 0.957, + "step": 562 + }, + { + "epoch": 0.030986845726236997, + "grad_norm": 0.9789314866065979, + "learning_rate": 9.995972690700852e-06, + "loss": 0.9229, + "step": 563 + }, + { + "epoch": 0.031041884528592658, + "grad_norm": 0.917783796787262, + "learning_rate": 9.995955277677989e-06, + "loss": 0.8186, + "step": 564 + }, + { + "epoch": 0.03109692333094832, + "grad_norm": 1.0231432914733887, + "learning_rate": 9.995937827106863e-06, + "loss": 0.8624, + "step": 565 + }, + { + "epoch": 0.03115196213330398, + "grad_norm": 0.9552083015441895, + "learning_rate": 9.995920338987605e-06, + "loss": 0.7967, + "step": 566 + }, + { + "epoch": 0.03120700093565964, + "grad_norm": 0.9441083669662476, + "learning_rate": 9.995902813320349e-06, + "loss": 0.8471, + "step": 567 + }, + { + "epoch": 0.0312620397380153, + "grad_norm": 1.0025299787521362, + "learning_rate": 9.995885250105223e-06, + "loss": 0.8646, + "step": 568 + }, + { + "epoch": 0.03131707854037096, + "grad_norm": 0.8997280597686768, + "learning_rate": 9.99586764934236e-06, + "loss": 0.8736, + "step": 569 + }, + { + "epoch": 0.03137211734272662, + "grad_norm": 0.9090663194656372, + "learning_rate": 9.995850011031896e-06, + "loss": 0.8548, + "step": 570 + }, + { + "epoch": 0.031427156145082284, + "grad_norm": 0.9641294479370117, + "learning_rate": 9.995832335173959e-06, + "loss": 0.8667, + "step": 571 + }, + { + "epoch": 0.031482194947437946, + "grad_norm": 0.9165804982185364, + "learning_rate": 9.995814621768682e-06, + "loss": 0.803, + "step": 572 + }, + { + "epoch": 0.03153723374979361, + "grad_norm": 0.9672492742538452, + "learning_rate": 9.995796870816202e-06, + "loss": 0.8335, + "step": 573 + }, + { + "epoch": 0.03159227255214927, + "grad_norm": 0.9359404444694519, + "learning_rate": 9.995779082316648e-06, + "loss": 0.8294, + "step": 574 + }, + { + "epoch": 0.03164731135450492, + "grad_norm": 0.926925003528595, + "learning_rate": 9.995761256270157e-06, + "loss": 0.7714, + "step": 575 + }, + { + "epoch": 0.031702350156860584, + "grad_norm": 1.1848629713058472, + "learning_rate": 9.995743392676862e-06, + "loss": 0.8925, + "step": 576 + }, + { + "epoch": 0.031757388959216246, + "grad_norm": 0.9624786972999573, + "learning_rate": 9.995725491536897e-06, + "loss": 0.9292, + "step": 577 + }, + { + "epoch": 0.03181242776157191, + "grad_norm": 0.9479736089706421, + "learning_rate": 9.995707552850396e-06, + "loss": 0.8797, + "step": 578 + }, + { + "epoch": 0.03186746656392757, + "grad_norm": 0.9551546573638916, + "learning_rate": 9.995689576617494e-06, + "loss": 0.8793, + "step": 579 + }, + { + "epoch": 0.03192250536628323, + "grad_norm": 0.9210056662559509, + "learning_rate": 9.995671562838325e-06, + "loss": 0.9714, + "step": 580 + }, + { + "epoch": 0.03197754416863889, + "grad_norm": 1.063117504119873, + "learning_rate": 9.995653511513029e-06, + "loss": 0.9608, + "step": 581 + }, + { + "epoch": 0.03203258297099455, + "grad_norm": 0.9426459670066833, + "learning_rate": 9.995635422641736e-06, + "loss": 0.9102, + "step": 582 + }, + { + "epoch": 0.032087621773350214, + "grad_norm": 1.0176693201065063, + "learning_rate": 9.995617296224584e-06, + "loss": 0.9109, + "step": 583 + }, + { + "epoch": 0.032142660575705875, + "grad_norm": 0.9457042217254639, + "learning_rate": 9.995599132261711e-06, + "loss": 0.9017, + "step": 584 + }, + { + "epoch": 0.03219769937806154, + "grad_norm": 1.5851638317108154, + "learning_rate": 9.995580930753252e-06, + "loss": 0.967, + "step": 585 + }, + { + "epoch": 0.03225273818041719, + "grad_norm": 0.9961487054824829, + "learning_rate": 9.995562691699345e-06, + "loss": 0.9396, + "step": 586 + }, + { + "epoch": 0.03230777698277285, + "grad_norm": 0.9892112016677856, + "learning_rate": 9.995544415100125e-06, + "loss": 0.9058, + "step": 587 + }, + { + "epoch": 0.032362815785128514, + "grad_norm": 0.9052272439002991, + "learning_rate": 9.99552610095573e-06, + "loss": 0.9194, + "step": 588 + }, + { + "epoch": 0.032417854587484175, + "grad_norm": 0.8381399512290955, + "learning_rate": 9.995507749266297e-06, + "loss": 0.7465, + "step": 589 + }, + { + "epoch": 0.03247289338983984, + "grad_norm": 1.018964171409607, + "learning_rate": 9.995489360031969e-06, + "loss": 0.841, + "step": 590 + }, + { + "epoch": 0.0325279321921955, + "grad_norm": 0.908311128616333, + "learning_rate": 9.995470933252876e-06, + "loss": 0.8592, + "step": 591 + }, + { + "epoch": 0.03258297099455116, + "grad_norm": 1.2986040115356445, + "learning_rate": 9.995452468929162e-06, + "loss": 0.8341, + "step": 592 + }, + { + "epoch": 0.03263800979690682, + "grad_norm": 1.6565190553665161, + "learning_rate": 9.995433967060966e-06, + "loss": 0.8681, + "step": 593 + }, + { + "epoch": 0.03269304859926248, + "grad_norm": 0.9725674390792847, + "learning_rate": 9.995415427648423e-06, + "loss": 0.8449, + "step": 594 + }, + { + "epoch": 0.032748087401618144, + "grad_norm": 0.8683852553367615, + "learning_rate": 9.995396850691677e-06, + "loss": 0.8478, + "step": 595 + }, + { + "epoch": 0.0328031262039738, + "grad_norm": 0.9912856817245483, + "learning_rate": 9.995378236190862e-06, + "loss": 0.8912, + "step": 596 + }, + { + "epoch": 0.03285816500632946, + "grad_norm": 0.9396800398826599, + "learning_rate": 9.995359584146125e-06, + "loss": 0.856, + "step": 597 + }, + { + "epoch": 0.03291320380868512, + "grad_norm": 1.385006308555603, + "learning_rate": 9.995340894557601e-06, + "loss": 0.9633, + "step": 598 + }, + { + "epoch": 0.03296824261104078, + "grad_norm": 0.8982875943183899, + "learning_rate": 9.995322167425433e-06, + "loss": 0.9244, + "step": 599 + }, + { + "epoch": 0.033023281413396444, + "grad_norm": 0.8981022834777832, + "learning_rate": 9.995303402749759e-06, + "loss": 0.8854, + "step": 600 + }, + { + "epoch": 0.033078320215752105, + "grad_norm": 0.9917197227478027, + "learning_rate": 9.995284600530724e-06, + "loss": 1.0086, + "step": 601 + }, + { + "epoch": 0.033133359018107766, + "grad_norm": 1.0540626049041748, + "learning_rate": 9.995265760768464e-06, + "loss": 1.0022, + "step": 602 + }, + { + "epoch": 0.03318839782046343, + "grad_norm": 0.9523479342460632, + "learning_rate": 9.995246883463126e-06, + "loss": 0.9893, + "step": 603 + }, + { + "epoch": 0.03324343662281909, + "grad_norm": 0.9824770092964172, + "learning_rate": 9.99522796861485e-06, + "loss": 0.8385, + "step": 604 + }, + { + "epoch": 0.03329847542517475, + "grad_norm": 1.0968893766403198, + "learning_rate": 9.995209016223776e-06, + "loss": 1.0109, + "step": 605 + }, + { + "epoch": 0.03335351422753041, + "grad_norm": 0.9115625023841858, + "learning_rate": 9.995190026290049e-06, + "loss": 0.8656, + "step": 606 + }, + { + "epoch": 0.033408553029886066, + "grad_norm": 0.9795814156532288, + "learning_rate": 9.99517099881381e-06, + "loss": 0.8941, + "step": 607 + }, + { + "epoch": 0.03346359183224173, + "grad_norm": 0.9317291378974915, + "learning_rate": 9.995151933795204e-06, + "loss": 0.7819, + "step": 608 + }, + { + "epoch": 0.03351863063459739, + "grad_norm": 0.9936283230781555, + "learning_rate": 9.995132831234373e-06, + "loss": 0.8674, + "step": 609 + }, + { + "epoch": 0.03357366943695305, + "grad_norm": 0.9872812032699585, + "learning_rate": 9.995113691131462e-06, + "loss": 0.9038, + "step": 610 + }, + { + "epoch": 0.03362870823930871, + "grad_norm": 0.9516895413398743, + "learning_rate": 9.995094513486611e-06, + "loss": 0.9038, + "step": 611 + }, + { + "epoch": 0.03368374704166437, + "grad_norm": 1.090579867362976, + "learning_rate": 9.995075298299968e-06, + "loss": 0.9587, + "step": 612 + }, + { + "epoch": 0.033738785844020035, + "grad_norm": 1.021398663520813, + "learning_rate": 9.995056045571677e-06, + "loss": 0.9569, + "step": 613 + }, + { + "epoch": 0.033793824646375696, + "grad_norm": 1.009657382965088, + "learning_rate": 9.99503675530188e-06, + "loss": 0.8346, + "step": 614 + }, + { + "epoch": 0.03384886344873136, + "grad_norm": 1.0478712320327759, + "learning_rate": 9.995017427490725e-06, + "loss": 1.0566, + "step": 615 + }, + { + "epoch": 0.03390390225108702, + "grad_norm": 1.1391830444335938, + "learning_rate": 9.994998062138355e-06, + "loss": 1.0727, + "step": 616 + }, + { + "epoch": 0.03395894105344268, + "grad_norm": 1.0172302722930908, + "learning_rate": 9.994978659244918e-06, + "loss": 0.7869, + "step": 617 + }, + { + "epoch": 0.034013979855798335, + "grad_norm": 1.0532630681991577, + "learning_rate": 9.994959218810558e-06, + "loss": 0.8626, + "step": 618 + }, + { + "epoch": 0.034069018658153996, + "grad_norm": 0.8300478458404541, + "learning_rate": 9.99493974083542e-06, + "loss": 0.8166, + "step": 619 + }, + { + "epoch": 0.03412405746050966, + "grad_norm": 1.0613664388656616, + "learning_rate": 9.994920225319656e-06, + "loss": 0.8899, + "step": 620 + }, + { + "epoch": 0.03417909626286532, + "grad_norm": 0.9827042818069458, + "learning_rate": 9.994900672263406e-06, + "loss": 0.8243, + "step": 621 + }, + { + "epoch": 0.03423413506522098, + "grad_norm": 0.8790082931518555, + "learning_rate": 9.994881081666818e-06, + "loss": 0.8153, + "step": 622 + }, + { + "epoch": 0.03428917386757664, + "grad_norm": 1.033378005027771, + "learning_rate": 9.994861453530044e-06, + "loss": 0.8916, + "step": 623 + }, + { + "epoch": 0.0343442126699323, + "grad_norm": 0.9547238349914551, + "learning_rate": 9.994841787853227e-06, + "loss": 0.9141, + "step": 624 + }, + { + "epoch": 0.034399251472287964, + "grad_norm": 0.9606438279151917, + "learning_rate": 9.994822084636514e-06, + "loss": 0.9435, + "step": 625 + }, + { + "epoch": 0.034454290274643626, + "grad_norm": 0.8461503982543945, + "learning_rate": 9.994802343880059e-06, + "loss": 0.7914, + "step": 626 + }, + { + "epoch": 0.03450932907699929, + "grad_norm": 1.144538402557373, + "learning_rate": 9.994782565584004e-06, + "loss": 0.8025, + "step": 627 + }, + { + "epoch": 0.03456436787935495, + "grad_norm": 1.0099962949752808, + "learning_rate": 9.994762749748502e-06, + "loss": 0.9607, + "step": 628 + }, + { + "epoch": 0.0346194066817106, + "grad_norm": 0.9822041988372803, + "learning_rate": 9.9947428963737e-06, + "loss": 0.9216, + "step": 629 + }, + { + "epoch": 0.034674445484066264, + "grad_norm": 0.9056866765022278, + "learning_rate": 9.994723005459746e-06, + "loss": 0.7913, + "step": 630 + }, + { + "epoch": 0.034729484286421926, + "grad_norm": 1.0099287033081055, + "learning_rate": 9.994703077006792e-06, + "loss": 0.9937, + "step": 631 + }, + { + "epoch": 0.03478452308877759, + "grad_norm": 0.9559167623519897, + "learning_rate": 9.994683111014984e-06, + "loss": 0.9774, + "step": 632 + }, + { + "epoch": 0.03483956189113325, + "grad_norm": 1.0359059572219849, + "learning_rate": 9.994663107484478e-06, + "loss": 0.9062, + "step": 633 + }, + { + "epoch": 0.03489460069348891, + "grad_norm": 0.8803057074546814, + "learning_rate": 9.99464306641542e-06, + "loss": 0.9638, + "step": 634 + }, + { + "epoch": 0.03494963949584457, + "grad_norm": 1.0926579236984253, + "learning_rate": 9.994622987807962e-06, + "loss": 1.0467, + "step": 635 + }, + { + "epoch": 0.03500467829820023, + "grad_norm": 1.0051401853561401, + "learning_rate": 9.994602871662253e-06, + "loss": 0.8717, + "step": 636 + }, + { + "epoch": 0.035059717100555894, + "grad_norm": 1.2007508277893066, + "learning_rate": 9.994582717978448e-06, + "loss": 0.8004, + "step": 637 + }, + { + "epoch": 0.035114755902911556, + "grad_norm": 0.8826266527175903, + "learning_rate": 9.994562526756695e-06, + "loss": 0.8888, + "step": 638 + }, + { + "epoch": 0.03516979470526721, + "grad_norm": 0.9953717589378357, + "learning_rate": 9.994542297997147e-06, + "loss": 0.8999, + "step": 639 + }, + { + "epoch": 0.03522483350762287, + "grad_norm": 1.0203614234924316, + "learning_rate": 9.994522031699958e-06, + "loss": 0.8241, + "step": 640 + }, + { + "epoch": 0.03527987230997853, + "grad_norm": 0.8760203719139099, + "learning_rate": 9.994501727865276e-06, + "loss": 0.7893, + "step": 641 + }, + { + "epoch": 0.035334911112334194, + "grad_norm": 1.024888277053833, + "learning_rate": 9.994481386493257e-06, + "loss": 0.9865, + "step": 642 + }, + { + "epoch": 0.035389949914689856, + "grad_norm": 0.907454788684845, + "learning_rate": 9.994461007584052e-06, + "loss": 0.891, + "step": 643 + }, + { + "epoch": 0.03544498871704552, + "grad_norm": 1.0400965213775635, + "learning_rate": 9.994440591137816e-06, + "loss": 0.9345, + "step": 644 + }, + { + "epoch": 0.03550002751940118, + "grad_norm": 0.9816616177558899, + "learning_rate": 9.9944201371547e-06, + "loss": 0.91, + "step": 645 + }, + { + "epoch": 0.03555506632175684, + "grad_norm": 1.0528117418289185, + "learning_rate": 9.99439964563486e-06, + "loss": 0.952, + "step": 646 + }, + { + "epoch": 0.0356101051241125, + "grad_norm": 0.9802080988883972, + "learning_rate": 9.99437911657845e-06, + "loss": 0.9392, + "step": 647 + }, + { + "epoch": 0.03566514392646816, + "grad_norm": 0.9580393433570862, + "learning_rate": 9.994358549985623e-06, + "loss": 0.874, + "step": 648 + }, + { + "epoch": 0.035720182728823824, + "grad_norm": 0.8935576677322388, + "learning_rate": 9.994337945856533e-06, + "loss": 0.8435, + "step": 649 + }, + { + "epoch": 0.03577522153117948, + "grad_norm": 1.009699821472168, + "learning_rate": 9.994317304191337e-06, + "loss": 0.9436, + "step": 650 + }, + { + "epoch": 0.03583026033353514, + "grad_norm": 0.9126121401786804, + "learning_rate": 9.994296624990188e-06, + "loss": 0.8424, + "step": 651 + }, + { + "epoch": 0.0358852991358908, + "grad_norm": 0.9555553197860718, + "learning_rate": 9.994275908253243e-06, + "loss": 0.93, + "step": 652 + }, + { + "epoch": 0.03594033793824646, + "grad_norm": 0.8359857797622681, + "learning_rate": 9.994255153980658e-06, + "loss": 0.6326, + "step": 653 + }, + { + "epoch": 0.035995376740602124, + "grad_norm": 0.8918783664703369, + "learning_rate": 9.994234362172587e-06, + "loss": 0.8287, + "step": 654 + }, + { + "epoch": 0.036050415542957785, + "grad_norm": 0.9878549575805664, + "learning_rate": 9.994213532829188e-06, + "loss": 0.8841, + "step": 655 + }, + { + "epoch": 0.03610545434531345, + "grad_norm": 0.9504040479660034, + "learning_rate": 9.994192665950617e-06, + "loss": 1.0182, + "step": 656 + }, + { + "epoch": 0.03616049314766911, + "grad_norm": 0.9531422257423401, + "learning_rate": 9.99417176153703e-06, + "loss": 0.8504, + "step": 657 + }, + { + "epoch": 0.03621553195002477, + "grad_norm": 0.9580292105674744, + "learning_rate": 9.994150819588587e-06, + "loss": 0.8048, + "step": 658 + }, + { + "epoch": 0.03627057075238043, + "grad_norm": 0.9786819815635681, + "learning_rate": 9.99412984010544e-06, + "loss": 0.9124, + "step": 659 + }, + { + "epoch": 0.03632560955473609, + "grad_norm": 0.9733422994613647, + "learning_rate": 9.994108823087751e-06, + "loss": 0.8868, + "step": 660 + }, + { + "epoch": 0.03638064835709175, + "grad_norm": 1.093173623085022, + "learning_rate": 9.994087768535679e-06, + "loss": 0.9428, + "step": 661 + }, + { + "epoch": 0.03643568715944741, + "grad_norm": 0.9067148566246033, + "learning_rate": 9.994066676449378e-06, + "loss": 0.8838, + "step": 662 + }, + { + "epoch": 0.03649072596180307, + "grad_norm": 0.9509521722793579, + "learning_rate": 9.99404554682901e-06, + "loss": 0.9034, + "step": 663 + }, + { + "epoch": 0.03654576476415873, + "grad_norm": 0.9523824453353882, + "learning_rate": 9.994024379674731e-06, + "loss": 0.9623, + "step": 664 + }, + { + "epoch": 0.03660080356651439, + "grad_norm": 0.987276554107666, + "learning_rate": 9.994003174986703e-06, + "loss": 0.8817, + "step": 665 + }, + { + "epoch": 0.036655842368870054, + "grad_norm": 0.9500744342803955, + "learning_rate": 9.993981932765083e-06, + "loss": 0.9742, + "step": 666 + }, + { + "epoch": 0.036710881171225715, + "grad_norm": 0.9420705437660217, + "learning_rate": 9.993960653010034e-06, + "loss": 0.9657, + "step": 667 + }, + { + "epoch": 0.036765919973581376, + "grad_norm": 0.9443248510360718, + "learning_rate": 9.99393933572171e-06, + "loss": 0.8468, + "step": 668 + }, + { + "epoch": 0.03682095877593704, + "grad_norm": 0.9666558504104614, + "learning_rate": 9.993917980900276e-06, + "loss": 0.9871, + "step": 669 + }, + { + "epoch": 0.0368759975782927, + "grad_norm": 1.0236201286315918, + "learning_rate": 9.993896588545892e-06, + "loss": 0.9814, + "step": 670 + }, + { + "epoch": 0.03693103638064836, + "grad_norm": 1.016190528869629, + "learning_rate": 9.993875158658716e-06, + "loss": 1.0156, + "step": 671 + }, + { + "epoch": 0.036986075183004015, + "grad_norm": 0.9296661019325256, + "learning_rate": 9.993853691238913e-06, + "loss": 0.7956, + "step": 672 + }, + { + "epoch": 0.037041113985359676, + "grad_norm": 0.9276684522628784, + "learning_rate": 9.993832186286643e-06, + "loss": 0.9253, + "step": 673 + }, + { + "epoch": 0.03709615278771534, + "grad_norm": 0.8588787913322449, + "learning_rate": 9.993810643802065e-06, + "loss": 0.7878, + "step": 674 + }, + { + "epoch": 0.037151191590071, + "grad_norm": 0.9955212473869324, + "learning_rate": 9.993789063785344e-06, + "loss": 0.8711, + "step": 675 + }, + { + "epoch": 0.03720623039242666, + "grad_norm": 0.925578236579895, + "learning_rate": 9.993767446236642e-06, + "loss": 0.9431, + "step": 676 + }, + { + "epoch": 0.03726126919478232, + "grad_norm": 0.9610552787780762, + "learning_rate": 9.99374579115612e-06, + "loss": 0.887, + "step": 677 + }, + { + "epoch": 0.03731630799713798, + "grad_norm": 1.0052428245544434, + "learning_rate": 9.99372409854394e-06, + "loss": 0.8751, + "step": 678 + }, + { + "epoch": 0.037371346799493645, + "grad_norm": 0.9503066539764404, + "learning_rate": 9.99370236840027e-06, + "loss": 0.8556, + "step": 679 + }, + { + "epoch": 0.037426385601849306, + "grad_norm": 2.426232099533081, + "learning_rate": 9.993680600725266e-06, + "loss": 0.9077, + "step": 680 + }, + { + "epoch": 0.03748142440420497, + "grad_norm": 0.9119723439216614, + "learning_rate": 9.993658795519096e-06, + "loss": 0.8575, + "step": 681 + }, + { + "epoch": 0.03753646320656062, + "grad_norm": 0.9688286781311035, + "learning_rate": 9.993636952781923e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 0.03759150200891628, + "grad_norm": 1.030013084411621, + "learning_rate": 9.993615072513913e-06, + "loss": 0.8622, + "step": 683 + }, + { + "epoch": 0.037646540811271945, + "grad_norm": 1.055187463760376, + "learning_rate": 9.993593154715228e-06, + "loss": 0.9251, + "step": 684 + }, + { + "epoch": 0.037701579613627606, + "grad_norm": 1.0518591403961182, + "learning_rate": 9.993571199386032e-06, + "loss": 0.9575, + "step": 685 + }, + { + "epoch": 0.03775661841598327, + "grad_norm": 0.9232666492462158, + "learning_rate": 9.993549206526495e-06, + "loss": 0.8522, + "step": 686 + }, + { + "epoch": 0.03781165721833893, + "grad_norm": 1.0212332010269165, + "learning_rate": 9.993527176136775e-06, + "loss": 0.9358, + "step": 687 + }, + { + "epoch": 0.03786669602069459, + "grad_norm": 0.9137141108512878, + "learning_rate": 9.993505108217045e-06, + "loss": 0.8561, + "step": 688 + }, + { + "epoch": 0.03792173482305025, + "grad_norm": 1.0069375038146973, + "learning_rate": 9.993483002767465e-06, + "loss": 0.8274, + "step": 689 + }, + { + "epoch": 0.03797677362540591, + "grad_norm": 0.9820672869682312, + "learning_rate": 9.993460859788204e-06, + "loss": 0.907, + "step": 690 + }, + { + "epoch": 0.038031812427761574, + "grad_norm": 1.0042002201080322, + "learning_rate": 9.993438679279428e-06, + "loss": 0.9263, + "step": 691 + }, + { + "epoch": 0.038086851230117236, + "grad_norm": 0.9733695983886719, + "learning_rate": 9.993416461241304e-06, + "loss": 0.8455, + "step": 692 + }, + { + "epoch": 0.03814189003247289, + "grad_norm": 0.9106015563011169, + "learning_rate": 9.993394205673996e-06, + "loss": 0.8469, + "step": 693 + }, + { + "epoch": 0.03819692883482855, + "grad_norm": 0.9802660346031189, + "learning_rate": 9.993371912577677e-06, + "loss": 0.8662, + "step": 694 + }, + { + "epoch": 0.03825196763718421, + "grad_norm": 0.9183964729309082, + "learning_rate": 9.99334958195251e-06, + "loss": 0.8968, + "step": 695 + }, + { + "epoch": 0.038307006439539874, + "grad_norm": 0.9572185277938843, + "learning_rate": 9.993327213798663e-06, + "loss": 0.953, + "step": 696 + }, + { + "epoch": 0.038362045241895536, + "grad_norm": 1.4480071067810059, + "learning_rate": 9.993304808116307e-06, + "loss": 1.1131, + "step": 697 + }, + { + "epoch": 0.0384170840442512, + "grad_norm": 0.9297361969947815, + "learning_rate": 9.993282364905607e-06, + "loss": 0.884, + "step": 698 + }, + { + "epoch": 0.03847212284660686, + "grad_norm": 0.9400073885917664, + "learning_rate": 9.993259884166735e-06, + "loss": 0.932, + "step": 699 + }, + { + "epoch": 0.03852716164896252, + "grad_norm": 0.9231798052787781, + "learning_rate": 9.993237365899858e-06, + "loss": 0.8981, + "step": 700 + }, + { + "epoch": 0.03858220045131818, + "grad_norm": 0.8233712911605835, + "learning_rate": 9.993214810105144e-06, + "loss": 0.8218, + "step": 701 + }, + { + "epoch": 0.03863723925367384, + "grad_norm": 1.0997854471206665, + "learning_rate": 9.993192216782768e-06, + "loss": 0.9298, + "step": 702 + }, + { + "epoch": 0.038692278056029504, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.993169585932893e-06, + "loss": 0.7815, + "step": 703 + }, + { + "epoch": 0.03874731685838516, + "grad_norm": 0.9913730025291443, + "learning_rate": 9.993146917555692e-06, + "loss": 0.9621, + "step": 704 + }, + { + "epoch": 0.03880235566074082, + "grad_norm": 1.088767409324646, + "learning_rate": 9.993124211651334e-06, + "loss": 0.9295, + "step": 705 + }, + { + "epoch": 0.03885739446309648, + "grad_norm": 0.8199124336242676, + "learning_rate": 9.993101468219995e-06, + "loss": 0.7613, + "step": 706 + }, + { + "epoch": 0.03891243326545214, + "grad_norm": 1.112566351890564, + "learning_rate": 9.99307868726184e-06, + "loss": 0.791, + "step": 707 + }, + { + "epoch": 0.038967472067807804, + "grad_norm": 0.9372578859329224, + "learning_rate": 9.99305586877704e-06, + "loss": 0.8567, + "step": 708 + }, + { + "epoch": 0.039022510870163465, + "grad_norm": 1.0167721509933472, + "learning_rate": 9.99303301276577e-06, + "loss": 0.9787, + "step": 709 + }, + { + "epoch": 0.03907754967251913, + "grad_norm": 1.3526856899261475, + "learning_rate": 9.993010119228202e-06, + "loss": 1.2215, + "step": 710 + }, + { + "epoch": 0.03913258847487479, + "grad_norm": 0.8819016814231873, + "learning_rate": 9.992987188164505e-06, + "loss": 0.7736, + "step": 711 + }, + { + "epoch": 0.03918762727723045, + "grad_norm": 1.0033677816390991, + "learning_rate": 9.992964219574852e-06, + "loss": 0.9919, + "step": 712 + }, + { + "epoch": 0.03924266607958611, + "grad_norm": 0.894926130771637, + "learning_rate": 9.992941213459417e-06, + "loss": 0.9058, + "step": 713 + }, + { + "epoch": 0.03929770488194177, + "grad_norm": 0.9481377005577087, + "learning_rate": 9.992918169818373e-06, + "loss": 0.8436, + "step": 714 + }, + { + "epoch": 0.03935274368429743, + "grad_norm": 0.9312933087348938, + "learning_rate": 9.992895088651893e-06, + "loss": 0.8869, + "step": 715 + }, + { + "epoch": 0.03940778248665309, + "grad_norm": 0.9765705466270447, + "learning_rate": 9.99287196996015e-06, + "loss": 0.9512, + "step": 716 + }, + { + "epoch": 0.03946282128900875, + "grad_norm": 0.9610235691070557, + "learning_rate": 9.992848813743317e-06, + "loss": 0.8005, + "step": 717 + }, + { + "epoch": 0.03951786009136441, + "grad_norm": 1.102995753288269, + "learning_rate": 9.99282562000157e-06, + "loss": 0.8017, + "step": 718 + }, + { + "epoch": 0.03957289889372007, + "grad_norm": 1.023317575454712, + "learning_rate": 9.99280238873508e-06, + "loss": 0.911, + "step": 719 + }, + { + "epoch": 0.039627937696075734, + "grad_norm": 1.0531049966812134, + "learning_rate": 9.992779119944025e-06, + "loss": 0.8562, + "step": 720 + }, + { + "epoch": 0.039682976498431395, + "grad_norm": 0.918250322341919, + "learning_rate": 9.992755813628579e-06, + "loss": 0.92, + "step": 721 + }, + { + "epoch": 0.039738015300787057, + "grad_norm": 0.8508251309394836, + "learning_rate": 9.992732469788915e-06, + "loss": 0.7347, + "step": 722 + }, + { + "epoch": 0.03979305410314272, + "grad_norm": 0.9184926152229309, + "learning_rate": 9.992709088425211e-06, + "loss": 0.8732, + "step": 723 + }, + { + "epoch": 0.03984809290549838, + "grad_norm": 1.1613929271697998, + "learning_rate": 9.992685669537643e-06, + "loss": 0.9522, + "step": 724 + }, + { + "epoch": 0.039903131707854034, + "grad_norm": 1.091513752937317, + "learning_rate": 9.992662213126386e-06, + "loss": 0.9646, + "step": 725 + }, + { + "epoch": 0.039958170510209695, + "grad_norm": 1.057803750038147, + "learning_rate": 9.992638719191615e-06, + "loss": 0.7032, + "step": 726 + }, + { + "epoch": 0.040013209312565357, + "grad_norm": 0.8771823644638062, + "learning_rate": 9.992615187733508e-06, + "loss": 0.8577, + "step": 727 + }, + { + "epoch": 0.04006824811492102, + "grad_norm": 0.9471028447151184, + "learning_rate": 9.992591618752244e-06, + "loss": 0.9057, + "step": 728 + }, + { + "epoch": 0.04012328691727668, + "grad_norm": 0.9547705054283142, + "learning_rate": 9.992568012247995e-06, + "loss": 0.9549, + "step": 729 + }, + { + "epoch": 0.04017832571963234, + "grad_norm": 0.8862974047660828, + "learning_rate": 9.992544368220941e-06, + "loss": 0.8593, + "step": 730 + }, + { + "epoch": 0.040233364521988, + "grad_norm": 0.906334400177002, + "learning_rate": 9.992520686671261e-06, + "loss": 0.8832, + "step": 731 + }, + { + "epoch": 0.04028840332434366, + "grad_norm": 1.07270085811615, + "learning_rate": 9.992496967599133e-06, + "loss": 0.9409, + "step": 732 + }, + { + "epoch": 0.040343442126699325, + "grad_norm": 0.9026005268096924, + "learning_rate": 9.992473211004734e-06, + "loss": 0.8326, + "step": 733 + }, + { + "epoch": 0.040398480929054986, + "grad_norm": 0.9762942790985107, + "learning_rate": 9.992449416888241e-06, + "loss": 0.9048, + "step": 734 + }, + { + "epoch": 0.04045351973141065, + "grad_norm": 0.9658033847808838, + "learning_rate": 9.992425585249837e-06, + "loss": 0.9219, + "step": 735 + }, + { + "epoch": 0.0405085585337663, + "grad_norm": 0.8909044861793518, + "learning_rate": 9.992401716089698e-06, + "loss": 0.8564, + "step": 736 + }, + { + "epoch": 0.04056359733612196, + "grad_norm": 1.0387929677963257, + "learning_rate": 9.992377809408001e-06, + "loss": 0.9533, + "step": 737 + }, + { + "epoch": 0.040618636138477625, + "grad_norm": 0.9044275879859924, + "learning_rate": 9.99235386520493e-06, + "loss": 0.8508, + "step": 738 + }, + { + "epoch": 0.040673674940833286, + "grad_norm": 1.019377589225769, + "learning_rate": 9.992329883480667e-06, + "loss": 0.8684, + "step": 739 + }, + { + "epoch": 0.04072871374318895, + "grad_norm": 0.9394627213478088, + "learning_rate": 9.992305864235385e-06, + "loss": 0.7665, + "step": 740 + }, + { + "epoch": 0.04078375254554461, + "grad_norm": 0.8652323484420776, + "learning_rate": 9.99228180746927e-06, + "loss": 0.8576, + "step": 741 + }, + { + "epoch": 0.04083879134790027, + "grad_norm": 0.9347619414329529, + "learning_rate": 9.992257713182502e-06, + "loss": 0.9586, + "step": 742 + }, + { + "epoch": 0.04089383015025593, + "grad_norm": 0.9510203003883362, + "learning_rate": 9.99223358137526e-06, + "loss": 0.9092, + "step": 743 + }, + { + "epoch": 0.04094886895261159, + "grad_norm": 0.8242866396903992, + "learning_rate": 9.992209412047729e-06, + "loss": 0.6997, + "step": 744 + }, + { + "epoch": 0.041003907754967255, + "grad_norm": 0.8842730522155762, + "learning_rate": 9.992185205200087e-06, + "loss": 0.8873, + "step": 745 + }, + { + "epoch": 0.041058946557322916, + "grad_norm": 1.0813730955123901, + "learning_rate": 9.992160960832518e-06, + "loss": 1.0162, + "step": 746 + }, + { + "epoch": 0.04111398535967857, + "grad_norm": 1.1276283264160156, + "learning_rate": 9.9921366789452e-06, + "loss": 1.0004, + "step": 747 + }, + { + "epoch": 0.04116902416203423, + "grad_norm": 0.8810326457023621, + "learning_rate": 9.992112359538323e-06, + "loss": 0.7823, + "step": 748 + }, + { + "epoch": 0.04122406296438989, + "grad_norm": 0.9939407110214233, + "learning_rate": 9.992088002612066e-06, + "loss": 1.0016, + "step": 749 + }, + { + "epoch": 0.041279101766745555, + "grad_norm": 1.0963523387908936, + "learning_rate": 9.99206360816661e-06, + "loss": 0.9252, + "step": 750 + }, + { + "epoch": 0.041334140569101216, + "grad_norm": 1.1346478462219238, + "learning_rate": 9.99203917620214e-06, + "loss": 0.9608, + "step": 751 + }, + { + "epoch": 0.04138917937145688, + "grad_norm": 1.0108580589294434, + "learning_rate": 9.992014706718841e-06, + "loss": 0.9179, + "step": 752 + }, + { + "epoch": 0.04144421817381254, + "grad_norm": 0.897293210029602, + "learning_rate": 9.991990199716894e-06, + "loss": 0.9295, + "step": 753 + }, + { + "epoch": 0.0414992569761682, + "grad_norm": 1.0152363777160645, + "learning_rate": 9.991965655196488e-06, + "loss": 0.8467, + "step": 754 + }, + { + "epoch": 0.04155429577852386, + "grad_norm": 0.8655388355255127, + "learning_rate": 9.9919410731578e-06, + "loss": 0.796, + "step": 755 + }, + { + "epoch": 0.04160933458087952, + "grad_norm": 1.0140331983566284, + "learning_rate": 9.991916453601023e-06, + "loss": 0.8444, + "step": 756 + }, + { + "epoch": 0.041664373383235184, + "grad_norm": 0.9387341141700745, + "learning_rate": 9.991891796526338e-06, + "loss": 0.8669, + "step": 757 + }, + { + "epoch": 0.04171941218559084, + "grad_norm": 0.9395696520805359, + "learning_rate": 9.991867101933928e-06, + "loss": 0.8376, + "step": 758 + }, + { + "epoch": 0.0417744509879465, + "grad_norm": 1.0856634378433228, + "learning_rate": 9.991842369823983e-06, + "loss": 0.9271, + "step": 759 + }, + { + "epoch": 0.04182948979030216, + "grad_norm": 0.8777190446853638, + "learning_rate": 9.991817600196687e-06, + "loss": 0.9197, + "step": 760 + }, + { + "epoch": 0.04188452859265782, + "grad_norm": 0.9639917016029358, + "learning_rate": 9.991792793052225e-06, + "loss": 0.8835, + "step": 761 + }, + { + "epoch": 0.041939567395013484, + "grad_norm": 0.9384773969650269, + "learning_rate": 9.991767948390785e-06, + "loss": 0.8403, + "step": 762 + }, + { + "epoch": 0.041994606197369146, + "grad_norm": 0.8987650275230408, + "learning_rate": 9.991743066212554e-06, + "loss": 0.7948, + "step": 763 + }, + { + "epoch": 0.04204964499972481, + "grad_norm": 1.0545049905776978, + "learning_rate": 9.991718146517717e-06, + "loss": 0.9359, + "step": 764 + }, + { + "epoch": 0.04210468380208047, + "grad_norm": 0.9840022325515747, + "learning_rate": 9.991693189306463e-06, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.04215972260443613, + "grad_norm": 0.8769927620887756, + "learning_rate": 9.991668194578981e-06, + "loss": 0.8647, + "step": 766 + }, + { + "epoch": 0.04221476140679179, + "grad_norm": 0.9268791675567627, + "learning_rate": 9.991643162335455e-06, + "loss": 0.897, + "step": 767 + }, + { + "epoch": 0.042269800209147446, + "grad_norm": 0.9316747784614563, + "learning_rate": 9.991618092576075e-06, + "loss": 0.9341, + "step": 768 + }, + { + "epoch": 0.04232483901150311, + "grad_norm": 0.8348364233970642, + "learning_rate": 9.991592985301031e-06, + "loss": 0.7528, + "step": 769 + }, + { + "epoch": 0.04237987781385877, + "grad_norm": 0.9139068126678467, + "learning_rate": 9.99156784051051e-06, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 0.04243491661621443, + "grad_norm": 0.9403928518295288, + "learning_rate": 9.991542658204701e-06, + "loss": 0.974, + "step": 771 + }, + { + "epoch": 0.04248995541857009, + "grad_norm": 0.993549108505249, + "learning_rate": 9.991517438383793e-06, + "loss": 0.9479, + "step": 772 + }, + { + "epoch": 0.04254499422092575, + "grad_norm": 0.8494916558265686, + "learning_rate": 9.991492181047975e-06, + "loss": 0.9149, + "step": 773 + }, + { + "epoch": 0.042600033023281414, + "grad_norm": 1.0351910591125488, + "learning_rate": 9.991466886197441e-06, + "loss": 0.9552, + "step": 774 + }, + { + "epoch": 0.042655071825637075, + "grad_norm": 0.916829526424408, + "learning_rate": 9.991441553832375e-06, + "loss": 0.8781, + "step": 775 + }, + { + "epoch": 0.04271011062799274, + "grad_norm": 1.113476276397705, + "learning_rate": 9.991416183952972e-06, + "loss": 0.8137, + "step": 776 + }, + { + "epoch": 0.0427651494303484, + "grad_norm": 1.1608171463012695, + "learning_rate": 9.991390776559421e-06, + "loss": 1.0045, + "step": 777 + }, + { + "epoch": 0.04282018823270406, + "grad_norm": 1.0045493841171265, + "learning_rate": 9.991365331651913e-06, + "loss": 0.8813, + "step": 778 + }, + { + "epoch": 0.042875227035059714, + "grad_norm": 0.918820858001709, + "learning_rate": 9.991339849230639e-06, + "loss": 0.9198, + "step": 779 + }, + { + "epoch": 0.042930265837415375, + "grad_norm": 0.9875735640525818, + "learning_rate": 9.991314329295792e-06, + "loss": 0.8665, + "step": 780 + }, + { + "epoch": 0.04298530463977104, + "grad_norm": 0.873768150806427, + "learning_rate": 9.991288771847561e-06, + "loss": 0.8606, + "step": 781 + }, + { + "epoch": 0.0430403434421267, + "grad_norm": 0.8892746567726135, + "learning_rate": 9.991263176886139e-06, + "loss": 0.9011, + "step": 782 + }, + { + "epoch": 0.04309538224448236, + "grad_norm": 1.097734808921814, + "learning_rate": 9.99123754441172e-06, + "loss": 1.009, + "step": 783 + }, + { + "epoch": 0.04315042104683802, + "grad_norm": 1.0065964460372925, + "learning_rate": 9.991211874424497e-06, + "loss": 0.9492, + "step": 784 + }, + { + "epoch": 0.04320545984919368, + "grad_norm": 1.0791678428649902, + "learning_rate": 9.99118616692466e-06, + "loss": 1.0142, + "step": 785 + }, + { + "epoch": 0.043260498651549344, + "grad_norm": 0.9454777836799622, + "learning_rate": 9.991160421912404e-06, + "loss": 0.8058, + "step": 786 + }, + { + "epoch": 0.043315537453905005, + "grad_norm": 0.9448156952857971, + "learning_rate": 9.991134639387922e-06, + "loss": 0.8184, + "step": 787 + }, + { + "epoch": 0.043370576256260666, + "grad_norm": 0.9636550545692444, + "learning_rate": 9.99110881935141e-06, + "loss": 0.8606, + "step": 788 + }, + { + "epoch": 0.04342561505861633, + "grad_norm": 0.9933613538742065, + "learning_rate": 9.991082961803058e-06, + "loss": 0.9449, + "step": 789 + }, + { + "epoch": 0.04348065386097198, + "grad_norm": 0.8906797170639038, + "learning_rate": 9.991057066743065e-06, + "loss": 0.8053, + "step": 790 + }, + { + "epoch": 0.043535692663327644, + "grad_norm": 1.0393906831741333, + "learning_rate": 9.991031134171621e-06, + "loss": 0.8487, + "step": 791 + }, + { + "epoch": 0.043590731465683305, + "grad_norm": 1.0618231296539307, + "learning_rate": 9.991005164088923e-06, + "loss": 0.9847, + "step": 792 + }, + { + "epoch": 0.043645770268038966, + "grad_norm": 0.9525149464607239, + "learning_rate": 9.990979156495167e-06, + "loss": 0.9318, + "step": 793 + }, + { + "epoch": 0.04370080907039463, + "grad_norm": 0.9430851936340332, + "learning_rate": 9.990953111390546e-06, + "loss": 0.8483, + "step": 794 + }, + { + "epoch": 0.04375584787275029, + "grad_norm": 0.9259672164916992, + "learning_rate": 9.99092702877526e-06, + "loss": 0.9365, + "step": 795 + }, + { + "epoch": 0.04381088667510595, + "grad_norm": 0.942609965801239, + "learning_rate": 9.9909009086495e-06, + "loss": 0.8408, + "step": 796 + }, + { + "epoch": 0.04386592547746161, + "grad_norm": 0.939255952835083, + "learning_rate": 9.990874751013467e-06, + "loss": 0.8749, + "step": 797 + }, + { + "epoch": 0.04392096427981727, + "grad_norm": 1.1701711416244507, + "learning_rate": 9.990848555867353e-06, + "loss": 0.9312, + "step": 798 + }, + { + "epoch": 0.043976003082172935, + "grad_norm": 1.0441124439239502, + "learning_rate": 9.990822323211358e-06, + "loss": 0.8618, + "step": 799 + }, + { + "epoch": 0.04403104188452859, + "grad_norm": 0.9601489305496216, + "learning_rate": 9.990796053045679e-06, + "loss": 0.9569, + "step": 800 + }, + { + "epoch": 0.04408608068688425, + "grad_norm": 0.9394032955169678, + "learning_rate": 9.990769745370513e-06, + "loss": 0.846, + "step": 801 + }, + { + "epoch": 0.04414111948923991, + "grad_norm": 0.9631348252296448, + "learning_rate": 9.990743400186056e-06, + "loss": 0.8754, + "step": 802 + }, + { + "epoch": 0.04419615829159557, + "grad_norm": 0.9234963059425354, + "learning_rate": 9.990717017492508e-06, + "loss": 0.8613, + "step": 803 + }, + { + "epoch": 0.044251197093951235, + "grad_norm": 0.9169090390205383, + "learning_rate": 9.990690597290069e-06, + "loss": 0.8867, + "step": 804 + }, + { + "epoch": 0.044306235896306896, + "grad_norm": 1.0194867849349976, + "learning_rate": 9.990664139578933e-06, + "loss": 0.8675, + "step": 805 + }, + { + "epoch": 0.04436127469866256, + "grad_norm": 1.3226114511489868, + "learning_rate": 9.990637644359302e-06, + "loss": 0.997, + "step": 806 + }, + { + "epoch": 0.04441631350101822, + "grad_norm": 0.8904317617416382, + "learning_rate": 9.990611111631374e-06, + "loss": 0.7274, + "step": 807 + }, + { + "epoch": 0.04447135230337388, + "grad_norm": 0.8909007906913757, + "learning_rate": 9.99058454139535e-06, + "loss": 0.8141, + "step": 808 + }, + { + "epoch": 0.04452639110572954, + "grad_norm": 1.004015564918518, + "learning_rate": 9.990557933651429e-06, + "loss": 0.9883, + "step": 809 + }, + { + "epoch": 0.0445814299080852, + "grad_norm": 1.1215732097625732, + "learning_rate": 9.990531288399807e-06, + "loss": 0.9355, + "step": 810 + }, + { + "epoch": 0.04463646871044086, + "grad_norm": 1.0545012950897217, + "learning_rate": 9.99050460564069e-06, + "loss": 0.9532, + "step": 811 + }, + { + "epoch": 0.04469150751279652, + "grad_norm": 0.9608867168426514, + "learning_rate": 9.990477885374277e-06, + "loss": 0.9363, + "step": 812 + }, + { + "epoch": 0.04474654631515218, + "grad_norm": 0.8750461935997009, + "learning_rate": 9.990451127600766e-06, + "loss": 0.7343, + "step": 813 + }, + { + "epoch": 0.04480158511750784, + "grad_norm": 0.891740620136261, + "learning_rate": 9.99042433232036e-06, + "loss": 0.8541, + "step": 814 + }, + { + "epoch": 0.0448566239198635, + "grad_norm": 1.1520029306411743, + "learning_rate": 9.990397499533264e-06, + "loss": 0.7696, + "step": 815 + }, + { + "epoch": 0.044911662722219164, + "grad_norm": 0.9526278972625732, + "learning_rate": 9.990370629239673e-06, + "loss": 0.8953, + "step": 816 + }, + { + "epoch": 0.044966701524574826, + "grad_norm": 0.9218434691429138, + "learning_rate": 9.990343721439795e-06, + "loss": 0.8198, + "step": 817 + }, + { + "epoch": 0.04502174032693049, + "grad_norm": 0.8502745628356934, + "learning_rate": 9.990316776133827e-06, + "loss": 0.8035, + "step": 818 + }, + { + "epoch": 0.04507677912928615, + "grad_norm": 0.8861565589904785, + "learning_rate": 9.990289793321975e-06, + "loss": 0.8626, + "step": 819 + }, + { + "epoch": 0.04513181793164181, + "grad_norm": 1.1113256216049194, + "learning_rate": 9.99026277300444e-06, + "loss": 0.9363, + "step": 820 + }, + { + "epoch": 0.04518685673399747, + "grad_norm": 0.9984708428382874, + "learning_rate": 9.990235715181426e-06, + "loss": 1.0376, + "step": 821 + }, + { + "epoch": 0.045241895536353126, + "grad_norm": 0.9026711583137512, + "learning_rate": 9.990208619853137e-06, + "loss": 0.9079, + "step": 822 + }, + { + "epoch": 0.04529693433870879, + "grad_norm": 0.8724965453147888, + "learning_rate": 9.990181487019775e-06, + "loss": 0.8665, + "step": 823 + }, + { + "epoch": 0.04535197314106445, + "grad_norm": 0.8923047780990601, + "learning_rate": 9.990154316681543e-06, + "loss": 0.7779, + "step": 824 + }, + { + "epoch": 0.04540701194342011, + "grad_norm": 0.9024640321731567, + "learning_rate": 9.99012710883865e-06, + "loss": 0.8859, + "step": 825 + }, + { + "epoch": 0.04546205074577577, + "grad_norm": 0.9245888590812683, + "learning_rate": 9.990099863491296e-06, + "loss": 0.8501, + "step": 826 + }, + { + "epoch": 0.04551708954813143, + "grad_norm": 0.9257050156593323, + "learning_rate": 9.990072580639687e-06, + "loss": 0.9561, + "step": 827 + }, + { + "epoch": 0.045572128350487094, + "grad_norm": 0.995610773563385, + "learning_rate": 9.99004526028403e-06, + "loss": 0.917, + "step": 828 + }, + { + "epoch": 0.045627167152842756, + "grad_norm": 0.9524009823799133, + "learning_rate": 9.990017902424525e-06, + "loss": 0.9184, + "step": 829 + }, + { + "epoch": 0.04568220595519842, + "grad_norm": 0.9264503121376038, + "learning_rate": 9.989990507061385e-06, + "loss": 0.8615, + "step": 830 + }, + { + "epoch": 0.04573724475755408, + "grad_norm": 1.0068570375442505, + "learning_rate": 9.989963074194809e-06, + "loss": 0.8331, + "step": 831 + }, + { + "epoch": 0.04579228355990974, + "grad_norm": 0.9295952320098877, + "learning_rate": 9.989935603825009e-06, + "loss": 0.8387, + "step": 832 + }, + { + "epoch": 0.045847322362265394, + "grad_norm": 1.0408827066421509, + "learning_rate": 9.989908095952186e-06, + "loss": 0.9686, + "step": 833 + }, + { + "epoch": 0.045902361164621056, + "grad_norm": 0.8874136209487915, + "learning_rate": 9.989880550576551e-06, + "loss": 0.815, + "step": 834 + }, + { + "epoch": 0.04595739996697672, + "grad_norm": 0.9898836016654968, + "learning_rate": 9.989852967698311e-06, + "loss": 0.9458, + "step": 835 + }, + { + "epoch": 0.04601243876933238, + "grad_norm": 0.9828970432281494, + "learning_rate": 9.989825347317668e-06, + "loss": 0.7922, + "step": 836 + }, + { + "epoch": 0.04606747757168804, + "grad_norm": 1.025447964668274, + "learning_rate": 9.989797689434836e-06, + "loss": 0.9349, + "step": 837 + }, + { + "epoch": 0.0461225163740437, + "grad_norm": 0.8623831272125244, + "learning_rate": 9.98976999405002e-06, + "loss": 0.8786, + "step": 838 + }, + { + "epoch": 0.04617755517639936, + "grad_norm": 0.9614997506141663, + "learning_rate": 9.98974226116343e-06, + "loss": 0.7885, + "step": 839 + }, + { + "epoch": 0.046232593978755024, + "grad_norm": 1.0207616090774536, + "learning_rate": 9.989714490775269e-06, + "loss": 0.9786, + "step": 840 + }, + { + "epoch": 0.046287632781110685, + "grad_norm": 0.8509595990180969, + "learning_rate": 9.98968668288575e-06, + "loss": 0.7312, + "step": 841 + }, + { + "epoch": 0.04634267158346635, + "grad_norm": 0.9822607040405273, + "learning_rate": 9.989658837495084e-06, + "loss": 0.952, + "step": 842 + }, + { + "epoch": 0.046397710385822, + "grad_norm": 1.0058252811431885, + "learning_rate": 9.989630954603477e-06, + "loss": 0.8811, + "step": 843 + }, + { + "epoch": 0.04645274918817766, + "grad_norm": 1.0146985054016113, + "learning_rate": 9.989603034211139e-06, + "loss": 0.9051, + "step": 844 + }, + { + "epoch": 0.046507787990533324, + "grad_norm": 0.8976503610610962, + "learning_rate": 9.98957507631828e-06, + "loss": 0.879, + "step": 845 + }, + { + "epoch": 0.046562826792888985, + "grad_norm": 0.8791939616203308, + "learning_rate": 9.989547080925111e-06, + "loss": 0.8944, + "step": 846 + }, + { + "epoch": 0.04661786559524465, + "grad_norm": 0.8530884981155396, + "learning_rate": 9.989519048031842e-06, + "loss": 0.9029, + "step": 847 + }, + { + "epoch": 0.04667290439760031, + "grad_norm": 0.9621617197990417, + "learning_rate": 9.989490977638683e-06, + "loss": 0.8374, + "step": 848 + }, + { + "epoch": 0.04672794319995597, + "grad_norm": 0.9629075527191162, + "learning_rate": 9.989462869745845e-06, + "loss": 0.9032, + "step": 849 + }, + { + "epoch": 0.04678298200231163, + "grad_norm": 1.3256126642227173, + "learning_rate": 9.989434724353541e-06, + "loss": 0.9748, + "step": 850 + }, + { + "epoch": 0.04683802080466729, + "grad_norm": 1.0230494737625122, + "learning_rate": 9.989406541461979e-06, + "loss": 0.9752, + "step": 851 + }, + { + "epoch": 0.046893059607022954, + "grad_norm": 0.8454533219337463, + "learning_rate": 9.989378321071375e-06, + "loss": 0.8426, + "step": 852 + }, + { + "epoch": 0.046948098409378615, + "grad_norm": 0.9995863437652588, + "learning_rate": 9.989350063181939e-06, + "loss": 0.9955, + "step": 853 + }, + { + "epoch": 0.04700313721173427, + "grad_norm": 0.8956604599952698, + "learning_rate": 9.989321767793883e-06, + "loss": 0.9024, + "step": 854 + }, + { + "epoch": 0.04705817601408993, + "grad_norm": 1.0123292207717896, + "learning_rate": 9.989293434907419e-06, + "loss": 0.7856, + "step": 855 + }, + { + "epoch": 0.04711321481644559, + "grad_norm": 0.814577043056488, + "learning_rate": 9.989265064522762e-06, + "loss": 0.8377, + "step": 856 + }, + { + "epoch": 0.047168253618801254, + "grad_norm": 1.1571552753448486, + "learning_rate": 9.989236656640125e-06, + "loss": 0.8562, + "step": 857 + }, + { + "epoch": 0.047223292421156915, + "grad_norm": 0.9681577682495117, + "learning_rate": 9.98920821125972e-06, + "loss": 0.8473, + "step": 858 + }, + { + "epoch": 0.047278331223512576, + "grad_norm": 0.9680121541023254, + "learning_rate": 9.989179728381761e-06, + "loss": 0.9811, + "step": 859 + }, + { + "epoch": 0.04733337002586824, + "grad_norm": 0.985477089881897, + "learning_rate": 9.989151208006464e-06, + "loss": 0.6994, + "step": 860 + }, + { + "epoch": 0.0473884088282239, + "grad_norm": 0.8612962365150452, + "learning_rate": 9.98912265013404e-06, + "loss": 0.7667, + "step": 861 + }, + { + "epoch": 0.04744344763057956, + "grad_norm": 0.8884604573249817, + "learning_rate": 9.989094054764708e-06, + "loss": 0.8382, + "step": 862 + }, + { + "epoch": 0.04749848643293522, + "grad_norm": 1.036881923675537, + "learning_rate": 9.989065421898681e-06, + "loss": 0.8748, + "step": 863 + }, + { + "epoch": 0.04755352523529088, + "grad_norm": 0.9954493045806885, + "learning_rate": 9.989036751536171e-06, + "loss": 0.9174, + "step": 864 + }, + { + "epoch": 0.04760856403764654, + "grad_norm": 0.9984694123268127, + "learning_rate": 9.989008043677399e-06, + "loss": 0.7636, + "step": 865 + }, + { + "epoch": 0.0476636028400022, + "grad_norm": 1.0412588119506836, + "learning_rate": 9.988979298322576e-06, + "loss": 0.773, + "step": 866 + }, + { + "epoch": 0.04771864164235786, + "grad_norm": 0.8034874796867371, + "learning_rate": 9.98895051547192e-06, + "loss": 0.7914, + "step": 867 + }, + { + "epoch": 0.04777368044471352, + "grad_norm": 0.8983979225158691, + "learning_rate": 9.988921695125648e-06, + "loss": 0.7292, + "step": 868 + }, + { + "epoch": 0.04782871924706918, + "grad_norm": 0.9445077776908875, + "learning_rate": 9.988892837283976e-06, + "loss": 0.8263, + "step": 869 + }, + { + "epoch": 0.047883758049424845, + "grad_norm": 1.0753306150436401, + "learning_rate": 9.988863941947121e-06, + "loss": 1.1122, + "step": 870 + }, + { + "epoch": 0.047938796851780506, + "grad_norm": 1.0091484785079956, + "learning_rate": 9.9888350091153e-06, + "loss": 0.9276, + "step": 871 + }, + { + "epoch": 0.04799383565413617, + "grad_norm": 1.0977306365966797, + "learning_rate": 9.988806038788732e-06, + "loss": 0.854, + "step": 872 + }, + { + "epoch": 0.04804887445649183, + "grad_norm": 1.0285007953643799, + "learning_rate": 9.988777030967632e-06, + "loss": 0.9441, + "step": 873 + }, + { + "epoch": 0.04810391325884749, + "grad_norm": 0.8973976373672485, + "learning_rate": 9.988747985652218e-06, + "loss": 0.786, + "step": 874 + }, + { + "epoch": 0.04815895206120315, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.98871890284271e-06, + "loss": 0.9042, + "step": 875 + }, + { + "epoch": 0.048213990863558806, + "grad_norm": 0.8514279723167419, + "learning_rate": 9.988689782539326e-06, + "loss": 0.7874, + "step": 876 + }, + { + "epoch": 0.04826902966591447, + "grad_norm": 0.8299674391746521, + "learning_rate": 9.988660624742286e-06, + "loss": 0.8704, + "step": 877 + }, + { + "epoch": 0.04832406846827013, + "grad_norm": 0.9862462282180786, + "learning_rate": 9.988631429451809e-06, + "loss": 0.9963, + "step": 878 + }, + { + "epoch": 0.04837910727062579, + "grad_norm": 0.9041131734848022, + "learning_rate": 9.988602196668111e-06, + "loss": 0.9207, + "step": 879 + }, + { + "epoch": 0.04843414607298145, + "grad_norm": 0.8597276210784912, + "learning_rate": 9.988572926391416e-06, + "loss": 0.8226, + "step": 880 + }, + { + "epoch": 0.04848918487533711, + "grad_norm": 0.9494329690933228, + "learning_rate": 9.988543618621941e-06, + "loss": 0.8834, + "step": 881 + }, + { + "epoch": 0.048544223677692774, + "grad_norm": 0.9129118323326111, + "learning_rate": 9.98851427335991e-06, + "loss": 0.7819, + "step": 882 + }, + { + "epoch": 0.048599262480048436, + "grad_norm": 0.9145999550819397, + "learning_rate": 9.988484890605539e-06, + "loss": 0.885, + "step": 883 + }, + { + "epoch": 0.0486543012824041, + "grad_norm": 1.0115307569503784, + "learning_rate": 9.98845547035905e-06, + "loss": 0.8347, + "step": 884 + }, + { + "epoch": 0.04870934008475976, + "grad_norm": 1.1372706890106201, + "learning_rate": 9.988426012620667e-06, + "loss": 0.944, + "step": 885 + }, + { + "epoch": 0.04876437888711541, + "grad_norm": 0.9502811431884766, + "learning_rate": 9.98839651739061e-06, + "loss": 0.9054, + "step": 886 + }, + { + "epoch": 0.048819417689471074, + "grad_norm": 0.9612823128700256, + "learning_rate": 9.988366984669097e-06, + "loss": 0.8796, + "step": 887 + }, + { + "epoch": 0.048874456491826736, + "grad_norm": 0.9551461935043335, + "learning_rate": 9.988337414456355e-06, + "loss": 0.8769, + "step": 888 + }, + { + "epoch": 0.0489294952941824, + "grad_norm": 0.8554086089134216, + "learning_rate": 9.988307806752603e-06, + "loss": 0.892, + "step": 889 + }, + { + "epoch": 0.04898453409653806, + "grad_norm": 0.8418886661529541, + "learning_rate": 9.988278161558067e-06, + "loss": 0.7568, + "step": 890 + }, + { + "epoch": 0.04903957289889372, + "grad_norm": 1.4780360460281372, + "learning_rate": 9.988248478872967e-06, + "loss": 0.9126, + "step": 891 + }, + { + "epoch": 0.04909461170124938, + "grad_norm": 0.8236714005470276, + "learning_rate": 9.988218758697526e-06, + "loss": 0.7317, + "step": 892 + }, + { + "epoch": 0.04914965050360504, + "grad_norm": 0.8777141571044922, + "learning_rate": 9.988189001031968e-06, + "loss": 0.7989, + "step": 893 + }, + { + "epoch": 0.049204689305960704, + "grad_norm": 1.0235031843185425, + "learning_rate": 9.988159205876516e-06, + "loss": 0.8335, + "step": 894 + }, + { + "epoch": 0.049259728108316365, + "grad_norm": 0.9340357184410095, + "learning_rate": 9.988129373231395e-06, + "loss": 0.8129, + "step": 895 + }, + { + "epoch": 0.04931476691067203, + "grad_norm": 1.7686667442321777, + "learning_rate": 9.98809950309683e-06, + "loss": 0.9792, + "step": 896 + }, + { + "epoch": 0.04936980571302768, + "grad_norm": 0.9252369403839111, + "learning_rate": 9.988069595473044e-06, + "loss": 0.8671, + "step": 897 + }, + { + "epoch": 0.04942484451538334, + "grad_norm": 0.9989960789680481, + "learning_rate": 9.988039650360262e-06, + "loss": 0.9245, + "step": 898 + }, + { + "epoch": 0.049479883317739004, + "grad_norm": 1.062912106513977, + "learning_rate": 9.98800966775871e-06, + "loss": 0.9146, + "step": 899 + }, + { + "epoch": 0.049534922120094665, + "grad_norm": 0.8698169589042664, + "learning_rate": 9.98797964766861e-06, + "loss": 0.8606, + "step": 900 + }, + { + "epoch": 0.04958996092245033, + "grad_norm": 1.6754224300384521, + "learning_rate": 9.98794959009019e-06, + "loss": 0.9236, + "step": 901 + }, + { + "epoch": 0.04964499972480599, + "grad_norm": 1.084174394607544, + "learning_rate": 9.98791949502368e-06, + "loss": 0.9252, + "step": 902 + }, + { + "epoch": 0.04970003852716165, + "grad_norm": 0.9866724610328674, + "learning_rate": 9.987889362469301e-06, + "loss": 0.9096, + "step": 903 + }, + { + "epoch": 0.04975507732951731, + "grad_norm": 0.8814040422439575, + "learning_rate": 9.987859192427279e-06, + "loss": 0.8475, + "step": 904 + }, + { + "epoch": 0.04981011613187297, + "grad_norm": 0.8796457052230835, + "learning_rate": 9.987828984897843e-06, + "loss": 0.8478, + "step": 905 + }, + { + "epoch": 0.049865154934228634, + "grad_norm": 1.0541884899139404, + "learning_rate": 9.98779873988122e-06, + "loss": 0.9799, + "step": 906 + }, + { + "epoch": 0.049920193736584295, + "grad_norm": 0.91409832239151, + "learning_rate": 9.987768457377636e-06, + "loss": 0.8701, + "step": 907 + }, + { + "epoch": 0.04997523253893995, + "grad_norm": 1.0120370388031006, + "learning_rate": 9.98773813738732e-06, + "loss": 0.8417, + "step": 908 + }, + { + "epoch": 0.05003027134129561, + "grad_norm": 1.7744206190109253, + "learning_rate": 9.987707779910499e-06, + "loss": 0.9263, + "step": 909 + }, + { + "epoch": 0.05008531014365127, + "grad_norm": 0.9423969388008118, + "learning_rate": 9.987677384947402e-06, + "loss": 0.9667, + "step": 910 + }, + { + "epoch": 0.050140348946006934, + "grad_norm": 1.5940319299697876, + "learning_rate": 9.987646952498256e-06, + "loss": 0.9223, + "step": 911 + }, + { + "epoch": 0.050195387748362595, + "grad_norm": 0.941792368888855, + "learning_rate": 9.987616482563292e-06, + "loss": 0.895, + "step": 912 + }, + { + "epoch": 0.05025042655071826, + "grad_norm": 3.1945221424102783, + "learning_rate": 9.987585975142738e-06, + "loss": 0.837, + "step": 913 + }, + { + "epoch": 0.05030546535307392, + "grad_norm": 2.0819199085235596, + "learning_rate": 9.98755543023682e-06, + "loss": 0.918, + "step": 914 + }, + { + "epoch": 0.05036050415542958, + "grad_norm": 0.984282910823822, + "learning_rate": 9.987524847845773e-06, + "loss": 0.8589, + "step": 915 + }, + { + "epoch": 0.05041554295778524, + "grad_norm": 0.9021026492118835, + "learning_rate": 9.987494227969823e-06, + "loss": 0.9053, + "step": 916 + }, + { + "epoch": 0.0504705817601409, + "grad_norm": 2.6515462398529053, + "learning_rate": 9.9874635706092e-06, + "loss": 0.8874, + "step": 917 + }, + { + "epoch": 0.050525620562496563, + "grad_norm": 0.8893095254898071, + "learning_rate": 9.98743287576414e-06, + "loss": 0.8259, + "step": 918 + }, + { + "epoch": 0.05058065936485222, + "grad_norm": 0.9897775650024414, + "learning_rate": 9.987402143434868e-06, + "loss": 0.877, + "step": 919 + }, + { + "epoch": 0.05063569816720788, + "grad_norm": 0.9391944408416748, + "learning_rate": 9.987371373621614e-06, + "loss": 0.9363, + "step": 920 + }, + { + "epoch": 0.05069073696956354, + "grad_norm": 0.9585913419723511, + "learning_rate": 9.987340566324615e-06, + "loss": 0.8704, + "step": 921 + }, + { + "epoch": 0.0507457757719192, + "grad_norm": 0.9210980534553528, + "learning_rate": 9.987309721544098e-06, + "loss": 0.9321, + "step": 922 + }, + { + "epoch": 0.05080081457427486, + "grad_norm": 1.0713307857513428, + "learning_rate": 9.987278839280295e-06, + "loss": 0.9489, + "step": 923 + }, + { + "epoch": 0.050855853376630525, + "grad_norm": 1.0178636312484741, + "learning_rate": 9.98724791953344e-06, + "loss": 0.853, + "step": 924 + }, + { + "epoch": 0.050910892178986186, + "grad_norm": 0.9782636761665344, + "learning_rate": 9.987216962303766e-06, + "loss": 0.924, + "step": 925 + }, + { + "epoch": 0.05096593098134185, + "grad_norm": 0.9474522471427917, + "learning_rate": 9.987185967591503e-06, + "loss": 0.8619, + "step": 926 + }, + { + "epoch": 0.05102096978369751, + "grad_norm": 1.1875778436660767, + "learning_rate": 9.987154935396885e-06, + "loss": 1.012, + "step": 927 + }, + { + "epoch": 0.05107600858605317, + "grad_norm": 1.0585243701934814, + "learning_rate": 9.987123865720147e-06, + "loss": 0.9019, + "step": 928 + }, + { + "epoch": 0.051131047388408825, + "grad_norm": 0.9848800897598267, + "learning_rate": 9.98709275856152e-06, + "loss": 0.9665, + "step": 929 + }, + { + "epoch": 0.051186086190764486, + "grad_norm": 1.04201078414917, + "learning_rate": 9.987061613921238e-06, + "loss": 0.9269, + "step": 930 + }, + { + "epoch": 0.05124112499312015, + "grad_norm": 1.1748600006103516, + "learning_rate": 9.987030431799537e-06, + "loss": 0.8565, + "step": 931 + }, + { + "epoch": 0.05129616379547581, + "grad_norm": 1.879232406616211, + "learning_rate": 9.98699921219665e-06, + "loss": 0.8535, + "step": 932 + }, + { + "epoch": 0.05135120259783147, + "grad_norm": 0.9837847948074341, + "learning_rate": 9.986967955112812e-06, + "loss": 0.927, + "step": 933 + }, + { + "epoch": 0.05140624140018713, + "grad_norm": 0.8637211918830872, + "learning_rate": 9.986936660548257e-06, + "loss": 0.7903, + "step": 934 + }, + { + "epoch": 0.05146128020254279, + "grad_norm": 0.9078792929649353, + "learning_rate": 9.986905328503222e-06, + "loss": 0.9135, + "step": 935 + }, + { + "epoch": 0.051516319004898455, + "grad_norm": 0.9763005971908569, + "learning_rate": 9.98687395897794e-06, + "loss": 0.9006, + "step": 936 + }, + { + "epoch": 0.051571357807254116, + "grad_norm": 1.0174345970153809, + "learning_rate": 9.98684255197265e-06, + "loss": 0.9294, + "step": 937 + }, + { + "epoch": 0.05162639660960978, + "grad_norm": 0.8709769248962402, + "learning_rate": 9.986811107487584e-06, + "loss": 0.7986, + "step": 938 + }, + { + "epoch": 0.05168143541196544, + "grad_norm": 0.8717525601387024, + "learning_rate": 9.986779625522983e-06, + "loss": 0.8705, + "step": 939 + }, + { + "epoch": 0.05173647421432109, + "grad_norm": 0.9682945013046265, + "learning_rate": 9.98674810607908e-06, + "loss": 0.8127, + "step": 940 + }, + { + "epoch": 0.051791513016676755, + "grad_norm": 1.0248037576675415, + "learning_rate": 9.986716549156113e-06, + "loss": 0.9217, + "step": 941 + }, + { + "epoch": 0.051846551819032416, + "grad_norm": 0.9883397221565247, + "learning_rate": 9.98668495475432e-06, + "loss": 0.853, + "step": 942 + }, + { + "epoch": 0.05190159062138808, + "grad_norm": 0.9271108508110046, + "learning_rate": 9.986653322873937e-06, + "loss": 0.8807, + "step": 943 + }, + { + "epoch": 0.05195662942374374, + "grad_norm": 0.9027101397514343, + "learning_rate": 9.986621653515203e-06, + "loss": 0.88, + "step": 944 + }, + { + "epoch": 0.0520116682260994, + "grad_norm": 0.9807021617889404, + "learning_rate": 9.986589946678354e-06, + "loss": 0.8922, + "step": 945 + }, + { + "epoch": 0.05206670702845506, + "grad_norm": 0.8779157400131226, + "learning_rate": 9.98655820236363e-06, + "loss": 0.8988, + "step": 946 + }, + { + "epoch": 0.05212174583081072, + "grad_norm": 0.8182910680770874, + "learning_rate": 9.986526420571272e-06, + "loss": 0.7534, + "step": 947 + }, + { + "epoch": 0.052176784633166384, + "grad_norm": 0.9205981492996216, + "learning_rate": 9.986494601301513e-06, + "loss": 0.7516, + "step": 948 + }, + { + "epoch": 0.052231823435522046, + "grad_norm": 0.9829681515693665, + "learning_rate": 9.986462744554598e-06, + "loss": 0.9358, + "step": 949 + }, + { + "epoch": 0.05228686223787771, + "grad_norm": 0.8869890570640564, + "learning_rate": 9.986430850330762e-06, + "loss": 0.7933, + "step": 950 + }, + { + "epoch": 0.05234190104023336, + "grad_norm": 1.0226716995239258, + "learning_rate": 9.986398918630248e-06, + "loss": 0.9523, + "step": 951 + }, + { + "epoch": 0.05239693984258902, + "grad_norm": 0.9549778699874878, + "learning_rate": 9.986366949453293e-06, + "loss": 0.9368, + "step": 952 + }, + { + "epoch": 0.052451978644944684, + "grad_norm": 0.860454797744751, + "learning_rate": 9.98633494280014e-06, + "loss": 0.7618, + "step": 953 + }, + { + "epoch": 0.052507017447300346, + "grad_norm": 0.9623841643333435, + "learning_rate": 9.986302898671027e-06, + "loss": 0.8356, + "step": 954 + }, + { + "epoch": 0.05256205624965601, + "grad_norm": 0.9236606359481812, + "learning_rate": 9.986270817066196e-06, + "loss": 0.921, + "step": 955 + }, + { + "epoch": 0.05261709505201167, + "grad_norm": 1.0599812269210815, + "learning_rate": 9.98623869798589e-06, + "loss": 0.8082, + "step": 956 + }, + { + "epoch": 0.05267213385436733, + "grad_norm": 1.0321687459945679, + "learning_rate": 9.986206541430347e-06, + "loss": 0.9001, + "step": 957 + }, + { + "epoch": 0.05272717265672299, + "grad_norm": 0.8884543776512146, + "learning_rate": 9.986174347399813e-06, + "loss": 0.8317, + "step": 958 + }, + { + "epoch": 0.05278221145907865, + "grad_norm": 0.9592668414115906, + "learning_rate": 9.986142115894526e-06, + "loss": 0.9955, + "step": 959 + }, + { + "epoch": 0.052837250261434314, + "grad_norm": 0.9604032039642334, + "learning_rate": 9.986109846914729e-06, + "loss": 0.876, + "step": 960 + }, + { + "epoch": 0.052892289063789975, + "grad_norm": 0.9837536811828613, + "learning_rate": 9.986077540460664e-06, + "loss": 0.8247, + "step": 961 + }, + { + "epoch": 0.05294732786614563, + "grad_norm": 0.8570861220359802, + "learning_rate": 9.986045196532576e-06, + "loss": 0.879, + "step": 962 + }, + { + "epoch": 0.05300236666850129, + "grad_norm": 0.8441471457481384, + "learning_rate": 9.986012815130708e-06, + "loss": 0.8979, + "step": 963 + }, + { + "epoch": 0.05305740547085695, + "grad_norm": 0.8976197838783264, + "learning_rate": 9.985980396255302e-06, + "loss": 0.9382, + "step": 964 + }, + { + "epoch": 0.053112444273212614, + "grad_norm": 0.9685307741165161, + "learning_rate": 9.985947939906599e-06, + "loss": 0.8627, + "step": 965 + }, + { + "epoch": 0.053167483075568275, + "grad_norm": 0.8939018249511719, + "learning_rate": 9.98591544608485e-06, + "loss": 0.9221, + "step": 966 + }, + { + "epoch": 0.05322252187792394, + "grad_norm": 0.9218310713768005, + "learning_rate": 9.985882914790292e-06, + "loss": 0.8356, + "step": 967 + }, + { + "epoch": 0.0532775606802796, + "grad_norm": 0.9342261552810669, + "learning_rate": 9.985850346023174e-06, + "loss": 0.971, + "step": 968 + }, + { + "epoch": 0.05333259948263526, + "grad_norm": 1.0860705375671387, + "learning_rate": 9.985817739783741e-06, + "loss": 0.906, + "step": 969 + }, + { + "epoch": 0.05338763828499092, + "grad_norm": 0.8675006031990051, + "learning_rate": 9.985785096072234e-06, + "loss": 0.906, + "step": 970 + }, + { + "epoch": 0.05344267708734658, + "grad_norm": 0.8170626163482666, + "learning_rate": 9.985752414888903e-06, + "loss": 0.8109, + "step": 971 + }, + { + "epoch": 0.05349771588970224, + "grad_norm": 0.936434805393219, + "learning_rate": 9.98571969623399e-06, + "loss": 0.9219, + "step": 972 + }, + { + "epoch": 0.0535527546920579, + "grad_norm": 0.9316715002059937, + "learning_rate": 9.985686940107741e-06, + "loss": 0.8569, + "step": 973 + }, + { + "epoch": 0.05360779349441356, + "grad_norm": 1.183008074760437, + "learning_rate": 9.985654146510405e-06, + "loss": 0.837, + "step": 974 + }, + { + "epoch": 0.05366283229676922, + "grad_norm": 1.0788745880126953, + "learning_rate": 9.98562131544223e-06, + "loss": 0.8822, + "step": 975 + }, + { + "epoch": 0.05371787109912488, + "grad_norm": 0.9285461902618408, + "learning_rate": 9.985588446903455e-06, + "loss": 0.9279, + "step": 976 + }, + { + "epoch": 0.053772909901480544, + "grad_norm": 0.9389022588729858, + "learning_rate": 9.985555540894334e-06, + "loss": 0.9839, + "step": 977 + }, + { + "epoch": 0.053827948703836205, + "grad_norm": 0.8920616507530212, + "learning_rate": 9.985522597415112e-06, + "loss": 0.9205, + "step": 978 + }, + { + "epoch": 0.053882987506191866, + "grad_norm": 0.9755093455314636, + "learning_rate": 9.985489616466035e-06, + "loss": 0.8987, + "step": 979 + }, + { + "epoch": 0.05393802630854753, + "grad_norm": 0.96027010679245, + "learning_rate": 9.985456598047356e-06, + "loss": 0.8543, + "step": 980 + }, + { + "epoch": 0.05399306511090319, + "grad_norm": 1.0489718914031982, + "learning_rate": 9.985423542159317e-06, + "loss": 0.9179, + "step": 981 + }, + { + "epoch": 0.05404810391325885, + "grad_norm": 0.8665526509284973, + "learning_rate": 9.985390448802171e-06, + "loss": 0.9047, + "step": 982 + }, + { + "epoch": 0.054103142715614505, + "grad_norm": 0.8849464654922485, + "learning_rate": 9.985357317976163e-06, + "loss": 0.8892, + "step": 983 + }, + { + "epoch": 0.054158181517970166, + "grad_norm": 1.0083115100860596, + "learning_rate": 9.985324149681545e-06, + "loss": 0.7713, + "step": 984 + }, + { + "epoch": 0.05421322032032583, + "grad_norm": 0.8233863711357117, + "learning_rate": 9.985290943918565e-06, + "loss": 0.7967, + "step": 985 + }, + { + "epoch": 0.05426825912268149, + "grad_norm": 0.9615303874015808, + "learning_rate": 9.985257700687472e-06, + "loss": 0.8576, + "step": 986 + }, + { + "epoch": 0.05432329792503715, + "grad_norm": 0.8856416344642639, + "learning_rate": 9.985224419988517e-06, + "loss": 0.8614, + "step": 987 + }, + { + "epoch": 0.05437833672739281, + "grad_norm": 0.968325674533844, + "learning_rate": 9.98519110182195e-06, + "loss": 0.8247, + "step": 988 + }, + { + "epoch": 0.05443337552974847, + "grad_norm": 0.878402054309845, + "learning_rate": 9.985157746188021e-06, + "loss": 0.8661, + "step": 989 + }, + { + "epoch": 0.054488414332104135, + "grad_norm": 0.8376438021659851, + "learning_rate": 9.985124353086981e-06, + "loss": 0.8554, + "step": 990 + }, + { + "epoch": 0.054543453134459796, + "grad_norm": 1.0293036699295044, + "learning_rate": 9.98509092251908e-06, + "loss": 0.8049, + "step": 991 + }, + { + "epoch": 0.05459849193681546, + "grad_norm": 1.2345234155654907, + "learning_rate": 9.98505745448457e-06, + "loss": 1.0358, + "step": 992 + }, + { + "epoch": 0.05465353073917112, + "grad_norm": 0.9974482655525208, + "learning_rate": 9.985023948983703e-06, + "loss": 0.9329, + "step": 993 + }, + { + "epoch": 0.05470856954152677, + "grad_norm": 1.383955478668213, + "learning_rate": 9.984990406016732e-06, + "loss": 0.8688, + "step": 994 + }, + { + "epoch": 0.054763608343882435, + "grad_norm": 0.9369306564331055, + "learning_rate": 9.984956825583906e-06, + "loss": 0.8308, + "step": 995 + }, + { + "epoch": 0.054818647146238096, + "grad_norm": 0.8676120042800903, + "learning_rate": 9.984923207685478e-06, + "loss": 0.8283, + "step": 996 + }, + { + "epoch": 0.05487368594859376, + "grad_norm": 0.9218453168869019, + "learning_rate": 9.984889552321704e-06, + "loss": 0.7247, + "step": 997 + }, + { + "epoch": 0.05492872475094942, + "grad_norm": 0.8575478196144104, + "learning_rate": 9.984855859492833e-06, + "loss": 0.8462, + "step": 998 + }, + { + "epoch": 0.05498376355330508, + "grad_norm": 1.0042616128921509, + "learning_rate": 9.98482212919912e-06, + "loss": 0.9383, + "step": 999 + }, + { + "epoch": 0.05503880235566074, + "grad_norm": 0.8642181158065796, + "learning_rate": 9.984788361440817e-06, + "loss": 0.8805, + "step": 1000 + }, + { + "epoch": 0.0550938411580164, + "grad_norm": 0.8413823843002319, + "learning_rate": 9.984754556218178e-06, + "loss": 0.8161, + "step": 1001 + }, + { + "epoch": 0.055148879960372064, + "grad_norm": 0.9473856091499329, + "learning_rate": 9.984720713531462e-06, + "loss": 0.8425, + "step": 1002 + }, + { + "epoch": 0.055203918762727726, + "grad_norm": 0.7854379415512085, + "learning_rate": 9.984686833380917e-06, + "loss": 0.7506, + "step": 1003 + }, + { + "epoch": 0.05525895756508339, + "grad_norm": 0.9481745958328247, + "learning_rate": 9.984652915766801e-06, + "loss": 0.954, + "step": 1004 + }, + { + "epoch": 0.05531399636743904, + "grad_norm": 0.767803966999054, + "learning_rate": 9.984618960689366e-06, + "loss": 0.8113, + "step": 1005 + }, + { + "epoch": 0.0553690351697947, + "grad_norm": 0.8957781195640564, + "learning_rate": 9.984584968148871e-06, + "loss": 0.9042, + "step": 1006 + }, + { + "epoch": 0.055424073972150364, + "grad_norm": 1.116646409034729, + "learning_rate": 9.98455093814557e-06, + "loss": 0.8648, + "step": 1007 + }, + { + "epoch": 0.055479112774506026, + "grad_norm": 0.9567018151283264, + "learning_rate": 9.98451687067972e-06, + "loss": 0.9446, + "step": 1008 + }, + { + "epoch": 0.05553415157686169, + "grad_norm": 0.8470665812492371, + "learning_rate": 9.98448276575157e-06, + "loss": 0.8186, + "step": 1009 + }, + { + "epoch": 0.05558919037921735, + "grad_norm": 0.9595193862915039, + "learning_rate": 9.984448623361387e-06, + "loss": 0.8406, + "step": 1010 + }, + { + "epoch": 0.05564422918157301, + "grad_norm": 1.0579735040664673, + "learning_rate": 9.98441444350942e-06, + "loss": 0.9676, + "step": 1011 + }, + { + "epoch": 0.05569926798392867, + "grad_norm": 0.8693701028823853, + "learning_rate": 9.98438022619593e-06, + "loss": 0.9451, + "step": 1012 + }, + { + "epoch": 0.05575430678628433, + "grad_norm": 0.9251859784126282, + "learning_rate": 9.98434597142117e-06, + "loss": 0.7858, + "step": 1013 + }, + { + "epoch": 0.055809345588639994, + "grad_norm": 0.8584280014038086, + "learning_rate": 9.984311679185402e-06, + "loss": 0.8481, + "step": 1014 + }, + { + "epoch": 0.05586438439099565, + "grad_norm": 0.8903968334197998, + "learning_rate": 9.98427734948888e-06, + "loss": 0.7832, + "step": 1015 + }, + { + "epoch": 0.05591942319335131, + "grad_norm": 0.905581533908844, + "learning_rate": 9.984242982331864e-06, + "loss": 0.8088, + "step": 1016 + }, + { + "epoch": 0.05597446199570697, + "grad_norm": 0.9866476655006409, + "learning_rate": 9.984208577714612e-06, + "loss": 0.8366, + "step": 1017 + }, + { + "epoch": 0.05602950079806263, + "grad_norm": 0.8843809962272644, + "learning_rate": 9.984174135637384e-06, + "loss": 0.8961, + "step": 1018 + }, + { + "epoch": 0.056084539600418294, + "grad_norm": 0.9071753621101379, + "learning_rate": 9.984139656100435e-06, + "loss": 0.8671, + "step": 1019 + }, + { + "epoch": 0.056139578402773956, + "grad_norm": 0.9894018173217773, + "learning_rate": 9.984105139104028e-06, + "loss": 0.9099, + "step": 1020 + }, + { + "epoch": 0.05619461720512962, + "grad_norm": 0.8432741165161133, + "learning_rate": 9.98407058464842e-06, + "loss": 0.7817, + "step": 1021 + }, + { + "epoch": 0.05624965600748528, + "grad_norm": 0.9538390040397644, + "learning_rate": 9.984035992733873e-06, + "loss": 0.8689, + "step": 1022 + }, + { + "epoch": 0.05630469480984094, + "grad_norm": 0.9263421297073364, + "learning_rate": 9.984001363360645e-06, + "loss": 0.9066, + "step": 1023 + }, + { + "epoch": 0.0563597336121966, + "grad_norm": 0.8921047449111938, + "learning_rate": 9.983966696528996e-06, + "loss": 0.8304, + "step": 1024 + }, + { + "epoch": 0.05641477241455226, + "grad_norm": 0.8379812240600586, + "learning_rate": 9.983931992239188e-06, + "loss": 0.866, + "step": 1025 + }, + { + "epoch": 0.05646981121690792, + "grad_norm": 0.9444219470024109, + "learning_rate": 9.983897250491481e-06, + "loss": 0.9456, + "step": 1026 + }, + { + "epoch": 0.05652485001926358, + "grad_norm": 1.0268759727478027, + "learning_rate": 9.983862471286137e-06, + "loss": 0.8277, + "step": 1027 + }, + { + "epoch": 0.05657988882161924, + "grad_norm": 1.3949217796325684, + "learning_rate": 9.983827654623418e-06, + "loss": 0.9721, + "step": 1028 + }, + { + "epoch": 0.0566349276239749, + "grad_norm": 0.8899377584457397, + "learning_rate": 9.983792800503582e-06, + "loss": 0.8794, + "step": 1029 + }, + { + "epoch": 0.05668996642633056, + "grad_norm": 0.989072322845459, + "learning_rate": 9.983757908926895e-06, + "loss": 0.8852, + "step": 1030 + }, + { + "epoch": 0.056745005228686224, + "grad_norm": 0.9797759056091309, + "learning_rate": 9.983722979893615e-06, + "loss": 1.0405, + "step": 1031 + }, + { + "epoch": 0.056800044031041885, + "grad_norm": 0.9044767618179321, + "learning_rate": 9.98368801340401e-06, + "loss": 0.7243, + "step": 1032 + }, + { + "epoch": 0.05685508283339755, + "grad_norm": 1.116324782371521, + "learning_rate": 9.983653009458338e-06, + "loss": 0.9183, + "step": 1033 + }, + { + "epoch": 0.05691012163575321, + "grad_norm": 0.9373337030410767, + "learning_rate": 9.983617968056866e-06, + "loss": 0.9417, + "step": 1034 + }, + { + "epoch": 0.05696516043810887, + "grad_norm": 1.0587197542190552, + "learning_rate": 9.983582889199855e-06, + "loss": 0.896, + "step": 1035 + }, + { + "epoch": 0.05702019924046453, + "grad_norm": 1.0080119371414185, + "learning_rate": 9.983547772887568e-06, + "loss": 0.924, + "step": 1036 + }, + { + "epoch": 0.057075238042820185, + "grad_norm": 0.847091019153595, + "learning_rate": 9.98351261912027e-06, + "loss": 0.7443, + "step": 1037 + }, + { + "epoch": 0.05713027684517585, + "grad_norm": 0.9876272082328796, + "learning_rate": 9.983477427898225e-06, + "loss": 0.9365, + "step": 1038 + }, + { + "epoch": 0.05718531564753151, + "grad_norm": 0.9188169240951538, + "learning_rate": 9.983442199221698e-06, + "loss": 0.9213, + "step": 1039 + }, + { + "epoch": 0.05724035444988717, + "grad_norm": 0.932399332523346, + "learning_rate": 9.983406933090954e-06, + "loss": 0.958, + "step": 1040 + }, + { + "epoch": 0.05729539325224283, + "grad_norm": 0.9126465320587158, + "learning_rate": 9.983371629506258e-06, + "loss": 0.8913, + "step": 1041 + }, + { + "epoch": 0.05735043205459849, + "grad_norm": 0.80904620885849, + "learning_rate": 9.983336288467873e-06, + "loss": 0.7719, + "step": 1042 + }, + { + "epoch": 0.057405470856954154, + "grad_norm": 0.873833417892456, + "learning_rate": 9.983300909976067e-06, + "loss": 0.9201, + "step": 1043 + }, + { + "epoch": 0.057460509659309815, + "grad_norm": 0.8331829309463501, + "learning_rate": 9.983265494031107e-06, + "loss": 0.8605, + "step": 1044 + }, + { + "epoch": 0.057515548461665476, + "grad_norm": 0.8364768624305725, + "learning_rate": 9.983230040633255e-06, + "loss": 0.8627, + "step": 1045 + }, + { + "epoch": 0.05757058726402114, + "grad_norm": 0.9226736426353455, + "learning_rate": 9.98319454978278e-06, + "loss": 0.9759, + "step": 1046 + }, + { + "epoch": 0.05762562606637679, + "grad_norm": 0.8174427151679993, + "learning_rate": 9.98315902147995e-06, + "loss": 0.8066, + "step": 1047 + }, + { + "epoch": 0.057680664868732454, + "grad_norm": 0.9154924750328064, + "learning_rate": 9.98312345572503e-06, + "loss": 0.9112, + "step": 1048 + }, + { + "epoch": 0.057735703671088115, + "grad_norm": 0.8884655237197876, + "learning_rate": 9.983087852518289e-06, + "loss": 0.8699, + "step": 1049 + }, + { + "epoch": 0.057790742473443776, + "grad_norm": 0.8849230408668518, + "learning_rate": 9.983052211859992e-06, + "loss": 0.8999, + "step": 1050 + }, + { + "epoch": 0.05784578127579944, + "grad_norm": 1.025843858718872, + "learning_rate": 9.98301653375041e-06, + "loss": 0.7764, + "step": 1051 + }, + { + "epoch": 0.0579008200781551, + "grad_norm": 0.900505006313324, + "learning_rate": 9.98298081818981e-06, + "loss": 0.9196, + "step": 1052 + }, + { + "epoch": 0.05795585888051076, + "grad_norm": 0.9506704211235046, + "learning_rate": 9.982945065178457e-06, + "loss": 0.8319, + "step": 1053 + }, + { + "epoch": 0.05801089768286642, + "grad_norm": 0.9439849853515625, + "learning_rate": 9.982909274716626e-06, + "loss": 0.8561, + "step": 1054 + }, + { + "epoch": 0.05806593648522208, + "grad_norm": 0.8761240243911743, + "learning_rate": 9.982873446804579e-06, + "loss": 0.9681, + "step": 1055 + }, + { + "epoch": 0.058120975287577745, + "grad_norm": 0.8756145238876343, + "learning_rate": 9.982837581442592e-06, + "loss": 0.8452, + "step": 1056 + }, + { + "epoch": 0.058176014089933406, + "grad_norm": 0.8732383847236633, + "learning_rate": 9.982801678630932e-06, + "loss": 0.9018, + "step": 1057 + }, + { + "epoch": 0.05823105289228906, + "grad_norm": 0.8338272571563721, + "learning_rate": 9.982765738369867e-06, + "loss": 0.9308, + "step": 1058 + }, + { + "epoch": 0.05828609169464472, + "grad_norm": 0.843163013458252, + "learning_rate": 9.982729760659669e-06, + "loss": 0.7802, + "step": 1059 + }, + { + "epoch": 0.05834113049700038, + "grad_norm": 1.2007580995559692, + "learning_rate": 9.982693745500606e-06, + "loss": 0.8406, + "step": 1060 + }, + { + "epoch": 0.058396169299356045, + "grad_norm": 0.8760073781013489, + "learning_rate": 9.982657692892954e-06, + "loss": 0.8528, + "step": 1061 + }, + { + "epoch": 0.058451208101711706, + "grad_norm": 0.925309419631958, + "learning_rate": 9.982621602836978e-06, + "loss": 0.9601, + "step": 1062 + }, + { + "epoch": 0.05850624690406737, + "grad_norm": 0.9277135133743286, + "learning_rate": 9.982585475332952e-06, + "loss": 0.8405, + "step": 1063 + }, + { + "epoch": 0.05856128570642303, + "grad_norm": 0.928044319152832, + "learning_rate": 9.98254931038115e-06, + "loss": 0.8259, + "step": 1064 + }, + { + "epoch": 0.05861632450877869, + "grad_norm": 0.8363838195800781, + "learning_rate": 9.982513107981837e-06, + "loss": 0.8655, + "step": 1065 + }, + { + "epoch": 0.05867136331113435, + "grad_norm": 0.9800984859466553, + "learning_rate": 9.982476868135292e-06, + "loss": 0.9285, + "step": 1066 + }, + { + "epoch": 0.05872640211349001, + "grad_norm": 0.8062636256217957, + "learning_rate": 9.982440590841785e-06, + "loss": 0.754, + "step": 1067 + }, + { + "epoch": 0.058781440915845674, + "grad_norm": 1.2010705471038818, + "learning_rate": 9.982404276101586e-06, + "loss": 0.9872, + "step": 1068 + }, + { + "epoch": 0.05883647971820133, + "grad_norm": 1.0036406517028809, + "learning_rate": 9.982367923914971e-06, + "loss": 0.8724, + "step": 1069 + }, + { + "epoch": 0.05889151852055699, + "grad_norm": 0.8768866658210754, + "learning_rate": 9.982331534282212e-06, + "loss": 0.838, + "step": 1070 + }, + { + "epoch": 0.05894655732291265, + "grad_norm": 0.7892739176750183, + "learning_rate": 9.982295107203584e-06, + "loss": 0.6974, + "step": 1071 + }, + { + "epoch": 0.05900159612526831, + "grad_norm": 0.863315999507904, + "learning_rate": 9.982258642679358e-06, + "loss": 0.9282, + "step": 1072 + }, + { + "epoch": 0.059056634927623974, + "grad_norm": 0.8645132780075073, + "learning_rate": 9.982222140709812e-06, + "loss": 0.8504, + "step": 1073 + }, + { + "epoch": 0.059111673729979636, + "grad_norm": 1.0003199577331543, + "learning_rate": 9.982185601295216e-06, + "loss": 1.0293, + "step": 1074 + }, + { + "epoch": 0.0591667125323353, + "grad_norm": 0.8391831517219543, + "learning_rate": 9.982149024435848e-06, + "loss": 0.8609, + "step": 1075 + }, + { + "epoch": 0.05922175133469096, + "grad_norm": 0.9940230846405029, + "learning_rate": 9.982112410131981e-06, + "loss": 0.9623, + "step": 1076 + }, + { + "epoch": 0.05927679013704662, + "grad_norm": 1.0670262575149536, + "learning_rate": 9.98207575838389e-06, + "loss": 0.9952, + "step": 1077 + }, + { + "epoch": 0.05933182893940228, + "grad_norm": 0.8506165742874146, + "learning_rate": 9.982039069191853e-06, + "loss": 0.8401, + "step": 1078 + }, + { + "epoch": 0.05938686774175794, + "grad_norm": 0.8956409096717834, + "learning_rate": 9.982002342556144e-06, + "loss": 0.8779, + "step": 1079 + }, + { + "epoch": 0.0594419065441136, + "grad_norm": 0.8955749273300171, + "learning_rate": 9.981965578477038e-06, + "loss": 0.8946, + "step": 1080 + }, + { + "epoch": 0.05949694534646926, + "grad_norm": 0.9035234451293945, + "learning_rate": 9.981928776954811e-06, + "loss": 0.9352, + "step": 1081 + }, + { + "epoch": 0.05955198414882492, + "grad_norm": 0.8748759627342224, + "learning_rate": 9.981891937989743e-06, + "loss": 0.8803, + "step": 1082 + }, + { + "epoch": 0.05960702295118058, + "grad_norm": 0.9966281056404114, + "learning_rate": 9.981855061582108e-06, + "loss": 0.9304, + "step": 1083 + }, + { + "epoch": 0.05966206175353624, + "grad_norm": 0.8696668148040771, + "learning_rate": 9.981818147732183e-06, + "loss": 0.8706, + "step": 1084 + }, + { + "epoch": 0.059717100555891904, + "grad_norm": 0.9823188185691833, + "learning_rate": 9.981781196440249e-06, + "loss": 0.9431, + "step": 1085 + }, + { + "epoch": 0.059772139358247565, + "grad_norm": 0.8401583433151245, + "learning_rate": 9.981744207706577e-06, + "loss": 0.8369, + "step": 1086 + }, + { + "epoch": 0.05982717816060323, + "grad_norm": 0.8775757551193237, + "learning_rate": 9.981707181531452e-06, + "loss": 0.9516, + "step": 1087 + }, + { + "epoch": 0.05988221696295889, + "grad_norm": 0.9153465628623962, + "learning_rate": 9.981670117915148e-06, + "loss": 0.8997, + "step": 1088 + }, + { + "epoch": 0.05993725576531455, + "grad_norm": 0.9053078889846802, + "learning_rate": 9.981633016857946e-06, + "loss": 0.9452, + "step": 1089 + }, + { + "epoch": 0.059992294567670204, + "grad_norm": 0.9154480695724487, + "learning_rate": 9.981595878360123e-06, + "loss": 0.8293, + "step": 1090 + }, + { + "epoch": 0.060047333370025865, + "grad_norm": 0.85718834400177, + "learning_rate": 9.981558702421958e-06, + "loss": 0.876, + "step": 1091 + }, + { + "epoch": 0.06010237217238153, + "grad_norm": 0.9437130689620972, + "learning_rate": 9.981521489043734e-06, + "loss": 0.9731, + "step": 1092 + }, + { + "epoch": 0.06015741097473719, + "grad_norm": 0.9014891386032104, + "learning_rate": 9.981484238225725e-06, + "loss": 0.811, + "step": 1093 + }, + { + "epoch": 0.06021244977709285, + "grad_norm": 0.8942846655845642, + "learning_rate": 9.981446949968216e-06, + "loss": 0.808, + "step": 1094 + }, + { + "epoch": 0.06026748857944851, + "grad_norm": 0.855297863483429, + "learning_rate": 9.981409624271483e-06, + "loss": 0.8319, + "step": 1095 + }, + { + "epoch": 0.06032252738180417, + "grad_norm": 0.9310913681983948, + "learning_rate": 9.981372261135811e-06, + "loss": 0.899, + "step": 1096 + }, + { + "epoch": 0.060377566184159834, + "grad_norm": 0.8472979664802551, + "learning_rate": 9.981334860561478e-06, + "loss": 0.8818, + "step": 1097 + }, + { + "epoch": 0.060432604986515495, + "grad_norm": 0.896617591381073, + "learning_rate": 9.981297422548764e-06, + "loss": 0.8991, + "step": 1098 + }, + { + "epoch": 0.06048764378887116, + "grad_norm": 0.8543037176132202, + "learning_rate": 9.981259947097954e-06, + "loss": 0.8595, + "step": 1099 + }, + { + "epoch": 0.06054268259122682, + "grad_norm": 0.8794904947280884, + "learning_rate": 9.981222434209327e-06, + "loss": 0.8561, + "step": 1100 + }, + { + "epoch": 0.06059772139358247, + "grad_norm": 0.8882116675376892, + "learning_rate": 9.981184883883165e-06, + "loss": 0.8099, + "step": 1101 + }, + { + "epoch": 0.060652760195938134, + "grad_norm": 1.0068262815475464, + "learning_rate": 9.98114729611975e-06, + "loss": 0.8104, + "step": 1102 + }, + { + "epoch": 0.060707798998293795, + "grad_norm": 1.072316288948059, + "learning_rate": 9.981109670919366e-06, + "loss": 0.9877, + "step": 1103 + }, + { + "epoch": 0.06076283780064946, + "grad_norm": 0.9959045052528381, + "learning_rate": 9.981072008282298e-06, + "loss": 0.906, + "step": 1104 + }, + { + "epoch": 0.06081787660300512, + "grad_norm": 0.8712790608406067, + "learning_rate": 9.981034308208823e-06, + "loss": 0.8725, + "step": 1105 + }, + { + "epoch": 0.06087291540536078, + "grad_norm": 0.9114679098129272, + "learning_rate": 9.980996570699228e-06, + "loss": 0.8385, + "step": 1106 + }, + { + "epoch": 0.06092795420771644, + "grad_norm": 1.0024466514587402, + "learning_rate": 9.980958795753796e-06, + "loss": 0.8661, + "step": 1107 + }, + { + "epoch": 0.0609829930100721, + "grad_norm": 0.9578461050987244, + "learning_rate": 9.98092098337281e-06, + "loss": 0.9358, + "step": 1108 + }, + { + "epoch": 0.061038031812427763, + "grad_norm": 0.8677787780761719, + "learning_rate": 9.980883133556557e-06, + "loss": 0.8146, + "step": 1109 + }, + { + "epoch": 0.061093070614783425, + "grad_norm": 0.9072276949882507, + "learning_rate": 9.98084524630532e-06, + "loss": 0.91, + "step": 1110 + }, + { + "epoch": 0.061148109417139086, + "grad_norm": 0.8827292919158936, + "learning_rate": 9.980807321619381e-06, + "loss": 0.8854, + "step": 1111 + }, + { + "epoch": 0.06120314821949474, + "grad_norm": 1.0012744665145874, + "learning_rate": 9.98076935949903e-06, + "loss": 0.8242, + "step": 1112 + }, + { + "epoch": 0.0612581870218504, + "grad_norm": 0.9152620434761047, + "learning_rate": 9.980731359944548e-06, + "loss": 0.8832, + "step": 1113 + }, + { + "epoch": 0.061313225824206063, + "grad_norm": 0.8986824750900269, + "learning_rate": 9.980693322956222e-06, + "loss": 0.7975, + "step": 1114 + }, + { + "epoch": 0.061368264626561725, + "grad_norm": 0.9373019933700562, + "learning_rate": 9.98065524853434e-06, + "loss": 0.9541, + "step": 1115 + }, + { + "epoch": 0.061423303428917386, + "grad_norm": 0.9875593781471252, + "learning_rate": 9.980617136679185e-06, + "loss": 1.0052, + "step": 1116 + }, + { + "epoch": 0.06147834223127305, + "grad_norm": 1.0664819478988647, + "learning_rate": 9.980578987391045e-06, + "loss": 0.8584, + "step": 1117 + }, + { + "epoch": 0.06153338103362871, + "grad_norm": 0.9149377942085266, + "learning_rate": 9.980540800670207e-06, + "loss": 0.8467, + "step": 1118 + }, + { + "epoch": 0.06158841983598437, + "grad_norm": 0.9303194284439087, + "learning_rate": 9.980502576516959e-06, + "loss": 0.8219, + "step": 1119 + }, + { + "epoch": 0.06164345863834003, + "grad_norm": 0.9059457778930664, + "learning_rate": 9.980464314931583e-06, + "loss": 0.8459, + "step": 1120 + }, + { + "epoch": 0.06169849744069569, + "grad_norm": 0.9368849396705627, + "learning_rate": 9.980426015914375e-06, + "loss": 0.8933, + "step": 1121 + }, + { + "epoch": 0.061753536243051355, + "grad_norm": 0.8188626766204834, + "learning_rate": 9.980387679465615e-06, + "loss": 0.807, + "step": 1122 + }, + { + "epoch": 0.06180857504540701, + "grad_norm": 1.027171015739441, + "learning_rate": 9.980349305585595e-06, + "loss": 0.8919, + "step": 1123 + }, + { + "epoch": 0.06186361384776267, + "grad_norm": 0.831649899482727, + "learning_rate": 9.980310894274603e-06, + "loss": 0.8109, + "step": 1124 + }, + { + "epoch": 0.06191865265011833, + "grad_norm": 1.0170252323150635, + "learning_rate": 9.980272445532928e-06, + "loss": 0.9537, + "step": 1125 + }, + { + "epoch": 0.06197369145247399, + "grad_norm": 0.97837233543396, + "learning_rate": 9.980233959360858e-06, + "loss": 0.9104, + "step": 1126 + }, + { + "epoch": 0.062028730254829655, + "grad_norm": 0.9548324942588806, + "learning_rate": 9.980195435758681e-06, + "loss": 0.9473, + "step": 1127 + }, + { + "epoch": 0.062083769057185316, + "grad_norm": 0.8675842881202698, + "learning_rate": 9.980156874726692e-06, + "loss": 0.8313, + "step": 1128 + }, + { + "epoch": 0.06213880785954098, + "grad_norm": 0.8948968052864075, + "learning_rate": 9.980118276265173e-06, + "loss": 0.8008, + "step": 1129 + }, + { + "epoch": 0.06219384666189664, + "grad_norm": 0.8914239406585693, + "learning_rate": 9.98007964037442e-06, + "loss": 0.7642, + "step": 1130 + }, + { + "epoch": 0.0622488854642523, + "grad_norm": 0.9499951004981995, + "learning_rate": 9.980040967054723e-06, + "loss": 0.8669, + "step": 1131 + }, + { + "epoch": 0.06230392426660796, + "grad_norm": 0.8959251642227173, + "learning_rate": 9.980002256306369e-06, + "loss": 0.9177, + "step": 1132 + }, + { + "epoch": 0.062358963068963616, + "grad_norm": 0.8634380102157593, + "learning_rate": 9.97996350812965e-06, + "loss": 0.8252, + "step": 1133 + }, + { + "epoch": 0.06241400187131928, + "grad_norm": 0.9380598068237305, + "learning_rate": 9.97992472252486e-06, + "loss": 0.9335, + "step": 1134 + }, + { + "epoch": 0.06246904067367494, + "grad_norm": 0.8373183608055115, + "learning_rate": 9.97988589949229e-06, + "loss": 0.848, + "step": 1135 + }, + { + "epoch": 0.0625240794760306, + "grad_norm": 0.9649023413658142, + "learning_rate": 9.97984703903223e-06, + "loss": 0.9648, + "step": 1136 + }, + { + "epoch": 0.06257911827838626, + "grad_norm": 0.9972373843193054, + "learning_rate": 9.979808141144972e-06, + "loss": 0.9104, + "step": 1137 + }, + { + "epoch": 0.06263415708074192, + "grad_norm": 0.8230985403060913, + "learning_rate": 9.97976920583081e-06, + "loss": 0.8393, + "step": 1138 + }, + { + "epoch": 0.06268919588309758, + "grad_norm": 0.9775324463844299, + "learning_rate": 9.979730233090034e-06, + "loss": 0.8385, + "step": 1139 + }, + { + "epoch": 0.06274423468545325, + "grad_norm": 0.8288110494613647, + "learning_rate": 9.97969122292294e-06, + "loss": 0.7308, + "step": 1140 + }, + { + "epoch": 0.06279927348780891, + "grad_norm": 0.8980758786201477, + "learning_rate": 9.979652175329819e-06, + "loss": 0.863, + "step": 1141 + }, + { + "epoch": 0.06285431229016457, + "grad_norm": 7.43889045715332, + "learning_rate": 9.979613090310965e-06, + "loss": 0.9412, + "step": 1142 + }, + { + "epoch": 0.06290935109252023, + "grad_norm": 0.9758191704750061, + "learning_rate": 9.97957396786667e-06, + "loss": 0.8896, + "step": 1143 + }, + { + "epoch": 0.06296438989487589, + "grad_norm": 0.8211693167686462, + "learning_rate": 9.979534807997234e-06, + "loss": 0.7352, + "step": 1144 + }, + { + "epoch": 0.06301942869723155, + "grad_norm": 0.8643441796302795, + "learning_rate": 9.979495610702945e-06, + "loss": 0.8701, + "step": 1145 + }, + { + "epoch": 0.06307446749958721, + "grad_norm": 1.0199437141418457, + "learning_rate": 9.9794563759841e-06, + "loss": 0.9025, + "step": 1146 + }, + { + "epoch": 0.06312950630194288, + "grad_norm": 0.8367893695831299, + "learning_rate": 9.979417103840994e-06, + "loss": 0.8491, + "step": 1147 + }, + { + "epoch": 0.06318454510429854, + "grad_norm": 0.9411819577217102, + "learning_rate": 9.979377794273923e-06, + "loss": 0.8501, + "step": 1148 + }, + { + "epoch": 0.06323958390665418, + "grad_norm": 1.1497365236282349, + "learning_rate": 9.97933844728318e-06, + "loss": 1.0227, + "step": 1149 + }, + { + "epoch": 0.06329462270900985, + "grad_norm": 0.9892984628677368, + "learning_rate": 9.979299062869064e-06, + "loss": 0.8942, + "step": 1150 + }, + { + "epoch": 0.06334966151136551, + "grad_norm": 0.947952926158905, + "learning_rate": 9.979259641031867e-06, + "loss": 1.0149, + "step": 1151 + }, + { + "epoch": 0.06340470031372117, + "grad_norm": 0.9060251712799072, + "learning_rate": 9.979220181771889e-06, + "loss": 0.8607, + "step": 1152 + }, + { + "epoch": 0.06345973911607683, + "grad_norm": 0.8331984281539917, + "learning_rate": 9.979180685089424e-06, + "loss": 0.8777, + "step": 1153 + }, + { + "epoch": 0.06351477791843249, + "grad_norm": 0.9133188724517822, + "learning_rate": 9.97914115098477e-06, + "loss": 0.7409, + "step": 1154 + }, + { + "epoch": 0.06356981672078815, + "grad_norm": 0.9095513820648193, + "learning_rate": 9.979101579458224e-06, + "loss": 0.8938, + "step": 1155 + }, + { + "epoch": 0.06362485552314381, + "grad_norm": 0.9584553241729736, + "learning_rate": 9.979061970510082e-06, + "loss": 0.8765, + "step": 1156 + }, + { + "epoch": 0.06367989432549948, + "grad_norm": 0.8742124438285828, + "learning_rate": 9.979022324140644e-06, + "loss": 0.8564, + "step": 1157 + }, + { + "epoch": 0.06373493312785514, + "grad_norm": 0.8776904344558716, + "learning_rate": 9.978982640350208e-06, + "loss": 0.8713, + "step": 1158 + }, + { + "epoch": 0.0637899719302108, + "grad_norm": 0.8667464852333069, + "learning_rate": 9.97894291913907e-06, + "loss": 0.8705, + "step": 1159 + }, + { + "epoch": 0.06384501073256646, + "grad_norm": 0.9028087854385376, + "learning_rate": 9.978903160507531e-06, + "loss": 0.8297, + "step": 1160 + }, + { + "epoch": 0.06390004953492212, + "grad_norm": 0.900812029838562, + "learning_rate": 9.978863364455887e-06, + "loss": 0.8456, + "step": 1161 + }, + { + "epoch": 0.06395508833727778, + "grad_norm": 0.9667207598686218, + "learning_rate": 9.97882353098444e-06, + "loss": 0.8081, + "step": 1162 + }, + { + "epoch": 0.06401012713963344, + "grad_norm": 0.8959711194038391, + "learning_rate": 9.978783660093488e-06, + "loss": 0.8455, + "step": 1163 + }, + { + "epoch": 0.0640651659419891, + "grad_norm": 0.8519117832183838, + "learning_rate": 9.97874375178333e-06, + "loss": 0.849, + "step": 1164 + }, + { + "epoch": 0.06412020474434477, + "grad_norm": 1.0532654523849487, + "learning_rate": 9.978703806054267e-06, + "loss": 0.7356, + "step": 1165 + }, + { + "epoch": 0.06417524354670043, + "grad_norm": 1.0313252210617065, + "learning_rate": 9.9786638229066e-06, + "loss": 1.024, + "step": 1166 + }, + { + "epoch": 0.06423028234905609, + "grad_norm": 1.0567537546157837, + "learning_rate": 9.978623802340627e-06, + "loss": 0.9423, + "step": 1167 + }, + { + "epoch": 0.06428532115141175, + "grad_norm": 0.8198097348213196, + "learning_rate": 9.97858374435665e-06, + "loss": 0.829, + "step": 1168 + }, + { + "epoch": 0.06434035995376741, + "grad_norm": 0.8718193173408508, + "learning_rate": 9.97854364895497e-06, + "loss": 0.7184, + "step": 1169 + }, + { + "epoch": 0.06439539875612307, + "grad_norm": 0.8037594556808472, + "learning_rate": 9.978503516135892e-06, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.06445043755847872, + "grad_norm": 0.9052229523658752, + "learning_rate": 9.978463345899709e-06, + "loss": 0.8016, + "step": 1171 + }, + { + "epoch": 0.06450547636083438, + "grad_norm": 1.0194638967514038, + "learning_rate": 9.978423138246731e-06, + "loss": 0.9045, + "step": 1172 + }, + { + "epoch": 0.06456051516319004, + "grad_norm": 0.953078031539917, + "learning_rate": 9.978382893177259e-06, + "loss": 0.9661, + "step": 1173 + }, + { + "epoch": 0.0646155539655457, + "grad_norm": 0.8777341842651367, + "learning_rate": 9.978342610691592e-06, + "loss": 0.8685, + "step": 1174 + }, + { + "epoch": 0.06467059276790137, + "grad_norm": 1.0178394317626953, + "learning_rate": 9.978302290790034e-06, + "loss": 0.9075, + "step": 1175 + }, + { + "epoch": 0.06472563157025703, + "grad_norm": 0.935694694519043, + "learning_rate": 9.978261933472889e-06, + "loss": 0.8438, + "step": 1176 + }, + { + "epoch": 0.06478067037261269, + "grad_norm": 1.0022411346435547, + "learning_rate": 9.97822153874046e-06, + "loss": 0.8701, + "step": 1177 + }, + { + "epoch": 0.06483570917496835, + "grad_norm": 1.0371203422546387, + "learning_rate": 9.97818110659305e-06, + "loss": 0.9111, + "step": 1178 + }, + { + "epoch": 0.06489074797732401, + "grad_norm": 0.7972478866577148, + "learning_rate": 9.978140637030963e-06, + "loss": 0.8602, + "step": 1179 + }, + { + "epoch": 0.06494578677967967, + "grad_norm": 0.8556679487228394, + "learning_rate": 9.978100130054505e-06, + "loss": 0.9149, + "step": 1180 + }, + { + "epoch": 0.06500082558203533, + "grad_norm": 0.92474365234375, + "learning_rate": 9.978059585663979e-06, + "loss": 0.8608, + "step": 1181 + }, + { + "epoch": 0.065055864384391, + "grad_norm": 1.0170830488204956, + "learning_rate": 9.978019003859687e-06, + "loss": 0.9986, + "step": 1182 + }, + { + "epoch": 0.06511090318674666, + "grad_norm": 0.9405049681663513, + "learning_rate": 9.97797838464194e-06, + "loss": 0.9023, + "step": 1183 + }, + { + "epoch": 0.06516594198910232, + "grad_norm": 0.9351203441619873, + "learning_rate": 9.977937728011038e-06, + "loss": 0.8698, + "step": 1184 + }, + { + "epoch": 0.06522098079145798, + "grad_norm": 0.8620241284370422, + "learning_rate": 9.97789703396729e-06, + "loss": 0.9393, + "step": 1185 + }, + { + "epoch": 0.06527601959381364, + "grad_norm": 0.9440441131591797, + "learning_rate": 9.977856302511e-06, + "loss": 0.8249, + "step": 1186 + }, + { + "epoch": 0.0653310583961693, + "grad_norm": 0.8311079144477844, + "learning_rate": 9.977815533642474e-06, + "loss": 0.8614, + "step": 1187 + }, + { + "epoch": 0.06538609719852496, + "grad_norm": 0.8911672830581665, + "learning_rate": 9.977774727362018e-06, + "loss": 0.7909, + "step": 1188 + }, + { + "epoch": 0.06544113600088063, + "grad_norm": 0.9237088561058044, + "learning_rate": 9.97773388366994e-06, + "loss": 0.7116, + "step": 1189 + }, + { + "epoch": 0.06549617480323629, + "grad_norm": 1.1155747175216675, + "learning_rate": 9.977693002566549e-06, + "loss": 0.9248, + "step": 1190 + }, + { + "epoch": 0.06555121360559195, + "grad_norm": 0.9386736750602722, + "learning_rate": 9.977652084052148e-06, + "loss": 0.8307, + "step": 1191 + }, + { + "epoch": 0.0656062524079476, + "grad_norm": 1.1666040420532227, + "learning_rate": 9.977611128127044e-06, + "loss": 0.9723, + "step": 1192 + }, + { + "epoch": 0.06566129121030326, + "grad_norm": 1.2366368770599365, + "learning_rate": 9.977570134791552e-06, + "loss": 0.8253, + "step": 1193 + }, + { + "epoch": 0.06571633001265892, + "grad_norm": 0.823443591594696, + "learning_rate": 9.977529104045971e-06, + "loss": 0.7472, + "step": 1194 + }, + { + "epoch": 0.06577136881501458, + "grad_norm": 0.9481683969497681, + "learning_rate": 9.977488035890617e-06, + "loss": 0.887, + "step": 1195 + }, + { + "epoch": 0.06582640761737024, + "grad_norm": 0.9120422005653381, + "learning_rate": 9.977446930325794e-06, + "loss": 0.867, + "step": 1196 + }, + { + "epoch": 0.0658814464197259, + "grad_norm": 0.8595587015151978, + "learning_rate": 9.977405787351811e-06, + "loss": 0.8532, + "step": 1197 + }, + { + "epoch": 0.06593648522208156, + "grad_norm": 0.8590419888496399, + "learning_rate": 9.97736460696898e-06, + "loss": 0.8998, + "step": 1198 + }, + { + "epoch": 0.06599152402443723, + "grad_norm": 0.9670939445495605, + "learning_rate": 9.977323389177609e-06, + "loss": 0.8964, + "step": 1199 + }, + { + "epoch": 0.06604656282679289, + "grad_norm": 0.8870261907577515, + "learning_rate": 9.977282133978006e-06, + "loss": 0.9542, + "step": 1200 + }, + { + "epoch": 0.06610160162914855, + "grad_norm": 0.942294180393219, + "learning_rate": 9.977240841370484e-06, + "loss": 0.8681, + "step": 1201 + }, + { + "epoch": 0.06615664043150421, + "grad_norm": 0.9632517099380493, + "learning_rate": 9.977199511355353e-06, + "loss": 0.7327, + "step": 1202 + }, + { + "epoch": 0.06621167923385987, + "grad_norm": 4.8085479736328125, + "learning_rate": 9.97715814393292e-06, + "loss": 0.8528, + "step": 1203 + }, + { + "epoch": 0.06626671803621553, + "grad_norm": 0.9084093570709229, + "learning_rate": 9.977116739103503e-06, + "loss": 0.7836, + "step": 1204 + }, + { + "epoch": 0.0663217568385712, + "grad_norm": 0.8961902260780334, + "learning_rate": 9.977075296867406e-06, + "loss": 0.854, + "step": 1205 + }, + { + "epoch": 0.06637679564092686, + "grad_norm": 0.8727987408638, + "learning_rate": 9.977033817224945e-06, + "loss": 0.7931, + "step": 1206 + }, + { + "epoch": 0.06643183444328252, + "grad_norm": 0.8263267874717712, + "learning_rate": 9.976992300176428e-06, + "loss": 0.852, + "step": 1207 + }, + { + "epoch": 0.06648687324563818, + "grad_norm": 1.0499639511108398, + "learning_rate": 9.97695074572217e-06, + "loss": 1.0427, + "step": 1208 + }, + { + "epoch": 0.06654191204799384, + "grad_norm": 0.9337313771247864, + "learning_rate": 9.976909153862482e-06, + "loss": 0.8035, + "step": 1209 + }, + { + "epoch": 0.0665969508503495, + "grad_norm": 0.8795992732048035, + "learning_rate": 9.976867524597678e-06, + "loss": 0.9022, + "step": 1210 + }, + { + "epoch": 0.06665198965270516, + "grad_norm": 0.9787294268608093, + "learning_rate": 9.976825857928069e-06, + "loss": 0.8259, + "step": 1211 + }, + { + "epoch": 0.06670702845506082, + "grad_norm": 0.8570082187652588, + "learning_rate": 9.976784153853969e-06, + "loss": 0.8567, + "step": 1212 + }, + { + "epoch": 0.06676206725741649, + "grad_norm": 1.0620380640029907, + "learning_rate": 9.976742412375694e-06, + "loss": 0.851, + "step": 1213 + }, + { + "epoch": 0.06681710605977213, + "grad_norm": 0.8545439839363098, + "learning_rate": 9.976700633493551e-06, + "loss": 0.8827, + "step": 1214 + }, + { + "epoch": 0.0668721448621278, + "grad_norm": 0.8543682098388672, + "learning_rate": 9.97665881720786e-06, + "loss": 0.8524, + "step": 1215 + }, + { + "epoch": 0.06692718366448346, + "grad_norm": 0.7748527526855469, + "learning_rate": 9.976616963518935e-06, + "loss": 0.7459, + "step": 1216 + }, + { + "epoch": 0.06698222246683912, + "grad_norm": 0.9876659512519836, + "learning_rate": 9.976575072427087e-06, + "loss": 0.8426, + "step": 1217 + }, + { + "epoch": 0.06703726126919478, + "grad_norm": 0.8763901591300964, + "learning_rate": 9.976533143932635e-06, + "loss": 0.8561, + "step": 1218 + }, + { + "epoch": 0.06709230007155044, + "grad_norm": 0.7816654443740845, + "learning_rate": 9.97649117803589e-06, + "loss": 0.8361, + "step": 1219 + }, + { + "epoch": 0.0671473388739061, + "grad_norm": 0.8659802675247192, + "learning_rate": 9.97644917473717e-06, + "loss": 0.897, + "step": 1220 + }, + { + "epoch": 0.06720237767626176, + "grad_norm": 0.9180877208709717, + "learning_rate": 9.97640713403679e-06, + "loss": 0.9516, + "step": 1221 + }, + { + "epoch": 0.06725741647861742, + "grad_norm": 0.9624410271644592, + "learning_rate": 9.976365055935067e-06, + "loss": 0.9119, + "step": 1222 + }, + { + "epoch": 0.06731245528097309, + "grad_norm": 0.8291105031967163, + "learning_rate": 9.976322940432314e-06, + "loss": 0.788, + "step": 1223 + }, + { + "epoch": 0.06736749408332875, + "grad_norm": 0.9858983755111694, + "learning_rate": 9.976280787528854e-06, + "loss": 0.8794, + "step": 1224 + }, + { + "epoch": 0.06742253288568441, + "grad_norm": 0.8283948302268982, + "learning_rate": 9.976238597224996e-06, + "loss": 0.8571, + "step": 1225 + }, + { + "epoch": 0.06747757168804007, + "grad_norm": 0.8585363626480103, + "learning_rate": 9.976196369521063e-06, + "loss": 0.9005, + "step": 1226 + }, + { + "epoch": 0.06753261049039573, + "grad_norm": 0.847882091999054, + "learning_rate": 9.976154104417369e-06, + "loss": 0.8058, + "step": 1227 + }, + { + "epoch": 0.06758764929275139, + "grad_norm": 0.9045611023902893, + "learning_rate": 9.976111801914232e-06, + "loss": 0.7864, + "step": 1228 + }, + { + "epoch": 0.06764268809510705, + "grad_norm": 0.805932879447937, + "learning_rate": 9.976069462011972e-06, + "loss": 0.8436, + "step": 1229 + }, + { + "epoch": 0.06769772689746271, + "grad_norm": 0.8809003233909607, + "learning_rate": 9.976027084710906e-06, + "loss": 0.7876, + "step": 1230 + }, + { + "epoch": 0.06775276569981838, + "grad_norm": 0.8681740760803223, + "learning_rate": 9.975984670011352e-06, + "loss": 0.877, + "step": 1231 + }, + { + "epoch": 0.06780780450217404, + "grad_norm": 0.9909854531288147, + "learning_rate": 9.975942217913627e-06, + "loss": 0.8957, + "step": 1232 + }, + { + "epoch": 0.0678628433045297, + "grad_norm": 0.9213934540748596, + "learning_rate": 9.975899728418056e-06, + "loss": 0.8344, + "step": 1233 + }, + { + "epoch": 0.06791788210688536, + "grad_norm": 0.8289967179298401, + "learning_rate": 9.975857201524952e-06, + "loss": 0.876, + "step": 1234 + }, + { + "epoch": 0.06797292090924101, + "grad_norm": 0.891812264919281, + "learning_rate": 9.97581463723464e-06, + "loss": 0.8611, + "step": 1235 + }, + { + "epoch": 0.06802795971159667, + "grad_norm": 1.0301382541656494, + "learning_rate": 9.975772035547435e-06, + "loss": 0.8177, + "step": 1236 + }, + { + "epoch": 0.06808299851395233, + "grad_norm": 0.8380662798881531, + "learning_rate": 9.975729396463659e-06, + "loss": 0.8631, + "step": 1237 + }, + { + "epoch": 0.06813803731630799, + "grad_norm": 0.9226046204566956, + "learning_rate": 9.975686719983633e-06, + "loss": 0.8927, + "step": 1238 + }, + { + "epoch": 0.06819307611866365, + "grad_norm": 0.8917136192321777, + "learning_rate": 9.975644006107679e-06, + "loss": 0.9048, + "step": 1239 + }, + { + "epoch": 0.06824811492101931, + "grad_norm": 0.8559191226959229, + "learning_rate": 9.975601254836114e-06, + "loss": 0.8169, + "step": 1240 + }, + { + "epoch": 0.06830315372337498, + "grad_norm": 0.9345341920852661, + "learning_rate": 9.975558466169263e-06, + "loss": 0.7929, + "step": 1241 + }, + { + "epoch": 0.06835819252573064, + "grad_norm": 0.9155850410461426, + "learning_rate": 9.975515640107447e-06, + "loss": 0.8825, + "step": 1242 + }, + { + "epoch": 0.0684132313280863, + "grad_norm": 0.899712860584259, + "learning_rate": 9.975472776650987e-06, + "loss": 0.825, + "step": 1243 + }, + { + "epoch": 0.06846827013044196, + "grad_norm": 0.8280880451202393, + "learning_rate": 9.975429875800206e-06, + "loss": 0.8539, + "step": 1244 + }, + { + "epoch": 0.06852330893279762, + "grad_norm": 0.9589636325836182, + "learning_rate": 9.975386937555426e-06, + "loss": 0.9465, + "step": 1245 + }, + { + "epoch": 0.06857834773515328, + "grad_norm": 1.1027253866195679, + "learning_rate": 9.97534396191697e-06, + "loss": 0.87, + "step": 1246 + }, + { + "epoch": 0.06863338653750894, + "grad_norm": 1.0510318279266357, + "learning_rate": 9.975300948885158e-06, + "loss": 0.8569, + "step": 1247 + }, + { + "epoch": 0.0686884253398646, + "grad_norm": 0.8897958397865295, + "learning_rate": 9.975257898460317e-06, + "loss": 0.8431, + "step": 1248 + }, + { + "epoch": 0.06874346414222027, + "grad_norm": 0.8827036619186401, + "learning_rate": 9.975214810642771e-06, + "loss": 0.922, + "step": 1249 + }, + { + "epoch": 0.06879850294457593, + "grad_norm": 0.8798324465751648, + "learning_rate": 9.97517168543284e-06, + "loss": 0.7837, + "step": 1250 + }, + { + "epoch": 0.06885354174693159, + "grad_norm": 0.9053803086280823, + "learning_rate": 9.975128522830853e-06, + "loss": 0.82, + "step": 1251 + }, + { + "epoch": 0.06890858054928725, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.975085322837129e-06, + "loss": 0.7684, + "step": 1252 + }, + { + "epoch": 0.06896361935164291, + "grad_norm": 0.8898602724075317, + "learning_rate": 9.975042085451997e-06, + "loss": 0.8205, + "step": 1253 + }, + { + "epoch": 0.06901865815399857, + "grad_norm": 0.9210274815559387, + "learning_rate": 9.97499881067578e-06, + "loss": 0.8364, + "step": 1254 + }, + { + "epoch": 0.06907369695635424, + "grad_norm": 1.0881952047348022, + "learning_rate": 9.974955498508804e-06, + "loss": 0.8234, + "step": 1255 + }, + { + "epoch": 0.0691287357587099, + "grad_norm": 0.8875024914741516, + "learning_rate": 9.974912148951394e-06, + "loss": 0.7974, + "step": 1256 + }, + { + "epoch": 0.06918377456106554, + "grad_norm": 0.9065666794776917, + "learning_rate": 9.974868762003876e-06, + "loss": 0.7721, + "step": 1257 + }, + { + "epoch": 0.0692388133634212, + "grad_norm": 0.8904553651809692, + "learning_rate": 9.974825337666576e-06, + "loss": 0.8551, + "step": 1258 + }, + { + "epoch": 0.06929385216577687, + "grad_norm": 0.8586102724075317, + "learning_rate": 9.974781875939821e-06, + "loss": 0.8666, + "step": 1259 + }, + { + "epoch": 0.06934889096813253, + "grad_norm": 0.9103402495384216, + "learning_rate": 9.974738376823935e-06, + "loss": 0.8361, + "step": 1260 + }, + { + "epoch": 0.06940392977048819, + "grad_norm": 0.8657701015472412, + "learning_rate": 9.974694840319249e-06, + "loss": 0.8217, + "step": 1261 + }, + { + "epoch": 0.06945896857284385, + "grad_norm": 0.865703821182251, + "learning_rate": 9.974651266426088e-06, + "loss": 0.8751, + "step": 1262 + }, + { + "epoch": 0.06951400737519951, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.974607655144779e-06, + "loss": 0.8709, + "step": 1263 + }, + { + "epoch": 0.06956904617755517, + "grad_norm": 0.8417405486106873, + "learning_rate": 9.97456400647565e-06, + "loss": 0.8104, + "step": 1264 + }, + { + "epoch": 0.06962408497991084, + "grad_norm": 0.8578035235404968, + "learning_rate": 9.974520320419032e-06, + "loss": 0.9173, + "step": 1265 + }, + { + "epoch": 0.0696791237822665, + "grad_norm": 0.957539439201355, + "learning_rate": 9.974476596975249e-06, + "loss": 0.8955, + "step": 1266 + }, + { + "epoch": 0.06973416258462216, + "grad_norm": 0.851222038269043, + "learning_rate": 9.974432836144632e-06, + "loss": 0.8696, + "step": 1267 + }, + { + "epoch": 0.06978920138697782, + "grad_norm": 0.8178789615631104, + "learning_rate": 9.974389037927508e-06, + "loss": 0.7921, + "step": 1268 + }, + { + "epoch": 0.06984424018933348, + "grad_norm": 0.954091489315033, + "learning_rate": 9.97434520232421e-06, + "loss": 0.9362, + "step": 1269 + }, + { + "epoch": 0.06989927899168914, + "grad_norm": 0.8525053858757019, + "learning_rate": 9.974301329335063e-06, + "loss": 0.7996, + "step": 1270 + }, + { + "epoch": 0.0699543177940448, + "grad_norm": 0.9340476393699646, + "learning_rate": 9.9742574189604e-06, + "loss": 0.9091, + "step": 1271 + }, + { + "epoch": 0.07000935659640047, + "grad_norm": 0.7946187257766724, + "learning_rate": 9.974213471200548e-06, + "loss": 0.874, + "step": 1272 + }, + { + "epoch": 0.07006439539875613, + "grad_norm": 0.8048381209373474, + "learning_rate": 9.97416948605584e-06, + "loss": 0.8557, + "step": 1273 + }, + { + "epoch": 0.07011943420111179, + "grad_norm": 0.9849064946174622, + "learning_rate": 9.974125463526607e-06, + "loss": 0.8154, + "step": 1274 + }, + { + "epoch": 0.07017447300346745, + "grad_norm": 0.9030239582061768, + "learning_rate": 9.974081403613178e-06, + "loss": 0.9411, + "step": 1275 + }, + { + "epoch": 0.07022951180582311, + "grad_norm": 0.8869300484657288, + "learning_rate": 9.974037306315882e-06, + "loss": 0.8978, + "step": 1276 + }, + { + "epoch": 0.07028455060817877, + "grad_norm": 0.8558536767959595, + "learning_rate": 9.973993171635057e-06, + "loss": 0.8937, + "step": 1277 + }, + { + "epoch": 0.07033958941053442, + "grad_norm": 0.9005453586578369, + "learning_rate": 9.973948999571029e-06, + "loss": 0.9336, + "step": 1278 + }, + { + "epoch": 0.07039462821289008, + "grad_norm": 0.8489978909492493, + "learning_rate": 9.973904790124131e-06, + "loss": 0.8267, + "step": 1279 + }, + { + "epoch": 0.07044966701524574, + "grad_norm": 0.8295948505401611, + "learning_rate": 9.973860543294696e-06, + "loss": 0.8478, + "step": 1280 + }, + { + "epoch": 0.0705047058176014, + "grad_norm": 0.8111379742622375, + "learning_rate": 9.973816259083058e-06, + "loss": 0.8333, + "step": 1281 + }, + { + "epoch": 0.07055974461995707, + "grad_norm": 0.9380189776420593, + "learning_rate": 9.973771937489547e-06, + "loss": 0.9718, + "step": 1282 + }, + { + "epoch": 0.07061478342231273, + "grad_norm": 1.251194953918457, + "learning_rate": 9.973727578514499e-06, + "loss": 0.9531, + "step": 1283 + }, + { + "epoch": 0.07066982222466839, + "grad_norm": 0.9897224307060242, + "learning_rate": 9.973683182158243e-06, + "loss": 0.7853, + "step": 1284 + }, + { + "epoch": 0.07072486102702405, + "grad_norm": 0.8409335017204285, + "learning_rate": 9.973638748421119e-06, + "loss": 0.7692, + "step": 1285 + }, + { + "epoch": 0.07077989982937971, + "grad_norm": 0.9019681215286255, + "learning_rate": 9.973594277303456e-06, + "loss": 0.8135, + "step": 1286 + }, + { + "epoch": 0.07083493863173537, + "grad_norm": 0.9236096739768982, + "learning_rate": 9.973549768805588e-06, + "loss": 0.9304, + "step": 1287 + }, + { + "epoch": 0.07088997743409103, + "grad_norm": 0.9244743585586548, + "learning_rate": 9.973505222927854e-06, + "loss": 0.9056, + "step": 1288 + }, + { + "epoch": 0.0709450162364467, + "grad_norm": 1.3418753147125244, + "learning_rate": 9.973460639670585e-06, + "loss": 0.8419, + "step": 1289 + }, + { + "epoch": 0.07100005503880236, + "grad_norm": 0.8715767860412598, + "learning_rate": 9.973416019034117e-06, + "loss": 0.9704, + "step": 1290 + }, + { + "epoch": 0.07105509384115802, + "grad_norm": 0.9609012007713318, + "learning_rate": 9.973371361018787e-06, + "loss": 0.8807, + "step": 1291 + }, + { + "epoch": 0.07111013264351368, + "grad_norm": 0.8085873126983643, + "learning_rate": 9.973326665624927e-06, + "loss": 0.7947, + "step": 1292 + }, + { + "epoch": 0.07116517144586934, + "grad_norm": 0.919280469417572, + "learning_rate": 9.973281932852877e-06, + "loss": 0.9743, + "step": 1293 + }, + { + "epoch": 0.071220210248225, + "grad_norm": 1.0651074647903442, + "learning_rate": 9.973237162702968e-06, + "loss": 0.7164, + "step": 1294 + }, + { + "epoch": 0.07127524905058066, + "grad_norm": 0.987251341342926, + "learning_rate": 9.973192355175542e-06, + "loss": 0.9286, + "step": 1295 + }, + { + "epoch": 0.07133028785293632, + "grad_norm": 1.5507274866104126, + "learning_rate": 9.973147510270935e-06, + "loss": 0.9733, + "step": 1296 + }, + { + "epoch": 0.07138532665529199, + "grad_norm": 0.8439416885375977, + "learning_rate": 9.97310262798948e-06, + "loss": 0.7462, + "step": 1297 + }, + { + "epoch": 0.07144036545764765, + "grad_norm": 0.9604889750480652, + "learning_rate": 9.973057708331519e-06, + "loss": 1.0006, + "step": 1298 + }, + { + "epoch": 0.07149540426000331, + "grad_norm": 0.8568960428237915, + "learning_rate": 9.973012751297386e-06, + "loss": 0.878, + "step": 1299 + }, + { + "epoch": 0.07155044306235896, + "grad_norm": 0.8169522285461426, + "learning_rate": 9.972967756887419e-06, + "loss": 0.8241, + "step": 1300 + }, + { + "epoch": 0.07160548186471462, + "grad_norm": 0.875738799571991, + "learning_rate": 9.97292272510196e-06, + "loss": 0.854, + "step": 1301 + }, + { + "epoch": 0.07166052066707028, + "grad_norm": 0.7877739071846008, + "learning_rate": 9.972877655941345e-06, + "loss": 0.779, + "step": 1302 + }, + { + "epoch": 0.07171555946942594, + "grad_norm": 0.8148574829101562, + "learning_rate": 9.972832549405912e-06, + "loss": 0.6965, + "step": 1303 + }, + { + "epoch": 0.0717705982717816, + "grad_norm": 0.936720609664917, + "learning_rate": 9.972787405495998e-06, + "loss": 0.798, + "step": 1304 + }, + { + "epoch": 0.07182563707413726, + "grad_norm": 0.8932886123657227, + "learning_rate": 9.972742224211949e-06, + "loss": 0.9196, + "step": 1305 + }, + { + "epoch": 0.07188067587649292, + "grad_norm": 0.899246871471405, + "learning_rate": 9.972697005554099e-06, + "loss": 0.8081, + "step": 1306 + }, + { + "epoch": 0.07193571467884859, + "grad_norm": 0.8789899349212646, + "learning_rate": 9.972651749522788e-06, + "loss": 0.89, + "step": 1307 + }, + { + "epoch": 0.07199075348120425, + "grad_norm": 1.2412173748016357, + "learning_rate": 9.97260645611836e-06, + "loss": 0.9866, + "step": 1308 + }, + { + "epoch": 0.07204579228355991, + "grad_norm": 0.8655833005905151, + "learning_rate": 9.972561125341152e-06, + "loss": 0.8144, + "step": 1309 + }, + { + "epoch": 0.07210083108591557, + "grad_norm": 0.8705299496650696, + "learning_rate": 9.972515757191506e-06, + "loss": 0.8431, + "step": 1310 + }, + { + "epoch": 0.07215586988827123, + "grad_norm": 0.8813188672065735, + "learning_rate": 9.972470351669761e-06, + "loss": 0.859, + "step": 1311 + }, + { + "epoch": 0.0722109086906269, + "grad_norm": 2.043627977371216, + "learning_rate": 9.972424908776262e-06, + "loss": 0.9886, + "step": 1312 + }, + { + "epoch": 0.07226594749298255, + "grad_norm": 0.9167500734329224, + "learning_rate": 9.972379428511348e-06, + "loss": 0.7203, + "step": 1313 + }, + { + "epoch": 0.07232098629533822, + "grad_norm": 1.3145136833190918, + "learning_rate": 9.972333910875358e-06, + "loss": 0.9325, + "step": 1314 + }, + { + "epoch": 0.07237602509769388, + "grad_norm": 0.834710419178009, + "learning_rate": 9.972288355868641e-06, + "loss": 0.9361, + "step": 1315 + }, + { + "epoch": 0.07243106390004954, + "grad_norm": 0.9039230942726135, + "learning_rate": 9.972242763491535e-06, + "loss": 0.8027, + "step": 1316 + }, + { + "epoch": 0.0724861027024052, + "grad_norm": 0.8911495208740234, + "learning_rate": 9.972197133744384e-06, + "loss": 0.951, + "step": 1317 + }, + { + "epoch": 0.07254114150476086, + "grad_norm": 1.0752439498901367, + "learning_rate": 9.972151466627529e-06, + "loss": 0.8421, + "step": 1318 + }, + { + "epoch": 0.07259618030711652, + "grad_norm": 0.926135778427124, + "learning_rate": 9.972105762141314e-06, + "loss": 0.8901, + "step": 1319 + }, + { + "epoch": 0.07265121910947218, + "grad_norm": 0.8166295289993286, + "learning_rate": 9.972060020286085e-06, + "loss": 0.7845, + "step": 1320 + }, + { + "epoch": 0.07270625791182783, + "grad_norm": 1.0000934600830078, + "learning_rate": 9.972014241062182e-06, + "loss": 0.8383, + "step": 1321 + }, + { + "epoch": 0.0727612967141835, + "grad_norm": 1.2617899179458618, + "learning_rate": 9.971968424469951e-06, + "loss": 0.9826, + "step": 1322 + }, + { + "epoch": 0.07281633551653915, + "grad_norm": 0.8451040983200073, + "learning_rate": 9.971922570509738e-06, + "loss": 0.8262, + "step": 1323 + }, + { + "epoch": 0.07287137431889482, + "grad_norm": 0.8101939558982849, + "learning_rate": 9.971876679181884e-06, + "loss": 0.6904, + "step": 1324 + }, + { + "epoch": 0.07292641312125048, + "grad_norm": 0.8805514574050903, + "learning_rate": 9.971830750486736e-06, + "loss": 0.8491, + "step": 1325 + }, + { + "epoch": 0.07298145192360614, + "grad_norm": 0.8236901164054871, + "learning_rate": 9.97178478442464e-06, + "loss": 0.8462, + "step": 1326 + }, + { + "epoch": 0.0730364907259618, + "grad_norm": 0.9183042645454407, + "learning_rate": 9.971738780995938e-06, + "loss": 0.7577, + "step": 1327 + }, + { + "epoch": 0.07309152952831746, + "grad_norm": 0.8425934314727783, + "learning_rate": 9.971692740200982e-06, + "loss": 0.8462, + "step": 1328 + }, + { + "epoch": 0.07314656833067312, + "grad_norm": 0.9114993214607239, + "learning_rate": 9.971646662040112e-06, + "loss": 0.9132, + "step": 1329 + }, + { + "epoch": 0.07320160713302878, + "grad_norm": 0.8516649603843689, + "learning_rate": 9.971600546513675e-06, + "loss": 0.8819, + "step": 1330 + }, + { + "epoch": 0.07325664593538445, + "grad_norm": 1.0859558582305908, + "learning_rate": 9.971554393622023e-06, + "loss": 0.9929, + "step": 1331 + }, + { + "epoch": 0.07331168473774011, + "grad_norm": 0.8906900882720947, + "learning_rate": 9.971508203365497e-06, + "loss": 0.9166, + "step": 1332 + }, + { + "epoch": 0.07336672354009577, + "grad_norm": 0.8931803703308105, + "learning_rate": 9.971461975744445e-06, + "loss": 0.864, + "step": 1333 + }, + { + "epoch": 0.07342176234245143, + "grad_norm": 0.8404982686042786, + "learning_rate": 9.971415710759216e-06, + "loss": 0.8609, + "step": 1334 + }, + { + "epoch": 0.07347680114480709, + "grad_norm": 0.8016490340232849, + "learning_rate": 9.971369408410157e-06, + "loss": 0.7694, + "step": 1335 + }, + { + "epoch": 0.07353183994716275, + "grad_norm": 0.7700600028038025, + "learning_rate": 9.971323068697618e-06, + "loss": 0.7875, + "step": 1336 + }, + { + "epoch": 0.07358687874951841, + "grad_norm": 0.8679799437522888, + "learning_rate": 9.971276691621946e-06, + "loss": 0.8409, + "step": 1337 + }, + { + "epoch": 0.07364191755187408, + "grad_norm": 0.8329173922538757, + "learning_rate": 9.971230277183486e-06, + "loss": 0.8707, + "step": 1338 + }, + { + "epoch": 0.07369695635422974, + "grad_norm": 0.8790140151977539, + "learning_rate": 9.97118382538259e-06, + "loss": 0.7631, + "step": 1339 + }, + { + "epoch": 0.0737519951565854, + "grad_norm": 1.1895341873168945, + "learning_rate": 9.97113733621961e-06, + "loss": 0.8555, + "step": 1340 + }, + { + "epoch": 0.07380703395894106, + "grad_norm": 0.8531593680381775, + "learning_rate": 9.97109080969489e-06, + "loss": 0.7192, + "step": 1341 + }, + { + "epoch": 0.07386207276129672, + "grad_norm": 1.0388946533203125, + "learning_rate": 9.971044245808784e-06, + "loss": 0.8182, + "step": 1342 + }, + { + "epoch": 0.07391711156365237, + "grad_norm": 0.8858556747436523, + "learning_rate": 9.970997644561639e-06, + "loss": 0.7981, + "step": 1343 + }, + { + "epoch": 0.07397215036600803, + "grad_norm": 0.8710204362869263, + "learning_rate": 9.970951005953807e-06, + "loss": 0.7667, + "step": 1344 + }, + { + "epoch": 0.07402718916836369, + "grad_norm": 0.9788708090782166, + "learning_rate": 9.970904329985638e-06, + "loss": 0.9693, + "step": 1345 + }, + { + "epoch": 0.07408222797071935, + "grad_norm": 0.7805914878845215, + "learning_rate": 9.970857616657482e-06, + "loss": 0.6683, + "step": 1346 + }, + { + "epoch": 0.07413726677307501, + "grad_norm": 0.9977933168411255, + "learning_rate": 9.97081086596969e-06, + "loss": 0.8288, + "step": 1347 + }, + { + "epoch": 0.07419230557543068, + "grad_norm": 0.829115629196167, + "learning_rate": 9.970764077922617e-06, + "loss": 0.8361, + "step": 1348 + }, + { + "epoch": 0.07424734437778634, + "grad_norm": 1.226120114326477, + "learning_rate": 9.97071725251661e-06, + "loss": 1.0008, + "step": 1349 + }, + { + "epoch": 0.074302383180142, + "grad_norm": 0.8997750878334045, + "learning_rate": 9.970670389752021e-06, + "loss": 0.8048, + "step": 1350 + }, + { + "epoch": 0.07435742198249766, + "grad_norm": 1.0885238647460938, + "learning_rate": 9.970623489629205e-06, + "loss": 0.9202, + "step": 1351 + }, + { + "epoch": 0.07441246078485332, + "grad_norm": 0.8736100792884827, + "learning_rate": 9.970576552148515e-06, + "loss": 0.8515, + "step": 1352 + }, + { + "epoch": 0.07446749958720898, + "grad_norm": 0.9211294651031494, + "learning_rate": 9.970529577310301e-06, + "loss": 0.9389, + "step": 1353 + }, + { + "epoch": 0.07452253838956464, + "grad_norm": 0.9334765672683716, + "learning_rate": 9.970482565114917e-06, + "loss": 0.8165, + "step": 1354 + }, + { + "epoch": 0.0745775771919203, + "grad_norm": 0.8307162523269653, + "learning_rate": 9.970435515562717e-06, + "loss": 0.7829, + "step": 1355 + }, + { + "epoch": 0.07463261599427597, + "grad_norm": 0.987634003162384, + "learning_rate": 9.970388428654055e-06, + "loss": 0.848, + "step": 1356 + }, + { + "epoch": 0.07468765479663163, + "grad_norm": 1.094752311706543, + "learning_rate": 9.970341304389281e-06, + "loss": 1.003, + "step": 1357 + }, + { + "epoch": 0.07474269359898729, + "grad_norm": 0.9865909814834595, + "learning_rate": 9.970294142768755e-06, + "loss": 0.9116, + "step": 1358 + }, + { + "epoch": 0.07479773240134295, + "grad_norm": 0.8404149413108826, + "learning_rate": 9.970246943792828e-06, + "loss": 0.8699, + "step": 1359 + }, + { + "epoch": 0.07485277120369861, + "grad_norm": 0.9602416753768921, + "learning_rate": 9.970199707461855e-06, + "loss": 0.8166, + "step": 1360 + }, + { + "epoch": 0.07490781000605427, + "grad_norm": 0.9748693704605103, + "learning_rate": 9.970152433776193e-06, + "loss": 0.8767, + "step": 1361 + }, + { + "epoch": 0.07496284880840993, + "grad_norm": 0.8721657991409302, + "learning_rate": 9.970105122736194e-06, + "loss": 0.8825, + "step": 1362 + }, + { + "epoch": 0.0750178876107656, + "grad_norm": 0.8683610558509827, + "learning_rate": 9.970057774342215e-06, + "loss": 0.7873, + "step": 1363 + }, + { + "epoch": 0.07507292641312124, + "grad_norm": 0.856396496295929, + "learning_rate": 9.970010388594613e-06, + "loss": 0.8505, + "step": 1364 + }, + { + "epoch": 0.0751279652154769, + "grad_norm": 1.0709880590438843, + "learning_rate": 9.969962965493744e-06, + "loss": 0.9519, + "step": 1365 + }, + { + "epoch": 0.07518300401783257, + "grad_norm": 0.8839450478553772, + "learning_rate": 9.969915505039963e-06, + "loss": 0.8041, + "step": 1366 + }, + { + "epoch": 0.07523804282018823, + "grad_norm": 0.89545738697052, + "learning_rate": 9.969868007233627e-06, + "loss": 0.8713, + "step": 1367 + }, + { + "epoch": 0.07529308162254389, + "grad_norm": 0.9870849251747131, + "learning_rate": 9.969820472075094e-06, + "loss": 0.8655, + "step": 1368 + }, + { + "epoch": 0.07534812042489955, + "grad_norm": 1.3123797178268433, + "learning_rate": 9.96977289956472e-06, + "loss": 1.0425, + "step": 1369 + }, + { + "epoch": 0.07540315922725521, + "grad_norm": 0.8538400530815125, + "learning_rate": 9.969725289702865e-06, + "loss": 0.7052, + "step": 1370 + }, + { + "epoch": 0.07545819802961087, + "grad_norm": 0.933397114276886, + "learning_rate": 9.969677642489884e-06, + "loss": 0.9819, + "step": 1371 + }, + { + "epoch": 0.07551323683196653, + "grad_norm": 0.8428112268447876, + "learning_rate": 9.969629957926134e-06, + "loss": 0.7313, + "step": 1372 + }, + { + "epoch": 0.0755682756343222, + "grad_norm": 0.9023239612579346, + "learning_rate": 9.96958223601198e-06, + "loss": 0.8297, + "step": 1373 + }, + { + "epoch": 0.07562331443667786, + "grad_norm": 0.8971324563026428, + "learning_rate": 9.969534476747771e-06, + "loss": 0.8832, + "step": 1374 + }, + { + "epoch": 0.07567835323903352, + "grad_norm": 0.8709388375282288, + "learning_rate": 9.969486680133874e-06, + "loss": 0.743, + "step": 1375 + }, + { + "epoch": 0.07573339204138918, + "grad_norm": 0.9094591736793518, + "learning_rate": 9.969438846170644e-06, + "loss": 0.8294, + "step": 1376 + }, + { + "epoch": 0.07578843084374484, + "grad_norm": 1.0753988027572632, + "learning_rate": 9.969390974858444e-06, + "loss": 0.7479, + "step": 1377 + }, + { + "epoch": 0.0758434696461005, + "grad_norm": 0.933775007724762, + "learning_rate": 9.96934306619763e-06, + "loss": 0.8235, + "step": 1378 + }, + { + "epoch": 0.07589850844845616, + "grad_norm": 0.8419735431671143, + "learning_rate": 9.969295120188565e-06, + "loss": 0.8103, + "step": 1379 + }, + { + "epoch": 0.07595354725081183, + "grad_norm": 0.8912790417671204, + "learning_rate": 9.969247136831606e-06, + "loss": 0.911, + "step": 1380 + }, + { + "epoch": 0.07600858605316749, + "grad_norm": 0.8780983090400696, + "learning_rate": 9.969199116127118e-06, + "loss": 0.8619, + "step": 1381 + }, + { + "epoch": 0.07606362485552315, + "grad_norm": 0.8503809571266174, + "learning_rate": 9.969151058075459e-06, + "loss": 0.8093, + "step": 1382 + }, + { + "epoch": 0.07611866365787881, + "grad_norm": 0.8633087277412415, + "learning_rate": 9.96910296267699e-06, + "loss": 0.7524, + "step": 1383 + }, + { + "epoch": 0.07617370246023447, + "grad_norm": 1.1203595399856567, + "learning_rate": 9.969054829932074e-06, + "loss": 0.945, + "step": 1384 + }, + { + "epoch": 0.07622874126259013, + "grad_norm": 0.8766878843307495, + "learning_rate": 9.969006659841072e-06, + "loss": 0.7537, + "step": 1385 + }, + { + "epoch": 0.07628378006494578, + "grad_norm": 0.9795958399772644, + "learning_rate": 9.968958452404345e-06, + "loss": 0.7963, + "step": 1386 + }, + { + "epoch": 0.07633881886730144, + "grad_norm": 0.9117506146430969, + "learning_rate": 9.968910207622257e-06, + "loss": 0.9469, + "step": 1387 + }, + { + "epoch": 0.0763938576696571, + "grad_norm": 0.9731466770172119, + "learning_rate": 9.96886192549517e-06, + "loss": 0.9536, + "step": 1388 + }, + { + "epoch": 0.07644889647201276, + "grad_norm": 0.8923571109771729, + "learning_rate": 9.968813606023446e-06, + "loss": 0.8362, + "step": 1389 + }, + { + "epoch": 0.07650393527436843, + "grad_norm": 0.8819600343704224, + "learning_rate": 9.96876524920745e-06, + "loss": 0.6938, + "step": 1390 + }, + { + "epoch": 0.07655897407672409, + "grad_norm": 0.9629887342453003, + "learning_rate": 9.968716855047545e-06, + "loss": 0.9104, + "step": 1391 + }, + { + "epoch": 0.07661401287907975, + "grad_norm": 0.992770254611969, + "learning_rate": 9.968668423544093e-06, + "loss": 0.944, + "step": 1392 + }, + { + "epoch": 0.07666905168143541, + "grad_norm": 0.8578491806983948, + "learning_rate": 9.96861995469746e-06, + "loss": 0.898, + "step": 1393 + }, + { + "epoch": 0.07672409048379107, + "grad_norm": 1.1169229745864868, + "learning_rate": 9.968571448508008e-06, + "loss": 0.8324, + "step": 1394 + }, + { + "epoch": 0.07677912928614673, + "grad_norm": 0.9600160121917725, + "learning_rate": 9.968522904976106e-06, + "loss": 0.9519, + "step": 1395 + }, + { + "epoch": 0.0768341680885024, + "grad_norm": 0.8271373510360718, + "learning_rate": 9.968474324102112e-06, + "loss": 0.8576, + "step": 1396 + }, + { + "epoch": 0.07688920689085806, + "grad_norm": 0.9437325596809387, + "learning_rate": 9.968425705886397e-06, + "loss": 0.9201, + "step": 1397 + }, + { + "epoch": 0.07694424569321372, + "grad_norm": 0.8679039478302002, + "learning_rate": 9.968377050329325e-06, + "loss": 0.8893, + "step": 1398 + }, + { + "epoch": 0.07699928449556938, + "grad_norm": 1.0178717374801636, + "learning_rate": 9.96832835743126e-06, + "loss": 0.9718, + "step": 1399 + }, + { + "epoch": 0.07705432329792504, + "grad_norm": 0.8354432582855225, + "learning_rate": 9.96827962719257e-06, + "loss": 0.83, + "step": 1400 + }, + { + "epoch": 0.0771093621002807, + "grad_norm": 1.2244631052017212, + "learning_rate": 9.968230859613619e-06, + "loss": 0.907, + "step": 1401 + }, + { + "epoch": 0.07716440090263636, + "grad_norm": 0.9099625945091248, + "learning_rate": 9.968182054694775e-06, + "loss": 0.809, + "step": 1402 + }, + { + "epoch": 0.07721943970499202, + "grad_norm": 0.8591424226760864, + "learning_rate": 9.968133212436404e-06, + "loss": 0.8869, + "step": 1403 + }, + { + "epoch": 0.07727447850734769, + "grad_norm": 1.068003535270691, + "learning_rate": 9.968084332838876e-06, + "loss": 0.8747, + "step": 1404 + }, + { + "epoch": 0.07732951730970335, + "grad_norm": 0.8503691554069519, + "learning_rate": 9.968035415902555e-06, + "loss": 0.7478, + "step": 1405 + }, + { + "epoch": 0.07738455611205901, + "grad_norm": 0.9209537506103516, + "learning_rate": 9.967986461627808e-06, + "loss": 0.9052, + "step": 1406 + }, + { + "epoch": 0.07743959491441466, + "grad_norm": 0.8447962999343872, + "learning_rate": 9.967937470015006e-06, + "loss": 0.7897, + "step": 1407 + }, + { + "epoch": 0.07749463371677032, + "grad_norm": 0.8731846809387207, + "learning_rate": 9.967888441064515e-06, + "loss": 0.837, + "step": 1408 + }, + { + "epoch": 0.07754967251912598, + "grad_norm": 0.9810444712638855, + "learning_rate": 9.967839374776705e-06, + "loss": 0.8236, + "step": 1409 + }, + { + "epoch": 0.07760471132148164, + "grad_norm": 0.8283190131187439, + "learning_rate": 9.967790271151944e-06, + "loss": 0.8443, + "step": 1410 + }, + { + "epoch": 0.0776597501238373, + "grad_norm": 0.7999932765960693, + "learning_rate": 9.9677411301906e-06, + "loss": 0.7945, + "step": 1411 + }, + { + "epoch": 0.07771478892619296, + "grad_norm": 0.9435983300209045, + "learning_rate": 9.967691951893044e-06, + "loss": 0.9745, + "step": 1412 + }, + { + "epoch": 0.07776982772854862, + "grad_norm": 0.8885984420776367, + "learning_rate": 9.967642736259646e-06, + "loss": 0.9163, + "step": 1413 + }, + { + "epoch": 0.07782486653090429, + "grad_norm": 0.993928074836731, + "learning_rate": 9.967593483290776e-06, + "loss": 0.7797, + "step": 1414 + }, + { + "epoch": 0.07787990533325995, + "grad_norm": 1.058830976486206, + "learning_rate": 9.9675441929868e-06, + "loss": 0.8671, + "step": 1415 + }, + { + "epoch": 0.07793494413561561, + "grad_norm": 1.0469766855239868, + "learning_rate": 9.967494865348093e-06, + "loss": 0.8671, + "step": 1416 + }, + { + "epoch": 0.07798998293797127, + "grad_norm": 0.902729868888855, + "learning_rate": 9.967445500375025e-06, + "loss": 0.8748, + "step": 1417 + }, + { + "epoch": 0.07804502174032693, + "grad_norm": 0.90755295753479, + "learning_rate": 9.967396098067965e-06, + "loss": 0.8279, + "step": 1418 + }, + { + "epoch": 0.07810006054268259, + "grad_norm": 0.8822374939918518, + "learning_rate": 9.967346658427287e-06, + "loss": 0.9386, + "step": 1419 + }, + { + "epoch": 0.07815509934503825, + "grad_norm": 0.9201469421386719, + "learning_rate": 9.96729718145336e-06, + "loss": 0.8684, + "step": 1420 + }, + { + "epoch": 0.07821013814739392, + "grad_norm": 0.9451109766960144, + "learning_rate": 9.967247667146558e-06, + "loss": 0.7854, + "step": 1421 + }, + { + "epoch": 0.07826517694974958, + "grad_norm": 0.9146197438240051, + "learning_rate": 9.96719811550725e-06, + "loss": 0.8496, + "step": 1422 + }, + { + "epoch": 0.07832021575210524, + "grad_norm": 0.9771224856376648, + "learning_rate": 9.967148526535813e-06, + "loss": 0.9657, + "step": 1423 + }, + { + "epoch": 0.0783752545544609, + "grad_norm": 0.8437683582305908, + "learning_rate": 9.967098900232616e-06, + "loss": 0.8336, + "step": 1424 + }, + { + "epoch": 0.07843029335681656, + "grad_norm": 0.8232185244560242, + "learning_rate": 9.967049236598034e-06, + "loss": 0.8878, + "step": 1425 + }, + { + "epoch": 0.07848533215917222, + "grad_norm": 1.0200369358062744, + "learning_rate": 9.96699953563244e-06, + "loss": 0.8135, + "step": 1426 + }, + { + "epoch": 0.07854037096152788, + "grad_norm": 0.8779187202453613, + "learning_rate": 9.966949797336208e-06, + "loss": 0.9124, + "step": 1427 + }, + { + "epoch": 0.07859540976388354, + "grad_norm": 0.9557466506958008, + "learning_rate": 9.966900021709708e-06, + "loss": 0.9118, + "step": 1428 + }, + { + "epoch": 0.07865044856623919, + "grad_norm": 0.8431050777435303, + "learning_rate": 9.966850208753317e-06, + "loss": 0.8361, + "step": 1429 + }, + { + "epoch": 0.07870548736859485, + "grad_norm": 0.9269648194313049, + "learning_rate": 9.966800358467412e-06, + "loss": 0.9194, + "step": 1430 + }, + { + "epoch": 0.07876052617095052, + "grad_norm": 0.818681538105011, + "learning_rate": 9.966750470852363e-06, + "loss": 0.7483, + "step": 1431 + }, + { + "epoch": 0.07881556497330618, + "grad_norm": 0.8788284659385681, + "learning_rate": 9.966700545908547e-06, + "loss": 0.858, + "step": 1432 + }, + { + "epoch": 0.07887060377566184, + "grad_norm": 0.7734160423278809, + "learning_rate": 9.966650583636342e-06, + "loss": 0.694, + "step": 1433 + }, + { + "epoch": 0.0789256425780175, + "grad_norm": 0.8846608996391296, + "learning_rate": 9.966600584036117e-06, + "loss": 0.8144, + "step": 1434 + }, + { + "epoch": 0.07898068138037316, + "grad_norm": 0.9740058183670044, + "learning_rate": 9.966550547108254e-06, + "loss": 0.9314, + "step": 1435 + }, + { + "epoch": 0.07903572018272882, + "grad_norm": 0.8731759786605835, + "learning_rate": 9.966500472853124e-06, + "loss": 0.8475, + "step": 1436 + }, + { + "epoch": 0.07909075898508448, + "grad_norm": 0.8984843492507935, + "learning_rate": 9.966450361271109e-06, + "loss": 0.7803, + "step": 1437 + }, + { + "epoch": 0.07914579778744014, + "grad_norm": 0.8897966742515564, + "learning_rate": 9.96640021236258e-06, + "loss": 0.8879, + "step": 1438 + }, + { + "epoch": 0.0792008365897958, + "grad_norm": 0.80704265832901, + "learning_rate": 9.966350026127917e-06, + "loss": 0.7585, + "step": 1439 + }, + { + "epoch": 0.07925587539215147, + "grad_norm": 1.0807467699050903, + "learning_rate": 9.966299802567499e-06, + "loss": 1.078, + "step": 1440 + }, + { + "epoch": 0.07931091419450713, + "grad_norm": 0.7994028925895691, + "learning_rate": 9.966249541681697e-06, + "loss": 0.8074, + "step": 1441 + }, + { + "epoch": 0.07936595299686279, + "grad_norm": 0.877592921257019, + "learning_rate": 9.966199243470895e-06, + "loss": 0.8084, + "step": 1442 + }, + { + "epoch": 0.07942099179921845, + "grad_norm": 0.7704572081565857, + "learning_rate": 9.966148907935469e-06, + "loss": 0.7206, + "step": 1443 + }, + { + "epoch": 0.07947603060157411, + "grad_norm": 0.8222140669822693, + "learning_rate": 9.966098535075797e-06, + "loss": 0.7768, + "step": 1444 + }, + { + "epoch": 0.07953106940392977, + "grad_norm": 1.389320731163025, + "learning_rate": 9.966048124892257e-06, + "loss": 1.0356, + "step": 1445 + }, + { + "epoch": 0.07958610820628544, + "grad_norm": 0.9082457423210144, + "learning_rate": 9.965997677385229e-06, + "loss": 0.7379, + "step": 1446 + }, + { + "epoch": 0.0796411470086411, + "grad_norm": 0.8029153943061829, + "learning_rate": 9.965947192555093e-06, + "loss": 0.7826, + "step": 1447 + }, + { + "epoch": 0.07969618581099676, + "grad_norm": 0.8752758502960205, + "learning_rate": 9.965896670402227e-06, + "loss": 0.8526, + "step": 1448 + }, + { + "epoch": 0.07975122461335242, + "grad_norm": 1.0665404796600342, + "learning_rate": 9.965846110927009e-06, + "loss": 0.858, + "step": 1449 + }, + { + "epoch": 0.07980626341570807, + "grad_norm": 0.9468502402305603, + "learning_rate": 9.96579551412982e-06, + "loss": 0.9658, + "step": 1450 + }, + { + "epoch": 0.07986130221806373, + "grad_norm": 1.0239403247833252, + "learning_rate": 9.965744880011046e-06, + "loss": 0.7995, + "step": 1451 + }, + { + "epoch": 0.07991634102041939, + "grad_norm": 0.9808099865913391, + "learning_rate": 9.965694208571059e-06, + "loss": 1.0173, + "step": 1452 + }, + { + "epoch": 0.07997137982277505, + "grad_norm": 0.9338780641555786, + "learning_rate": 9.965643499810245e-06, + "loss": 0.7917, + "step": 1453 + }, + { + "epoch": 0.08002641862513071, + "grad_norm": 0.9294295310974121, + "learning_rate": 9.965592753728981e-06, + "loss": 0.88, + "step": 1454 + }, + { + "epoch": 0.08008145742748637, + "grad_norm": 1.0261508226394653, + "learning_rate": 9.965541970327654e-06, + "loss": 0.8825, + "step": 1455 + }, + { + "epoch": 0.08013649622984204, + "grad_norm": 0.8964946269989014, + "learning_rate": 9.965491149606642e-06, + "loss": 0.81, + "step": 1456 + }, + { + "epoch": 0.0801915350321977, + "grad_norm": 0.9468267560005188, + "learning_rate": 9.965440291566329e-06, + "loss": 0.9453, + "step": 1457 + }, + { + "epoch": 0.08024657383455336, + "grad_norm": 0.8289040327072144, + "learning_rate": 9.965389396207092e-06, + "loss": 0.7373, + "step": 1458 + }, + { + "epoch": 0.08030161263690902, + "grad_norm": 0.8782384991645813, + "learning_rate": 9.965338463529322e-06, + "loss": 0.9199, + "step": 1459 + }, + { + "epoch": 0.08035665143926468, + "grad_norm": 0.8613787293434143, + "learning_rate": 9.965287493533395e-06, + "loss": 0.8719, + "step": 1460 + }, + { + "epoch": 0.08041169024162034, + "grad_norm": 0.8474903106689453, + "learning_rate": 9.965236486219696e-06, + "loss": 0.8033, + "step": 1461 + }, + { + "epoch": 0.080466729043976, + "grad_norm": 1.1442681550979614, + "learning_rate": 9.965185441588609e-06, + "loss": 0.8996, + "step": 1462 + }, + { + "epoch": 0.08052176784633167, + "grad_norm": 1.564138412475586, + "learning_rate": 9.965134359640518e-06, + "loss": 0.7451, + "step": 1463 + }, + { + "epoch": 0.08057680664868733, + "grad_norm": 0.9211083054542542, + "learning_rate": 9.965083240375806e-06, + "loss": 0.8939, + "step": 1464 + }, + { + "epoch": 0.08063184545104299, + "grad_norm": 0.9503418207168579, + "learning_rate": 9.965032083794856e-06, + "loss": 0.8544, + "step": 1465 + }, + { + "epoch": 0.08068688425339865, + "grad_norm": 0.9304021596908569, + "learning_rate": 9.964980889898055e-06, + "loss": 0.9192, + "step": 1466 + }, + { + "epoch": 0.08074192305575431, + "grad_norm": 0.8430425524711609, + "learning_rate": 9.964929658685787e-06, + "loss": 0.8586, + "step": 1467 + }, + { + "epoch": 0.08079696185810997, + "grad_norm": 0.8671759366989136, + "learning_rate": 9.964878390158437e-06, + "loss": 0.8807, + "step": 1468 + }, + { + "epoch": 0.08085200066046563, + "grad_norm": 0.9548830986022949, + "learning_rate": 9.964827084316389e-06, + "loss": 0.9033, + "step": 1469 + }, + { + "epoch": 0.0809070394628213, + "grad_norm": 0.8736767768859863, + "learning_rate": 9.964775741160029e-06, + "loss": 0.8509, + "step": 1470 + }, + { + "epoch": 0.08096207826517696, + "grad_norm": 0.8827025890350342, + "learning_rate": 9.964724360689745e-06, + "loss": 0.897, + "step": 1471 + }, + { + "epoch": 0.0810171170675326, + "grad_norm": 1.02822744846344, + "learning_rate": 9.964672942905921e-06, + "loss": 1.0371, + "step": 1472 + }, + { + "epoch": 0.08107215586988827, + "grad_norm": 0.8619557619094849, + "learning_rate": 9.964621487808946e-06, + "loss": 0.7654, + "step": 1473 + }, + { + "epoch": 0.08112719467224393, + "grad_norm": 0.7855951189994812, + "learning_rate": 9.9645699953992e-06, + "loss": 0.7767, + "step": 1474 + }, + { + "epoch": 0.08118223347459959, + "grad_norm": 0.8139809370040894, + "learning_rate": 9.96451846567708e-06, + "loss": 0.7535, + "step": 1475 + }, + { + "epoch": 0.08123727227695525, + "grad_norm": 0.8491657376289368, + "learning_rate": 9.964466898642966e-06, + "loss": 0.854, + "step": 1476 + }, + { + "epoch": 0.08129231107931091, + "grad_norm": 0.8968605399131775, + "learning_rate": 9.964415294297247e-06, + "loss": 0.8914, + "step": 1477 + }, + { + "epoch": 0.08134734988166657, + "grad_norm": 0.8692505359649658, + "learning_rate": 9.964363652640313e-06, + "loss": 0.9245, + "step": 1478 + }, + { + "epoch": 0.08140238868402223, + "grad_norm": 0.8916530013084412, + "learning_rate": 9.964311973672549e-06, + "loss": 0.7662, + "step": 1479 + }, + { + "epoch": 0.0814574274863779, + "grad_norm": 0.8239215612411499, + "learning_rate": 9.964260257394347e-06, + "loss": 0.9191, + "step": 1480 + }, + { + "epoch": 0.08151246628873356, + "grad_norm": 0.8672100901603699, + "learning_rate": 9.964208503806092e-06, + "loss": 0.7656, + "step": 1481 + }, + { + "epoch": 0.08156750509108922, + "grad_norm": 0.9195712208747864, + "learning_rate": 9.964156712908177e-06, + "loss": 0.8656, + "step": 1482 + }, + { + "epoch": 0.08162254389344488, + "grad_norm": 0.8282535672187805, + "learning_rate": 9.964104884700986e-06, + "loss": 0.8264, + "step": 1483 + }, + { + "epoch": 0.08167758269580054, + "grad_norm": 0.8492032289505005, + "learning_rate": 9.964053019184913e-06, + "loss": 0.7816, + "step": 1484 + }, + { + "epoch": 0.0817326214981562, + "grad_norm": 0.8491117358207703, + "learning_rate": 9.964001116360347e-06, + "loss": 0.7885, + "step": 1485 + }, + { + "epoch": 0.08178766030051186, + "grad_norm": 0.9415153861045837, + "learning_rate": 9.963949176227677e-06, + "loss": 0.8165, + "step": 1486 + }, + { + "epoch": 0.08184269910286752, + "grad_norm": 0.8462526202201843, + "learning_rate": 9.963897198787294e-06, + "loss": 0.8498, + "step": 1487 + }, + { + "epoch": 0.08189773790522319, + "grad_norm": 0.8591959476470947, + "learning_rate": 9.963845184039586e-06, + "loss": 0.8906, + "step": 1488 + }, + { + "epoch": 0.08195277670757885, + "grad_norm": 0.840761661529541, + "learning_rate": 9.963793131984949e-06, + "loss": 0.7831, + "step": 1489 + }, + { + "epoch": 0.08200781550993451, + "grad_norm": 0.931404173374176, + "learning_rate": 9.96374104262377e-06, + "loss": 0.889, + "step": 1490 + }, + { + "epoch": 0.08206285431229017, + "grad_norm": 0.9048783779144287, + "learning_rate": 9.963688915956443e-06, + "loss": 0.8321, + "step": 1491 + }, + { + "epoch": 0.08211789311464583, + "grad_norm": 0.9145931601524353, + "learning_rate": 9.96363675198336e-06, + "loss": 0.9918, + "step": 1492 + }, + { + "epoch": 0.08217293191700148, + "grad_norm": 0.9256643652915955, + "learning_rate": 9.963584550704908e-06, + "loss": 0.8731, + "step": 1493 + }, + { + "epoch": 0.08222797071935714, + "grad_norm": 1.0212007761001587, + "learning_rate": 9.963532312121486e-06, + "loss": 0.9077, + "step": 1494 + }, + { + "epoch": 0.0822830095217128, + "grad_norm": 0.9206242561340332, + "learning_rate": 9.963480036233483e-06, + "loss": 0.9076, + "step": 1495 + }, + { + "epoch": 0.08233804832406846, + "grad_norm": 0.8846865296363831, + "learning_rate": 9.963427723041294e-06, + "loss": 0.6826, + "step": 1496 + }, + { + "epoch": 0.08239308712642412, + "grad_norm": 0.8745351433753967, + "learning_rate": 9.963375372545309e-06, + "loss": 0.7935, + "step": 1497 + }, + { + "epoch": 0.08244812592877979, + "grad_norm": 0.9019666314125061, + "learning_rate": 9.963322984745924e-06, + "loss": 0.8435, + "step": 1498 + }, + { + "epoch": 0.08250316473113545, + "grad_norm": 0.8586859703063965, + "learning_rate": 9.963270559643531e-06, + "loss": 0.8118, + "step": 1499 + }, + { + "epoch": 0.08255820353349111, + "grad_norm": 0.9192817807197571, + "learning_rate": 9.963218097238528e-06, + "loss": 0.824, + "step": 1500 + }, + { + "epoch": 0.08261324233584677, + "grad_norm": 0.8972243070602417, + "learning_rate": 9.963165597531304e-06, + "loss": 0.8404, + "step": 1501 + }, + { + "epoch": 0.08266828113820243, + "grad_norm": 0.8953961133956909, + "learning_rate": 9.963113060522256e-06, + "loss": 0.9031, + "step": 1502 + }, + { + "epoch": 0.0827233199405581, + "grad_norm": 0.9551270604133606, + "learning_rate": 9.963060486211779e-06, + "loss": 0.9177, + "step": 1503 + }, + { + "epoch": 0.08277835874291375, + "grad_norm": 0.8524616956710815, + "learning_rate": 9.963007874600268e-06, + "loss": 0.8582, + "step": 1504 + }, + { + "epoch": 0.08283339754526942, + "grad_norm": 0.8148764371871948, + "learning_rate": 9.962955225688118e-06, + "loss": 0.6859, + "step": 1505 + }, + { + "epoch": 0.08288843634762508, + "grad_norm": 0.9110590219497681, + "learning_rate": 9.962902539475728e-06, + "loss": 0.7189, + "step": 1506 + }, + { + "epoch": 0.08294347514998074, + "grad_norm": 0.8700116872787476, + "learning_rate": 9.962849815963487e-06, + "loss": 0.9462, + "step": 1507 + }, + { + "epoch": 0.0829985139523364, + "grad_norm": 0.877109706401825, + "learning_rate": 9.962797055151797e-06, + "loss": 0.8138, + "step": 1508 + }, + { + "epoch": 0.08305355275469206, + "grad_norm": 0.7818365097045898, + "learning_rate": 9.962744257041053e-06, + "loss": 0.8474, + "step": 1509 + }, + { + "epoch": 0.08310859155704772, + "grad_norm": 0.88360196352005, + "learning_rate": 9.96269142163165e-06, + "loss": 0.8724, + "step": 1510 + }, + { + "epoch": 0.08316363035940338, + "grad_norm": 0.8982682228088379, + "learning_rate": 9.962638548923988e-06, + "loss": 0.9687, + "step": 1511 + }, + { + "epoch": 0.08321866916175905, + "grad_norm": 0.7362002730369568, + "learning_rate": 9.962585638918462e-06, + "loss": 0.7666, + "step": 1512 + }, + { + "epoch": 0.08327370796411471, + "grad_norm": 1.0993375778198242, + "learning_rate": 9.962532691615472e-06, + "loss": 0.8869, + "step": 1513 + }, + { + "epoch": 0.08332874676647037, + "grad_norm": 0.8684842586517334, + "learning_rate": 9.962479707015415e-06, + "loss": 0.872, + "step": 1514 + }, + { + "epoch": 0.08338378556882602, + "grad_norm": 1.0598478317260742, + "learning_rate": 9.962426685118689e-06, + "loss": 0.9102, + "step": 1515 + }, + { + "epoch": 0.08343882437118168, + "grad_norm": 0.8492125272750854, + "learning_rate": 9.96237362592569e-06, + "loss": 0.7554, + "step": 1516 + }, + { + "epoch": 0.08349386317353734, + "grad_norm": 0.8489052653312683, + "learning_rate": 9.962320529436821e-06, + "loss": 0.9139, + "step": 1517 + }, + { + "epoch": 0.083548901975893, + "grad_norm": 0.8650774359703064, + "learning_rate": 9.962267395652479e-06, + "loss": 0.8717, + "step": 1518 + }, + { + "epoch": 0.08360394077824866, + "grad_norm": 0.8393206596374512, + "learning_rate": 9.962214224573064e-06, + "loss": 0.8256, + "step": 1519 + }, + { + "epoch": 0.08365897958060432, + "grad_norm": 0.8304896354675293, + "learning_rate": 9.962161016198974e-06, + "loss": 0.8232, + "step": 1520 + }, + { + "epoch": 0.08371401838295998, + "grad_norm": 0.8718386292457581, + "learning_rate": 9.962107770530612e-06, + "loss": 0.8206, + "step": 1521 + }, + { + "epoch": 0.08376905718531565, + "grad_norm": 0.9109341502189636, + "learning_rate": 9.962054487568373e-06, + "loss": 0.9576, + "step": 1522 + }, + { + "epoch": 0.08382409598767131, + "grad_norm": 0.9543303847312927, + "learning_rate": 9.962001167312663e-06, + "loss": 0.8816, + "step": 1523 + }, + { + "epoch": 0.08387913479002697, + "grad_norm": 0.9992844462394714, + "learning_rate": 9.961947809763881e-06, + "loss": 0.8682, + "step": 1524 + }, + { + "epoch": 0.08393417359238263, + "grad_norm": 0.8092770576477051, + "learning_rate": 9.961894414922425e-06, + "loss": 0.6352, + "step": 1525 + }, + { + "epoch": 0.08398921239473829, + "grad_norm": 0.9888653755187988, + "learning_rate": 9.961840982788703e-06, + "loss": 0.8721, + "step": 1526 + }, + { + "epoch": 0.08404425119709395, + "grad_norm": 1.0092703104019165, + "learning_rate": 9.961787513363108e-06, + "loss": 0.7776, + "step": 1527 + }, + { + "epoch": 0.08409928999944961, + "grad_norm": 0.8654646277427673, + "learning_rate": 9.961734006646049e-06, + "loss": 0.8835, + "step": 1528 + }, + { + "epoch": 0.08415432880180528, + "grad_norm": 0.7630153298377991, + "learning_rate": 9.961680462637924e-06, + "loss": 0.7501, + "step": 1529 + }, + { + "epoch": 0.08420936760416094, + "grad_norm": 1.1883158683776855, + "learning_rate": 9.961626881339138e-06, + "loss": 0.9476, + "step": 1530 + }, + { + "epoch": 0.0842644064065166, + "grad_norm": 0.8710927963256836, + "learning_rate": 9.96157326275009e-06, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 0.08431944520887226, + "grad_norm": 0.9500633478164673, + "learning_rate": 9.961519606871188e-06, + "loss": 0.8994, + "step": 1532 + }, + { + "epoch": 0.08437448401122792, + "grad_norm": 0.873257577419281, + "learning_rate": 9.961465913702833e-06, + "loss": 0.816, + "step": 1533 + }, + { + "epoch": 0.08442952281358358, + "grad_norm": 0.8007022142410278, + "learning_rate": 9.961412183245426e-06, + "loss": 0.787, + "step": 1534 + }, + { + "epoch": 0.08448456161593924, + "grad_norm": 0.8998435139656067, + "learning_rate": 9.961358415499374e-06, + "loss": 0.8741, + "step": 1535 + }, + { + "epoch": 0.08453960041829489, + "grad_norm": 0.9152502417564392, + "learning_rate": 9.961304610465081e-06, + "loss": 0.9749, + "step": 1536 + }, + { + "epoch": 0.08459463922065055, + "grad_norm": 0.8961958289146423, + "learning_rate": 9.961250768142949e-06, + "loss": 0.8683, + "step": 1537 + }, + { + "epoch": 0.08464967802300621, + "grad_norm": 0.8683995008468628, + "learning_rate": 9.961196888533387e-06, + "loss": 0.8347, + "step": 1538 + }, + { + "epoch": 0.08470471682536188, + "grad_norm": 0.835221529006958, + "learning_rate": 9.961142971636795e-06, + "loss": 0.8936, + "step": 1539 + }, + { + "epoch": 0.08475975562771754, + "grad_norm": 0.8666725158691406, + "learning_rate": 9.96108901745358e-06, + "loss": 0.7344, + "step": 1540 + }, + { + "epoch": 0.0848147944300732, + "grad_norm": 0.9509082436561584, + "learning_rate": 9.96103502598415e-06, + "loss": 0.8965, + "step": 1541 + }, + { + "epoch": 0.08486983323242886, + "grad_norm": 0.8134233951568604, + "learning_rate": 9.960980997228908e-06, + "loss": 0.797, + "step": 1542 + }, + { + "epoch": 0.08492487203478452, + "grad_norm": 1.0432242155075073, + "learning_rate": 9.96092693118826e-06, + "loss": 0.8754, + "step": 1543 + }, + { + "epoch": 0.08497991083714018, + "grad_norm": 0.9560218453407288, + "learning_rate": 9.960872827862613e-06, + "loss": 0.9238, + "step": 1544 + }, + { + "epoch": 0.08503494963949584, + "grad_norm": 0.8471649885177612, + "learning_rate": 9.960818687252374e-06, + "loss": 0.8622, + "step": 1545 + }, + { + "epoch": 0.0850899884418515, + "grad_norm": 1.2584747076034546, + "learning_rate": 9.960764509357951e-06, + "loss": 0.8007, + "step": 1546 + }, + { + "epoch": 0.08514502724420717, + "grad_norm": 0.8730618953704834, + "learning_rate": 9.960710294179748e-06, + "loss": 0.7412, + "step": 1547 + }, + { + "epoch": 0.08520006604656283, + "grad_norm": 0.8361592292785645, + "learning_rate": 9.960656041718176e-06, + "loss": 0.7018, + "step": 1548 + }, + { + "epoch": 0.08525510484891849, + "grad_norm": 0.8351722359657288, + "learning_rate": 9.96060175197364e-06, + "loss": 0.843, + "step": 1549 + }, + { + "epoch": 0.08531014365127415, + "grad_norm": 0.8665090203285217, + "learning_rate": 9.960547424946549e-06, + "loss": 0.8235, + "step": 1550 + }, + { + "epoch": 0.08536518245362981, + "grad_norm": 0.9254478812217712, + "learning_rate": 9.960493060637313e-06, + "loss": 0.8122, + "step": 1551 + }, + { + "epoch": 0.08542022125598547, + "grad_norm": 0.8712261319160461, + "learning_rate": 9.960438659046337e-06, + "loss": 0.823, + "step": 1552 + }, + { + "epoch": 0.08547526005834113, + "grad_norm": 0.9027207493782043, + "learning_rate": 9.960384220174033e-06, + "loss": 0.7964, + "step": 1553 + }, + { + "epoch": 0.0855302988606968, + "grad_norm": 0.854626476764679, + "learning_rate": 9.960329744020808e-06, + "loss": 0.755, + "step": 1554 + }, + { + "epoch": 0.08558533766305246, + "grad_norm": 0.9398048520088196, + "learning_rate": 9.960275230587073e-06, + "loss": 0.8607, + "step": 1555 + }, + { + "epoch": 0.08564037646540812, + "grad_norm": 1.008002758026123, + "learning_rate": 9.960220679873238e-06, + "loss": 0.9711, + "step": 1556 + }, + { + "epoch": 0.08569541526776378, + "grad_norm": 0.8999453783035278, + "learning_rate": 9.96016609187971e-06, + "loss": 0.8233, + "step": 1557 + }, + { + "epoch": 0.08575045407011943, + "grad_norm": 0.8912106156349182, + "learning_rate": 9.960111466606903e-06, + "loss": 0.8271, + "step": 1558 + }, + { + "epoch": 0.08580549287247509, + "grad_norm": 0.9269998073577881, + "learning_rate": 9.960056804055227e-06, + "loss": 0.7959, + "step": 1559 + }, + { + "epoch": 0.08586053167483075, + "grad_norm": 1.083815336227417, + "learning_rate": 9.96000210422509e-06, + "loss": 0.9436, + "step": 1560 + }, + { + "epoch": 0.08591557047718641, + "grad_norm": 0.8906280398368835, + "learning_rate": 9.959947367116905e-06, + "loss": 0.9317, + "step": 1561 + }, + { + "epoch": 0.08597060927954207, + "grad_norm": 1.211696743965149, + "learning_rate": 9.959892592731084e-06, + "loss": 0.9076, + "step": 1562 + }, + { + "epoch": 0.08602564808189773, + "grad_norm": 0.9050534963607788, + "learning_rate": 9.959837781068038e-06, + "loss": 0.8728, + "step": 1563 + }, + { + "epoch": 0.0860806868842534, + "grad_norm": 0.9384796619415283, + "learning_rate": 9.959782932128178e-06, + "loss": 0.9277, + "step": 1564 + }, + { + "epoch": 0.08613572568660906, + "grad_norm": 0.795844316482544, + "learning_rate": 9.959728045911915e-06, + "loss": 0.7666, + "step": 1565 + }, + { + "epoch": 0.08619076448896472, + "grad_norm": 0.925956666469574, + "learning_rate": 9.959673122419668e-06, + "loss": 0.815, + "step": 1566 + }, + { + "epoch": 0.08624580329132038, + "grad_norm": 0.898047924041748, + "learning_rate": 9.959618161651843e-06, + "loss": 0.8131, + "step": 1567 + }, + { + "epoch": 0.08630084209367604, + "grad_norm": 0.8656220436096191, + "learning_rate": 9.959563163608856e-06, + "loss": 0.9336, + "step": 1568 + }, + { + "epoch": 0.0863558808960317, + "grad_norm": 0.9184645414352417, + "learning_rate": 9.95950812829112e-06, + "loss": 0.9557, + "step": 1569 + }, + { + "epoch": 0.08641091969838736, + "grad_norm": 0.8607667684555054, + "learning_rate": 9.959453055699048e-06, + "loss": 0.8272, + "step": 1570 + }, + { + "epoch": 0.08646595850074303, + "grad_norm": 0.9561272263526917, + "learning_rate": 9.959397945833056e-06, + "loss": 0.8876, + "step": 1571 + }, + { + "epoch": 0.08652099730309869, + "grad_norm": 0.8562412261962891, + "learning_rate": 9.959342798693556e-06, + "loss": 0.8404, + "step": 1572 + }, + { + "epoch": 0.08657603610545435, + "grad_norm": 0.8924610614776611, + "learning_rate": 9.95928761428096e-06, + "loss": 0.8779, + "step": 1573 + }, + { + "epoch": 0.08663107490781001, + "grad_norm": 0.8343208432197571, + "learning_rate": 9.95923239259569e-06, + "loss": 0.8992, + "step": 1574 + }, + { + "epoch": 0.08668611371016567, + "grad_norm": 0.8835015296936035, + "learning_rate": 9.959177133638155e-06, + "loss": 1.0026, + "step": 1575 + }, + { + "epoch": 0.08674115251252133, + "grad_norm": 0.9540221095085144, + "learning_rate": 9.959121837408771e-06, + "loss": 0.8507, + "step": 1576 + }, + { + "epoch": 0.086796191314877, + "grad_norm": 1.087817668914795, + "learning_rate": 9.959066503907957e-06, + "loss": 0.8607, + "step": 1577 + }, + { + "epoch": 0.08685123011723266, + "grad_norm": 0.8072447180747986, + "learning_rate": 9.959011133136124e-06, + "loss": 0.882, + "step": 1578 + }, + { + "epoch": 0.0869062689195883, + "grad_norm": 0.7646876573562622, + "learning_rate": 9.958955725093694e-06, + "loss": 0.7653, + "step": 1579 + }, + { + "epoch": 0.08696130772194396, + "grad_norm": 0.8979537487030029, + "learning_rate": 9.958900279781078e-06, + "loss": 0.9033, + "step": 1580 + }, + { + "epoch": 0.08701634652429963, + "grad_norm": 0.9445611834526062, + "learning_rate": 9.958844797198696e-06, + "loss": 0.9423, + "step": 1581 + }, + { + "epoch": 0.08707138532665529, + "grad_norm": 0.8836671113967896, + "learning_rate": 9.958789277346963e-06, + "loss": 0.839, + "step": 1582 + }, + { + "epoch": 0.08712642412901095, + "grad_norm": 1.0333542823791504, + "learning_rate": 9.958733720226296e-06, + "loss": 0.9211, + "step": 1583 + }, + { + "epoch": 0.08718146293136661, + "grad_norm": 0.8084085583686829, + "learning_rate": 9.958678125837117e-06, + "loss": 0.8387, + "step": 1584 + }, + { + "epoch": 0.08723650173372227, + "grad_norm": 0.7769419550895691, + "learning_rate": 9.958622494179838e-06, + "loss": 0.8307, + "step": 1585 + }, + { + "epoch": 0.08729154053607793, + "grad_norm": 0.8387578129768372, + "learning_rate": 9.95856682525488e-06, + "loss": 0.8001, + "step": 1586 + }, + { + "epoch": 0.0873465793384336, + "grad_norm": 0.8989812731742859, + "learning_rate": 9.95851111906266e-06, + "loss": 0.7752, + "step": 1587 + }, + { + "epoch": 0.08740161814078926, + "grad_norm": 0.8558734655380249, + "learning_rate": 9.958455375603602e-06, + "loss": 0.8149, + "step": 1588 + }, + { + "epoch": 0.08745665694314492, + "grad_norm": 0.8890896439552307, + "learning_rate": 9.958399594878117e-06, + "loss": 0.8232, + "step": 1589 + }, + { + "epoch": 0.08751169574550058, + "grad_norm": 0.875912070274353, + "learning_rate": 9.95834377688663e-06, + "loss": 0.7458, + "step": 1590 + }, + { + "epoch": 0.08756673454785624, + "grad_norm": 0.808355987071991, + "learning_rate": 9.958287921629557e-06, + "loss": 0.8296, + "step": 1591 + }, + { + "epoch": 0.0876217733502119, + "grad_norm": 0.9637090563774109, + "learning_rate": 9.958232029107318e-06, + "loss": 0.8769, + "step": 1592 + }, + { + "epoch": 0.08767681215256756, + "grad_norm": 0.8980715870857239, + "learning_rate": 9.958176099320336e-06, + "loss": 0.7995, + "step": 1593 + }, + { + "epoch": 0.08773185095492322, + "grad_norm": 0.9369860291481018, + "learning_rate": 9.95812013226903e-06, + "loss": 0.8545, + "step": 1594 + }, + { + "epoch": 0.08778688975727889, + "grad_norm": 0.8589349389076233, + "learning_rate": 9.958064127953819e-06, + "loss": 0.8693, + "step": 1595 + }, + { + "epoch": 0.08784192855963455, + "grad_norm": 0.929207444190979, + "learning_rate": 9.958008086375126e-06, + "loss": 0.811, + "step": 1596 + }, + { + "epoch": 0.08789696736199021, + "grad_norm": 1.0825661420822144, + "learning_rate": 9.957952007533371e-06, + "loss": 1.0145, + "step": 1597 + }, + { + "epoch": 0.08795200616434587, + "grad_norm": 0.8818382024765015, + "learning_rate": 9.957895891428978e-06, + "loss": 0.7771, + "step": 1598 + }, + { + "epoch": 0.08800704496670153, + "grad_norm": 0.882780909538269, + "learning_rate": 9.957839738062363e-06, + "loss": 0.8857, + "step": 1599 + }, + { + "epoch": 0.08806208376905718, + "grad_norm": 0.9136924743652344, + "learning_rate": 9.957783547433955e-06, + "loss": 0.8873, + "step": 1600 + }, + { + "epoch": 0.08811712257141284, + "grad_norm": 0.8896858096122742, + "learning_rate": 9.95772731954417e-06, + "loss": 0.8463, + "step": 1601 + }, + { + "epoch": 0.0881721613737685, + "grad_norm": 0.8671631813049316, + "learning_rate": 9.957671054393436e-06, + "loss": 0.8333, + "step": 1602 + }, + { + "epoch": 0.08822720017612416, + "grad_norm": 0.9442896246910095, + "learning_rate": 9.957614751982172e-06, + "loss": 0.9676, + "step": 1603 + }, + { + "epoch": 0.08828223897847982, + "grad_norm": 0.8249240517616272, + "learning_rate": 9.957558412310803e-06, + "loss": 0.7746, + "step": 1604 + }, + { + "epoch": 0.08833727778083549, + "grad_norm": 0.8125253319740295, + "learning_rate": 9.957502035379751e-06, + "loss": 0.7816, + "step": 1605 + }, + { + "epoch": 0.08839231658319115, + "grad_norm": 0.8467233777046204, + "learning_rate": 9.957445621189442e-06, + "loss": 0.7697, + "step": 1606 + }, + { + "epoch": 0.08844735538554681, + "grad_norm": 0.8322175145149231, + "learning_rate": 9.957389169740299e-06, + "loss": 0.7561, + "step": 1607 + }, + { + "epoch": 0.08850239418790247, + "grad_norm": 0.869163453578949, + "learning_rate": 9.957332681032746e-06, + "loss": 0.8984, + "step": 1608 + }, + { + "epoch": 0.08855743299025813, + "grad_norm": 0.8755944967269897, + "learning_rate": 9.957276155067206e-06, + "loss": 0.8016, + "step": 1609 + }, + { + "epoch": 0.08861247179261379, + "grad_norm": 0.8152669668197632, + "learning_rate": 9.957219591844108e-06, + "loss": 0.7763, + "step": 1610 + }, + { + "epoch": 0.08866751059496945, + "grad_norm": 0.979752779006958, + "learning_rate": 9.957162991363871e-06, + "loss": 0.7755, + "step": 1611 + }, + { + "epoch": 0.08872254939732512, + "grad_norm": 1.0481054782867432, + "learning_rate": 9.957106353626926e-06, + "loss": 0.9395, + "step": 1612 + }, + { + "epoch": 0.08877758819968078, + "grad_norm": 0.7773686647415161, + "learning_rate": 9.957049678633697e-06, + "loss": 0.7713, + "step": 1613 + }, + { + "epoch": 0.08883262700203644, + "grad_norm": 0.838979959487915, + "learning_rate": 9.956992966384609e-06, + "loss": 0.7909, + "step": 1614 + }, + { + "epoch": 0.0888876658043921, + "grad_norm": 0.9527049660682678, + "learning_rate": 9.956936216880089e-06, + "loss": 0.7944, + "step": 1615 + }, + { + "epoch": 0.08894270460674776, + "grad_norm": 0.7967305183410645, + "learning_rate": 9.956879430120561e-06, + "loss": 0.7703, + "step": 1616 + }, + { + "epoch": 0.08899774340910342, + "grad_norm": 0.9065802097320557, + "learning_rate": 9.956822606106456e-06, + "loss": 0.8188, + "step": 1617 + }, + { + "epoch": 0.08905278221145908, + "grad_norm": 0.7329322099685669, + "learning_rate": 9.956765744838199e-06, + "loss": 0.8043, + "step": 1618 + }, + { + "epoch": 0.08910782101381474, + "grad_norm": 0.864973247051239, + "learning_rate": 9.95670884631622e-06, + "loss": 0.8334, + "step": 1619 + }, + { + "epoch": 0.0891628598161704, + "grad_norm": 1.073559045791626, + "learning_rate": 9.95665191054094e-06, + "loss": 0.7755, + "step": 1620 + }, + { + "epoch": 0.08921789861852607, + "grad_norm": 0.7347918748855591, + "learning_rate": 9.956594937512794e-06, + "loss": 0.7556, + "step": 1621 + }, + { + "epoch": 0.08927293742088172, + "grad_norm": 0.8756610751152039, + "learning_rate": 9.956537927232205e-06, + "loss": 0.8129, + "step": 1622 + }, + { + "epoch": 0.08932797622323738, + "grad_norm": 0.9132435917854309, + "learning_rate": 9.956480879699605e-06, + "loss": 0.8221, + "step": 1623 + }, + { + "epoch": 0.08938301502559304, + "grad_norm": 1.1978256702423096, + "learning_rate": 9.956423794915421e-06, + "loss": 0.8651, + "step": 1624 + }, + { + "epoch": 0.0894380538279487, + "grad_norm": 0.8493894934654236, + "learning_rate": 9.956366672880082e-06, + "loss": 0.7267, + "step": 1625 + }, + { + "epoch": 0.08949309263030436, + "grad_norm": 1.0971951484680176, + "learning_rate": 9.956309513594019e-06, + "loss": 0.7852, + "step": 1626 + }, + { + "epoch": 0.08954813143266002, + "grad_norm": 0.899974524974823, + "learning_rate": 9.95625231705766e-06, + "loss": 0.8868, + "step": 1627 + }, + { + "epoch": 0.08960317023501568, + "grad_norm": 0.8995566368103027, + "learning_rate": 9.956195083271436e-06, + "loss": 0.87, + "step": 1628 + }, + { + "epoch": 0.08965820903737134, + "grad_norm": 0.8924218416213989, + "learning_rate": 9.956137812235776e-06, + "loss": 0.7885, + "step": 1629 + }, + { + "epoch": 0.089713247839727, + "grad_norm": 0.9232820868492126, + "learning_rate": 9.956080503951108e-06, + "loss": 0.7923, + "step": 1630 + }, + { + "epoch": 0.08976828664208267, + "grad_norm": 0.9298982620239258, + "learning_rate": 9.956023158417869e-06, + "loss": 0.8625, + "step": 1631 + }, + { + "epoch": 0.08982332544443833, + "grad_norm": 0.86515212059021, + "learning_rate": 9.955965775636488e-06, + "loss": 0.7683, + "step": 1632 + }, + { + "epoch": 0.08987836424679399, + "grad_norm": 0.8016952276229858, + "learning_rate": 9.955908355607392e-06, + "loss": 0.8122, + "step": 1633 + }, + { + "epoch": 0.08993340304914965, + "grad_norm": 0.842703640460968, + "learning_rate": 9.955850898331015e-06, + "loss": 0.8487, + "step": 1634 + }, + { + "epoch": 0.08998844185150531, + "grad_norm": 0.8239083886146545, + "learning_rate": 9.95579340380779e-06, + "loss": 0.8701, + "step": 1635 + }, + { + "epoch": 0.09004348065386097, + "grad_norm": 0.8575418591499329, + "learning_rate": 9.955735872038149e-06, + "loss": 0.8263, + "step": 1636 + }, + { + "epoch": 0.09009851945621664, + "grad_norm": 0.8884586095809937, + "learning_rate": 9.955678303022522e-06, + "loss": 0.8112, + "step": 1637 + }, + { + "epoch": 0.0901535582585723, + "grad_norm": 0.9024681448936462, + "learning_rate": 9.955620696761345e-06, + "loss": 0.9174, + "step": 1638 + }, + { + "epoch": 0.09020859706092796, + "grad_norm": 0.8151944875717163, + "learning_rate": 9.955563053255049e-06, + "loss": 0.806, + "step": 1639 + }, + { + "epoch": 0.09026363586328362, + "grad_norm": 0.8292184472084045, + "learning_rate": 9.955505372504069e-06, + "loss": 0.8007, + "step": 1640 + }, + { + "epoch": 0.09031867466563928, + "grad_norm": 0.9445936679840088, + "learning_rate": 9.955447654508835e-06, + "loss": 0.7089, + "step": 1641 + }, + { + "epoch": 0.09037371346799494, + "grad_norm": 0.781579315662384, + "learning_rate": 9.955389899269782e-06, + "loss": 0.8224, + "step": 1642 + }, + { + "epoch": 0.09042875227035059, + "grad_norm": 0.9028880596160889, + "learning_rate": 9.955332106787348e-06, + "loss": 0.7976, + "step": 1643 + }, + { + "epoch": 0.09048379107270625, + "grad_norm": 1.0336887836456299, + "learning_rate": 9.955274277061963e-06, + "loss": 0.9296, + "step": 1644 + }, + { + "epoch": 0.09053882987506191, + "grad_norm": 0.8894197940826416, + "learning_rate": 9.955216410094062e-06, + "loss": 0.815, + "step": 1645 + }, + { + "epoch": 0.09059386867741757, + "grad_norm": 0.8955528140068054, + "learning_rate": 9.955158505884083e-06, + "loss": 0.8707, + "step": 1646 + }, + { + "epoch": 0.09064890747977324, + "grad_norm": 0.8012683987617493, + "learning_rate": 9.955100564432458e-06, + "loss": 0.7467, + "step": 1647 + }, + { + "epoch": 0.0907039462821289, + "grad_norm": 0.917969286441803, + "learning_rate": 9.955042585739623e-06, + "loss": 0.8835, + "step": 1648 + }, + { + "epoch": 0.09075898508448456, + "grad_norm": 0.8066666722297668, + "learning_rate": 9.954984569806014e-06, + "loss": 0.8338, + "step": 1649 + }, + { + "epoch": 0.09081402388684022, + "grad_norm": 1.1324070692062378, + "learning_rate": 9.954926516632069e-06, + "loss": 0.8245, + "step": 1650 + }, + { + "epoch": 0.09086906268919588, + "grad_norm": 0.8196014761924744, + "learning_rate": 9.954868426218222e-06, + "loss": 0.7897, + "step": 1651 + }, + { + "epoch": 0.09092410149155154, + "grad_norm": 0.8713478446006775, + "learning_rate": 9.95481029856491e-06, + "loss": 0.891, + "step": 1652 + }, + { + "epoch": 0.0909791402939072, + "grad_norm": 0.8489059805870056, + "learning_rate": 9.954752133672569e-06, + "loss": 0.7748, + "step": 1653 + }, + { + "epoch": 0.09103417909626287, + "grad_norm": 0.8914602994918823, + "learning_rate": 9.954693931541638e-06, + "loss": 0.8657, + "step": 1654 + }, + { + "epoch": 0.09108921789861853, + "grad_norm": 0.9031614661216736, + "learning_rate": 9.954635692172555e-06, + "loss": 0.7409, + "step": 1655 + }, + { + "epoch": 0.09114425670097419, + "grad_norm": 0.8680000305175781, + "learning_rate": 9.954577415565756e-06, + "loss": 0.8535, + "step": 1656 + }, + { + "epoch": 0.09119929550332985, + "grad_norm": 0.830596923828125, + "learning_rate": 9.954519101721679e-06, + "loss": 0.8601, + "step": 1657 + }, + { + "epoch": 0.09125433430568551, + "grad_norm": 0.9041332602500916, + "learning_rate": 9.954460750640762e-06, + "loss": 0.9104, + "step": 1658 + }, + { + "epoch": 0.09130937310804117, + "grad_norm": 0.7786296606063843, + "learning_rate": 9.954402362323445e-06, + "loss": 0.7671, + "step": 1659 + }, + { + "epoch": 0.09136441191039683, + "grad_norm": 1.0363564491271973, + "learning_rate": 9.954343936770165e-06, + "loss": 0.9339, + "step": 1660 + }, + { + "epoch": 0.0914194507127525, + "grad_norm": 0.8049986958503723, + "learning_rate": 9.954285473981363e-06, + "loss": 0.8125, + "step": 1661 + }, + { + "epoch": 0.09147448951510816, + "grad_norm": 0.7842011451721191, + "learning_rate": 9.954226973957477e-06, + "loss": 0.7153, + "step": 1662 + }, + { + "epoch": 0.09152952831746382, + "grad_norm": 0.8929729461669922, + "learning_rate": 9.954168436698948e-06, + "loss": 0.9563, + "step": 1663 + }, + { + "epoch": 0.09158456711981948, + "grad_norm": 0.8850226402282715, + "learning_rate": 9.954109862206216e-06, + "loss": 0.8257, + "step": 1664 + }, + { + "epoch": 0.09163960592217513, + "grad_norm": 0.8673348426818848, + "learning_rate": 9.954051250479719e-06, + "loss": 0.9489, + "step": 1665 + }, + { + "epoch": 0.09169464472453079, + "grad_norm": 0.8726119995117188, + "learning_rate": 9.9539926015199e-06, + "loss": 0.8222, + "step": 1666 + }, + { + "epoch": 0.09174968352688645, + "grad_norm": 0.7609312534332275, + "learning_rate": 9.953933915327196e-06, + "loss": 0.7749, + "step": 1667 + }, + { + "epoch": 0.09180472232924211, + "grad_norm": 0.857404887676239, + "learning_rate": 9.953875191902055e-06, + "loss": 0.8496, + "step": 1668 + }, + { + "epoch": 0.09185976113159777, + "grad_norm": 0.7835526466369629, + "learning_rate": 9.953816431244909e-06, + "loss": 0.7258, + "step": 1669 + }, + { + "epoch": 0.09191479993395343, + "grad_norm": 0.944984495639801, + "learning_rate": 9.95375763335621e-06, + "loss": 0.902, + "step": 1670 + }, + { + "epoch": 0.0919698387363091, + "grad_norm": 0.9038936495780945, + "learning_rate": 9.953698798236391e-06, + "loss": 0.7559, + "step": 1671 + }, + { + "epoch": 0.09202487753866476, + "grad_norm": 0.8450848460197449, + "learning_rate": 9.953639925885898e-06, + "loss": 0.8338, + "step": 1672 + }, + { + "epoch": 0.09207991634102042, + "grad_norm": 0.827419102191925, + "learning_rate": 9.953581016305175e-06, + "loss": 0.8167, + "step": 1673 + }, + { + "epoch": 0.09213495514337608, + "grad_norm": 0.8517075777053833, + "learning_rate": 9.953522069494663e-06, + "loss": 0.8681, + "step": 1674 + }, + { + "epoch": 0.09218999394573174, + "grad_norm": 0.9504323601722717, + "learning_rate": 9.953463085454804e-06, + "loss": 0.8688, + "step": 1675 + }, + { + "epoch": 0.0922450327480874, + "grad_norm": 0.8905719518661499, + "learning_rate": 9.953404064186044e-06, + "loss": 0.8818, + "step": 1676 + }, + { + "epoch": 0.09230007155044306, + "grad_norm": 0.9223340153694153, + "learning_rate": 9.953345005688822e-06, + "loss": 0.8752, + "step": 1677 + }, + { + "epoch": 0.09235511035279872, + "grad_norm": 1.0500547885894775, + "learning_rate": 9.953285909963588e-06, + "loss": 0.7816, + "step": 1678 + }, + { + "epoch": 0.09241014915515439, + "grad_norm": 0.8407441973686218, + "learning_rate": 9.953226777010781e-06, + "loss": 0.745, + "step": 1679 + }, + { + "epoch": 0.09246518795751005, + "grad_norm": 0.7997288107872009, + "learning_rate": 9.953167606830847e-06, + "loss": 0.8171, + "step": 1680 + }, + { + "epoch": 0.09252022675986571, + "grad_norm": 0.9752318859100342, + "learning_rate": 9.953108399424234e-06, + "loss": 0.8719, + "step": 1681 + }, + { + "epoch": 0.09257526556222137, + "grad_norm": 0.8524298667907715, + "learning_rate": 9.953049154791382e-06, + "loss": 0.8257, + "step": 1682 + }, + { + "epoch": 0.09263030436457703, + "grad_norm": 0.9460529088973999, + "learning_rate": 9.952989872932739e-06, + "loss": 0.7278, + "step": 1683 + }, + { + "epoch": 0.0926853431669327, + "grad_norm": 0.8959575891494751, + "learning_rate": 9.95293055384875e-06, + "loss": 0.903, + "step": 1684 + }, + { + "epoch": 0.09274038196928835, + "grad_norm": 0.8764386177062988, + "learning_rate": 9.95287119753986e-06, + "loss": 0.7958, + "step": 1685 + }, + { + "epoch": 0.092795420771644, + "grad_norm": 0.9611337184906006, + "learning_rate": 9.952811804006517e-06, + "loss": 0.8726, + "step": 1686 + }, + { + "epoch": 0.09285045957399966, + "grad_norm": 0.8155574202537537, + "learning_rate": 9.952752373249165e-06, + "loss": 0.7882, + "step": 1687 + }, + { + "epoch": 0.09290549837635532, + "grad_norm": 0.8789697289466858, + "learning_rate": 9.952692905268253e-06, + "loss": 0.8642, + "step": 1688 + }, + { + "epoch": 0.09296053717871099, + "grad_norm": 0.7910027503967285, + "learning_rate": 9.952633400064227e-06, + "loss": 0.7852, + "step": 1689 + }, + { + "epoch": 0.09301557598106665, + "grad_norm": 0.815819501876831, + "learning_rate": 9.952573857637533e-06, + "loss": 0.8606, + "step": 1690 + }, + { + "epoch": 0.09307061478342231, + "grad_norm": 0.9840701818466187, + "learning_rate": 9.95251427798862e-06, + "loss": 0.9349, + "step": 1691 + }, + { + "epoch": 0.09312565358577797, + "grad_norm": 0.8715788722038269, + "learning_rate": 9.952454661117936e-06, + "loss": 0.813, + "step": 1692 + }, + { + "epoch": 0.09318069238813363, + "grad_norm": 0.8287779092788696, + "learning_rate": 9.952395007025926e-06, + "loss": 0.8346, + "step": 1693 + }, + { + "epoch": 0.0932357311904893, + "grad_norm": 0.9375059008598328, + "learning_rate": 9.952335315713044e-06, + "loss": 0.8868, + "step": 1694 + }, + { + "epoch": 0.09329076999284495, + "grad_norm": 0.9063667058944702, + "learning_rate": 9.952275587179734e-06, + "loss": 0.9562, + "step": 1695 + }, + { + "epoch": 0.09334580879520062, + "grad_norm": 0.816643476486206, + "learning_rate": 9.952215821426447e-06, + "loss": 0.7456, + "step": 1696 + }, + { + "epoch": 0.09340084759755628, + "grad_norm": 0.9004347324371338, + "learning_rate": 9.95215601845363e-06, + "loss": 0.8545, + "step": 1697 + }, + { + "epoch": 0.09345588639991194, + "grad_norm": 0.919195830821991, + "learning_rate": 9.952096178261736e-06, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.0935109252022676, + "grad_norm": 0.8313261866569519, + "learning_rate": 9.952036300851211e-06, + "loss": 0.9169, + "step": 1699 + }, + { + "epoch": 0.09356596400462326, + "grad_norm": 0.8674910664558411, + "learning_rate": 9.951976386222507e-06, + "loss": 0.7621, + "step": 1700 + }, + { + "epoch": 0.09362100280697892, + "grad_norm": 0.8931052684783936, + "learning_rate": 9.951916434376074e-06, + "loss": 0.8702, + "step": 1701 + }, + { + "epoch": 0.09367604160933458, + "grad_norm": 0.8748393058776855, + "learning_rate": 9.951856445312364e-06, + "loss": 0.7446, + "step": 1702 + }, + { + "epoch": 0.09373108041169025, + "grad_norm": 1.005459189414978, + "learning_rate": 9.951796419031825e-06, + "loss": 0.9843, + "step": 1703 + }, + { + "epoch": 0.09378611921404591, + "grad_norm": 1.0155184268951416, + "learning_rate": 9.95173635553491e-06, + "loss": 0.8868, + "step": 1704 + }, + { + "epoch": 0.09384115801640157, + "grad_norm": 2.1387271881103516, + "learning_rate": 9.951676254822072e-06, + "loss": 0.8691, + "step": 1705 + }, + { + "epoch": 0.09389619681875723, + "grad_norm": 0.9768403768539429, + "learning_rate": 9.951616116893757e-06, + "loss": 0.8409, + "step": 1706 + }, + { + "epoch": 0.09395123562111289, + "grad_norm": 0.7994607090950012, + "learning_rate": 9.951555941750424e-06, + "loss": 0.7836, + "step": 1707 + }, + { + "epoch": 0.09400627442346854, + "grad_norm": 0.8460201025009155, + "learning_rate": 9.95149572939252e-06, + "loss": 0.8216, + "step": 1708 + }, + { + "epoch": 0.0940613132258242, + "grad_norm": 0.8904135227203369, + "learning_rate": 9.951435479820499e-06, + "loss": 0.9053, + "step": 1709 + }, + { + "epoch": 0.09411635202817986, + "grad_norm": 0.9084494113922119, + "learning_rate": 9.951375193034815e-06, + "loss": 0.9308, + "step": 1710 + }, + { + "epoch": 0.09417139083053552, + "grad_norm": 1.0826482772827148, + "learning_rate": 9.951314869035921e-06, + "loss": 0.8468, + "step": 1711 + }, + { + "epoch": 0.09422642963289118, + "grad_norm": 0.8068915009498596, + "learning_rate": 9.95125450782427e-06, + "loss": 0.8253, + "step": 1712 + }, + { + "epoch": 0.09428146843524685, + "grad_norm": 0.8445400595664978, + "learning_rate": 9.951194109400316e-06, + "loss": 0.8386, + "step": 1713 + }, + { + "epoch": 0.09433650723760251, + "grad_norm": 0.8180645704269409, + "learning_rate": 9.951133673764513e-06, + "loss": 0.7907, + "step": 1714 + }, + { + "epoch": 0.09439154603995817, + "grad_norm": 0.8111036419868469, + "learning_rate": 9.951073200917311e-06, + "loss": 0.7918, + "step": 1715 + }, + { + "epoch": 0.09444658484231383, + "grad_norm": 0.862042248249054, + "learning_rate": 9.951012690859172e-06, + "loss": 0.783, + "step": 1716 + }, + { + "epoch": 0.09450162364466949, + "grad_norm": 0.8189615607261658, + "learning_rate": 9.950952143590544e-06, + "loss": 0.8192, + "step": 1717 + }, + { + "epoch": 0.09455666244702515, + "grad_norm": 0.9714062809944153, + "learning_rate": 9.950891559111887e-06, + "loss": 0.774, + "step": 1718 + }, + { + "epoch": 0.09461170124938081, + "grad_norm": 0.9691846370697021, + "learning_rate": 9.950830937423655e-06, + "loss": 0.8347, + "step": 1719 + }, + { + "epoch": 0.09466674005173648, + "grad_norm": 0.8488250970840454, + "learning_rate": 9.950770278526301e-06, + "loss": 0.8228, + "step": 1720 + }, + { + "epoch": 0.09472177885409214, + "grad_norm": 0.8638359904289246, + "learning_rate": 9.950709582420282e-06, + "loss": 0.8973, + "step": 1721 + }, + { + "epoch": 0.0947768176564478, + "grad_norm": 1.0148643255233765, + "learning_rate": 9.950648849106058e-06, + "loss": 0.9638, + "step": 1722 + }, + { + "epoch": 0.09483185645880346, + "grad_norm": 0.8870131969451904, + "learning_rate": 9.95058807858408e-06, + "loss": 0.8259, + "step": 1723 + }, + { + "epoch": 0.09488689526115912, + "grad_norm": 0.9134769439697266, + "learning_rate": 9.950527270854807e-06, + "loss": 0.865, + "step": 1724 + }, + { + "epoch": 0.09494193406351478, + "grad_norm": 0.7221654653549194, + "learning_rate": 9.950466425918697e-06, + "loss": 0.7593, + "step": 1725 + }, + { + "epoch": 0.09499697286587044, + "grad_norm": 0.9386674165725708, + "learning_rate": 9.950405543776207e-06, + "loss": 0.9508, + "step": 1726 + }, + { + "epoch": 0.0950520116682261, + "grad_norm": 0.7850627899169922, + "learning_rate": 9.950344624427795e-06, + "loss": 0.7999, + "step": 1727 + }, + { + "epoch": 0.09510705047058177, + "grad_norm": 0.921198308467865, + "learning_rate": 9.950283667873916e-06, + "loss": 0.8249, + "step": 1728 + }, + { + "epoch": 0.09516208927293741, + "grad_norm": 0.9503389000892639, + "learning_rate": 9.95022267411503e-06, + "loss": 0.901, + "step": 1729 + }, + { + "epoch": 0.09521712807529308, + "grad_norm": 0.7977343201637268, + "learning_rate": 9.950161643151597e-06, + "loss": 0.838, + "step": 1730 + }, + { + "epoch": 0.09527216687764874, + "grad_norm": 0.9056238532066345, + "learning_rate": 9.950100574984072e-06, + "loss": 0.9756, + "step": 1731 + }, + { + "epoch": 0.0953272056800044, + "grad_norm": 0.8092935681343079, + "learning_rate": 9.950039469612918e-06, + "loss": 0.8812, + "step": 1732 + }, + { + "epoch": 0.09538224448236006, + "grad_norm": 0.823693573474884, + "learning_rate": 9.949978327038592e-06, + "loss": 0.7914, + "step": 1733 + }, + { + "epoch": 0.09543728328471572, + "grad_norm": 0.9114876389503479, + "learning_rate": 9.949917147261554e-06, + "loss": 0.7944, + "step": 1734 + }, + { + "epoch": 0.09549232208707138, + "grad_norm": 1.0084123611450195, + "learning_rate": 9.949855930282262e-06, + "loss": 0.8544, + "step": 1735 + }, + { + "epoch": 0.09554736088942704, + "grad_norm": 0.842462956905365, + "learning_rate": 9.949794676101181e-06, + "loss": 0.7056, + "step": 1736 + }, + { + "epoch": 0.0956023996917827, + "grad_norm": 1.00497305393219, + "learning_rate": 9.949733384718766e-06, + "loss": 0.8372, + "step": 1737 + }, + { + "epoch": 0.09565743849413837, + "grad_norm": 1.0166410207748413, + "learning_rate": 9.94967205613548e-06, + "loss": 0.9316, + "step": 1738 + }, + { + "epoch": 0.09571247729649403, + "grad_norm": 0.8520192503929138, + "learning_rate": 9.949610690351784e-06, + "loss": 0.786, + "step": 1739 + }, + { + "epoch": 0.09576751609884969, + "grad_norm": 0.8003227114677429, + "learning_rate": 9.949549287368139e-06, + "loss": 0.8003, + "step": 1740 + }, + { + "epoch": 0.09582255490120535, + "grad_norm": 0.8657151460647583, + "learning_rate": 9.949487847185006e-06, + "loss": 0.8407, + "step": 1741 + }, + { + "epoch": 0.09587759370356101, + "grad_norm": 1.1119858026504517, + "learning_rate": 9.949426369802848e-06, + "loss": 0.8594, + "step": 1742 + }, + { + "epoch": 0.09593263250591667, + "grad_norm": 0.8968474864959717, + "learning_rate": 9.949364855222126e-06, + "loss": 0.8254, + "step": 1743 + }, + { + "epoch": 0.09598767130827233, + "grad_norm": 0.8740531206130981, + "learning_rate": 9.949303303443304e-06, + "loss": 0.8748, + "step": 1744 + }, + { + "epoch": 0.096042710110628, + "grad_norm": 0.8833459615707397, + "learning_rate": 9.94924171446684e-06, + "loss": 0.838, + "step": 1745 + }, + { + "epoch": 0.09609774891298366, + "grad_norm": 0.8783486485481262, + "learning_rate": 9.949180088293201e-06, + "loss": 0.7972, + "step": 1746 + }, + { + "epoch": 0.09615278771533932, + "grad_norm": 0.9197877049446106, + "learning_rate": 9.949118424922852e-06, + "loss": 0.8669, + "step": 1747 + }, + { + "epoch": 0.09620782651769498, + "grad_norm": 0.9771283864974976, + "learning_rate": 9.949056724356251e-06, + "loss": 0.8461, + "step": 1748 + }, + { + "epoch": 0.09626286532005064, + "grad_norm": 0.8325022459030151, + "learning_rate": 9.948994986593864e-06, + "loss": 0.8482, + "step": 1749 + }, + { + "epoch": 0.0963179041224063, + "grad_norm": 0.9732363224029541, + "learning_rate": 9.948933211636158e-06, + "loss": 0.8825, + "step": 1750 + }, + { + "epoch": 0.09637294292476195, + "grad_norm": 0.8229798078536987, + "learning_rate": 9.948871399483592e-06, + "loss": 0.8079, + "step": 1751 + }, + { + "epoch": 0.09642798172711761, + "grad_norm": 0.8861554265022278, + "learning_rate": 9.948809550136635e-06, + "loss": 0.8323, + "step": 1752 + }, + { + "epoch": 0.09648302052947327, + "grad_norm": 1.0618904829025269, + "learning_rate": 9.94874766359575e-06, + "loss": 0.8519, + "step": 1753 + }, + { + "epoch": 0.09653805933182893, + "grad_norm": 0.8494864702224731, + "learning_rate": 9.948685739861403e-06, + "loss": 0.961, + "step": 1754 + }, + { + "epoch": 0.0965930981341846, + "grad_norm": 0.8872213959693909, + "learning_rate": 9.948623778934058e-06, + "loss": 0.9367, + "step": 1755 + }, + { + "epoch": 0.09664813693654026, + "grad_norm": 0.8441230058670044, + "learning_rate": 9.948561780814181e-06, + "loss": 0.7654, + "step": 1756 + }, + { + "epoch": 0.09670317573889592, + "grad_norm": 0.8072223663330078, + "learning_rate": 9.948499745502239e-06, + "loss": 0.7894, + "step": 1757 + }, + { + "epoch": 0.09675821454125158, + "grad_norm": 0.8285261392593384, + "learning_rate": 9.948437672998696e-06, + "loss": 0.8351, + "step": 1758 + }, + { + "epoch": 0.09681325334360724, + "grad_norm": 0.9272124767303467, + "learning_rate": 9.94837556330402e-06, + "loss": 0.8708, + "step": 1759 + }, + { + "epoch": 0.0968682921459629, + "grad_norm": 0.8689375519752502, + "learning_rate": 9.94831341641868e-06, + "loss": 0.8478, + "step": 1760 + }, + { + "epoch": 0.09692333094831856, + "grad_norm": 1.040784239768982, + "learning_rate": 9.94825123234314e-06, + "loss": 0.8915, + "step": 1761 + }, + { + "epoch": 0.09697836975067423, + "grad_norm": 0.7819718718528748, + "learning_rate": 9.948189011077867e-06, + "loss": 0.7728, + "step": 1762 + }, + { + "epoch": 0.09703340855302989, + "grad_norm": 0.7959379553794861, + "learning_rate": 9.948126752623331e-06, + "loss": 0.8248, + "step": 1763 + }, + { + "epoch": 0.09708844735538555, + "grad_norm": 0.8844753503799438, + "learning_rate": 9.94806445698e-06, + "loss": 0.7742, + "step": 1764 + }, + { + "epoch": 0.09714348615774121, + "grad_norm": 0.9168505668640137, + "learning_rate": 9.948002124148339e-06, + "loss": 0.9145, + "step": 1765 + }, + { + "epoch": 0.09719852496009687, + "grad_norm": 0.7199662923812866, + "learning_rate": 9.947939754128819e-06, + "loss": 0.6652, + "step": 1766 + }, + { + "epoch": 0.09725356376245253, + "grad_norm": 0.866470992565155, + "learning_rate": 9.947877346921909e-06, + "loss": 0.8293, + "step": 1767 + }, + { + "epoch": 0.0973086025648082, + "grad_norm": 0.9124754667282104, + "learning_rate": 9.947814902528078e-06, + "loss": 0.8599, + "step": 1768 + }, + { + "epoch": 0.09736364136716386, + "grad_norm": 0.9169870615005493, + "learning_rate": 9.947752420947792e-06, + "loss": 0.8382, + "step": 1769 + }, + { + "epoch": 0.09741868016951952, + "grad_norm": 1.0147640705108643, + "learning_rate": 9.947689902181526e-06, + "loss": 0.8425, + "step": 1770 + }, + { + "epoch": 0.09747371897187518, + "grad_norm": 0.778575599193573, + "learning_rate": 9.947627346229745e-06, + "loss": 0.6979, + "step": 1771 + }, + { + "epoch": 0.09752875777423083, + "grad_norm": 0.815101146697998, + "learning_rate": 9.947564753092922e-06, + "loss": 0.8617, + "step": 1772 + }, + { + "epoch": 0.09758379657658649, + "grad_norm": 0.9556358456611633, + "learning_rate": 9.947502122771527e-06, + "loss": 0.9009, + "step": 1773 + }, + { + "epoch": 0.09763883537894215, + "grad_norm": 0.8603761196136475, + "learning_rate": 9.94743945526603e-06, + "loss": 0.9443, + "step": 1774 + }, + { + "epoch": 0.09769387418129781, + "grad_norm": 0.8621761798858643, + "learning_rate": 9.947376750576903e-06, + "loss": 0.7537, + "step": 1775 + }, + { + "epoch": 0.09774891298365347, + "grad_norm": 0.7399948835372925, + "learning_rate": 9.947314008704616e-06, + "loss": 0.7477, + "step": 1776 + }, + { + "epoch": 0.09780395178600913, + "grad_norm": 0.8855582475662231, + "learning_rate": 9.947251229649641e-06, + "loss": 0.8745, + "step": 1777 + }, + { + "epoch": 0.0978589905883648, + "grad_norm": 0.8718472719192505, + "learning_rate": 9.947188413412452e-06, + "loss": 0.9672, + "step": 1778 + }, + { + "epoch": 0.09791402939072046, + "grad_norm": 0.8598514795303345, + "learning_rate": 9.947125559993517e-06, + "loss": 0.8278, + "step": 1779 + }, + { + "epoch": 0.09796906819307612, + "grad_norm": 1.0373798608779907, + "learning_rate": 9.947062669393312e-06, + "loss": 0.8123, + "step": 1780 + }, + { + "epoch": 0.09802410699543178, + "grad_norm": 1.0198705196380615, + "learning_rate": 9.946999741612306e-06, + "loss": 0.9039, + "step": 1781 + }, + { + "epoch": 0.09807914579778744, + "grad_norm": 0.8770025968551636, + "learning_rate": 9.946936776650977e-06, + "loss": 0.8326, + "step": 1782 + }, + { + "epoch": 0.0981341846001431, + "grad_norm": 0.7970215678215027, + "learning_rate": 9.946873774509794e-06, + "loss": 0.848, + "step": 1783 + }, + { + "epoch": 0.09818922340249876, + "grad_norm": 0.90342777967453, + "learning_rate": 9.946810735189231e-06, + "loss": 0.7993, + "step": 1784 + }, + { + "epoch": 0.09824426220485442, + "grad_norm": 1.2095681428909302, + "learning_rate": 9.946747658689763e-06, + "loss": 0.8544, + "step": 1785 + }, + { + "epoch": 0.09829930100721009, + "grad_norm": 0.8500953316688538, + "learning_rate": 9.946684545011866e-06, + "loss": 0.8398, + "step": 1786 + }, + { + "epoch": 0.09835433980956575, + "grad_norm": 0.8570724725723267, + "learning_rate": 9.946621394156011e-06, + "loss": 0.9255, + "step": 1787 + }, + { + "epoch": 0.09840937861192141, + "grad_norm": 0.8314846158027649, + "learning_rate": 9.946558206122672e-06, + "loss": 0.8398, + "step": 1788 + }, + { + "epoch": 0.09846441741427707, + "grad_norm": 0.8894716501235962, + "learning_rate": 9.946494980912326e-06, + "loss": 0.8612, + "step": 1789 + }, + { + "epoch": 0.09851945621663273, + "grad_norm": 0.9555756449699402, + "learning_rate": 9.94643171852545e-06, + "loss": 0.9551, + "step": 1790 + }, + { + "epoch": 0.09857449501898839, + "grad_norm": 0.9556692838668823, + "learning_rate": 9.946368418962515e-06, + "loss": 0.8175, + "step": 1791 + }, + { + "epoch": 0.09862953382134405, + "grad_norm": 0.7288535833358765, + "learning_rate": 9.946305082224e-06, + "loss": 0.6162, + "step": 1792 + }, + { + "epoch": 0.09868457262369972, + "grad_norm": 0.95478355884552, + "learning_rate": 9.94624170831038e-06, + "loss": 0.9089, + "step": 1793 + }, + { + "epoch": 0.09873961142605536, + "grad_norm": 0.9080137610435486, + "learning_rate": 9.946178297222133e-06, + "loss": 0.9443, + "step": 1794 + }, + { + "epoch": 0.09879465022841102, + "grad_norm": 0.8060124516487122, + "learning_rate": 9.946114848959732e-06, + "loss": 0.7412, + "step": 1795 + }, + { + "epoch": 0.09884968903076669, + "grad_norm": 0.8487932085990906, + "learning_rate": 9.946051363523655e-06, + "loss": 0.7098, + "step": 1796 + }, + { + "epoch": 0.09890472783312235, + "grad_norm": 0.8982037901878357, + "learning_rate": 9.945987840914381e-06, + "loss": 0.8304, + "step": 1797 + }, + { + "epoch": 0.09895976663547801, + "grad_norm": 0.8124602437019348, + "learning_rate": 9.945924281132386e-06, + "loss": 0.8441, + "step": 1798 + }, + { + "epoch": 0.09901480543783367, + "grad_norm": 0.8081663250923157, + "learning_rate": 9.945860684178147e-06, + "loss": 0.732, + "step": 1799 + }, + { + "epoch": 0.09906984424018933, + "grad_norm": 0.7662907242774963, + "learning_rate": 9.945797050052147e-06, + "loss": 0.7538, + "step": 1800 + }, + { + "epoch": 0.09912488304254499, + "grad_norm": 0.8418399095535278, + "learning_rate": 9.945733378754856e-06, + "loss": 0.8488, + "step": 1801 + }, + { + "epoch": 0.09917992184490065, + "grad_norm": 0.7298988699913025, + "learning_rate": 9.94566967028676e-06, + "loss": 0.7822, + "step": 1802 + }, + { + "epoch": 0.09923496064725632, + "grad_norm": 0.7788695693016052, + "learning_rate": 9.945605924648332e-06, + "loss": 0.8037, + "step": 1803 + }, + { + "epoch": 0.09928999944961198, + "grad_norm": 0.939297080039978, + "learning_rate": 9.945542141840054e-06, + "loss": 0.8654, + "step": 1804 + }, + { + "epoch": 0.09934503825196764, + "grad_norm": 0.9274358749389648, + "learning_rate": 9.945478321862406e-06, + "loss": 0.7712, + "step": 1805 + }, + { + "epoch": 0.0994000770543233, + "grad_norm": 0.816561222076416, + "learning_rate": 9.945414464715866e-06, + "loss": 0.7676, + "step": 1806 + }, + { + "epoch": 0.09945511585667896, + "grad_norm": 0.867915153503418, + "learning_rate": 9.945350570400916e-06, + "loss": 0.8343, + "step": 1807 + }, + { + "epoch": 0.09951015465903462, + "grad_norm": 0.8446162939071655, + "learning_rate": 9.945286638918034e-06, + "loss": 0.8128, + "step": 1808 + }, + { + "epoch": 0.09956519346139028, + "grad_norm": 0.8372986316680908, + "learning_rate": 9.945222670267703e-06, + "loss": 0.8611, + "step": 1809 + }, + { + "epoch": 0.09962023226374594, + "grad_norm": 0.787836492061615, + "learning_rate": 9.945158664450399e-06, + "loss": 0.7286, + "step": 1810 + }, + { + "epoch": 0.0996752710661016, + "grad_norm": 0.9293436408042908, + "learning_rate": 9.945094621466609e-06, + "loss": 0.8699, + "step": 1811 + }, + { + "epoch": 0.09973030986845727, + "grad_norm": 0.8336932063102722, + "learning_rate": 9.94503054131681e-06, + "loss": 0.8222, + "step": 1812 + }, + { + "epoch": 0.09978534867081293, + "grad_norm": 0.8310953378677368, + "learning_rate": 9.944966424001486e-06, + "loss": 0.8131, + "step": 1813 + }, + { + "epoch": 0.09984038747316859, + "grad_norm": 0.7703443169593811, + "learning_rate": 9.944902269521117e-06, + "loss": 0.8135, + "step": 1814 + }, + { + "epoch": 0.09989542627552424, + "grad_norm": 0.750990092754364, + "learning_rate": 9.944838077876186e-06, + "loss": 0.8137, + "step": 1815 + }, + { + "epoch": 0.0999504650778799, + "grad_norm": 0.8502481579780579, + "learning_rate": 9.944773849067178e-06, + "loss": 0.8973, + "step": 1816 + }, + { + "epoch": 0.10000550388023556, + "grad_norm": 0.8299791812896729, + "learning_rate": 9.94470958309457e-06, + "loss": 0.8341, + "step": 1817 + }, + { + "epoch": 0.10006054268259122, + "grad_norm": 0.8519022464752197, + "learning_rate": 9.94464527995885e-06, + "loss": 0.8529, + "step": 1818 + }, + { + "epoch": 0.10011558148494688, + "grad_norm": 0.9318063259124756, + "learning_rate": 9.944580939660501e-06, + "loss": 0.8978, + "step": 1819 + }, + { + "epoch": 0.10017062028730254, + "grad_norm": 0.847023069858551, + "learning_rate": 9.944516562200004e-06, + "loss": 0.8007, + "step": 1820 + }, + { + "epoch": 0.1002256590896582, + "grad_norm": 0.8817011117935181, + "learning_rate": 9.944452147577844e-06, + "loss": 0.8819, + "step": 1821 + }, + { + "epoch": 0.10028069789201387, + "grad_norm": 0.8560144901275635, + "learning_rate": 9.944387695794505e-06, + "loss": 0.8219, + "step": 1822 + }, + { + "epoch": 0.10033573669436953, + "grad_norm": 0.9358342885971069, + "learning_rate": 9.944323206850472e-06, + "loss": 0.8533, + "step": 1823 + }, + { + "epoch": 0.10039077549672519, + "grad_norm": 0.8327087163925171, + "learning_rate": 9.94425868074623e-06, + "loss": 0.8359, + "step": 1824 + }, + { + "epoch": 0.10044581429908085, + "grad_norm": 1.0590367317199707, + "learning_rate": 9.944194117482263e-06, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 0.10050085310143651, + "grad_norm": 0.8739829063415527, + "learning_rate": 9.944129517059055e-06, + "loss": 0.7868, + "step": 1826 + }, + { + "epoch": 0.10055589190379217, + "grad_norm": 0.8465235233306885, + "learning_rate": 9.944064879477093e-06, + "loss": 0.8554, + "step": 1827 + }, + { + "epoch": 0.10061093070614784, + "grad_norm": 0.9068321585655212, + "learning_rate": 9.944000204736864e-06, + "loss": 0.8648, + "step": 1828 + }, + { + "epoch": 0.1006659695085035, + "grad_norm": 0.8308066725730896, + "learning_rate": 9.943935492838853e-06, + "loss": 0.8471, + "step": 1829 + }, + { + "epoch": 0.10072100831085916, + "grad_norm": 0.9973901510238647, + "learning_rate": 9.943870743783545e-06, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.10077604711321482, + "grad_norm": 0.8532593250274658, + "learning_rate": 9.94380595757143e-06, + "loss": 0.9001, + "step": 1831 + }, + { + "epoch": 0.10083108591557048, + "grad_norm": 0.8571139574050903, + "learning_rate": 9.94374113420299e-06, + "loss": 0.85, + "step": 1832 + }, + { + "epoch": 0.10088612471792614, + "grad_norm": 0.905624508857727, + "learning_rate": 9.943676273678717e-06, + "loss": 0.9587, + "step": 1833 + }, + { + "epoch": 0.1009411635202818, + "grad_norm": 1.0224663019180298, + "learning_rate": 9.943611375999097e-06, + "loss": 0.8236, + "step": 1834 + }, + { + "epoch": 0.10099620232263747, + "grad_norm": 0.8900588154792786, + "learning_rate": 9.943546441164615e-06, + "loss": 0.877, + "step": 1835 + }, + { + "epoch": 0.10105124112499313, + "grad_norm": 0.8852938413619995, + "learning_rate": 9.943481469175765e-06, + "loss": 0.9521, + "step": 1836 + }, + { + "epoch": 0.10110627992734877, + "grad_norm": 0.9249371290206909, + "learning_rate": 9.943416460033027e-06, + "loss": 0.8541, + "step": 1837 + }, + { + "epoch": 0.10116131872970444, + "grad_norm": 0.8533583283424377, + "learning_rate": 9.943351413736897e-06, + "loss": 0.8571, + "step": 1838 + }, + { + "epoch": 0.1012163575320601, + "grad_norm": 0.743800699710846, + "learning_rate": 9.94328633028786e-06, + "loss": 0.749, + "step": 1839 + }, + { + "epoch": 0.10127139633441576, + "grad_norm": 0.7836641669273376, + "learning_rate": 9.943221209686407e-06, + "loss": 0.8237, + "step": 1840 + }, + { + "epoch": 0.10132643513677142, + "grad_norm": 0.800782322883606, + "learning_rate": 9.943156051933024e-06, + "loss": 0.8323, + "step": 1841 + }, + { + "epoch": 0.10138147393912708, + "grad_norm": 0.7531478404998779, + "learning_rate": 9.943090857028206e-06, + "loss": 0.8041, + "step": 1842 + }, + { + "epoch": 0.10143651274148274, + "grad_norm": 0.9837996959686279, + "learning_rate": 9.94302562497244e-06, + "loss": 0.8084, + "step": 1843 + }, + { + "epoch": 0.1014915515438384, + "grad_norm": 0.8038331866264343, + "learning_rate": 9.942960355766216e-06, + "loss": 0.8454, + "step": 1844 + }, + { + "epoch": 0.10154659034619407, + "grad_norm": 0.7822145819664001, + "learning_rate": 9.942895049410024e-06, + "loss": 0.8137, + "step": 1845 + }, + { + "epoch": 0.10160162914854973, + "grad_norm": 0.8222663998603821, + "learning_rate": 9.942829705904358e-06, + "loss": 0.8981, + "step": 1846 + }, + { + "epoch": 0.10165666795090539, + "grad_norm": 1.0095717906951904, + "learning_rate": 9.942764325249707e-06, + "loss": 0.9159, + "step": 1847 + }, + { + "epoch": 0.10171170675326105, + "grad_norm": 0.8264054656028748, + "learning_rate": 9.942698907446561e-06, + "loss": 0.9233, + "step": 1848 + }, + { + "epoch": 0.10176674555561671, + "grad_norm": 0.8244288563728333, + "learning_rate": 9.942633452495414e-06, + "loss": 0.8507, + "step": 1849 + }, + { + "epoch": 0.10182178435797237, + "grad_norm": 0.8457715511322021, + "learning_rate": 9.942567960396755e-06, + "loss": 0.7897, + "step": 1850 + }, + { + "epoch": 0.10187682316032803, + "grad_norm": 0.8356698155403137, + "learning_rate": 9.94250243115108e-06, + "loss": 0.7927, + "step": 1851 + }, + { + "epoch": 0.1019318619626837, + "grad_norm": 0.8251230716705322, + "learning_rate": 9.94243686475888e-06, + "loss": 0.8977, + "step": 1852 + }, + { + "epoch": 0.10198690076503936, + "grad_norm": 0.8370125889778137, + "learning_rate": 9.942371261220647e-06, + "loss": 0.8204, + "step": 1853 + }, + { + "epoch": 0.10204193956739502, + "grad_norm": 1.6722066402435303, + "learning_rate": 9.942305620536876e-06, + "loss": 0.9284, + "step": 1854 + }, + { + "epoch": 0.10209697836975068, + "grad_norm": 0.8424906730651855, + "learning_rate": 9.942239942708057e-06, + "loss": 0.833, + "step": 1855 + }, + { + "epoch": 0.10215201717210634, + "grad_norm": 0.7475115656852722, + "learning_rate": 9.942174227734686e-06, + "loss": 0.6158, + "step": 1856 + }, + { + "epoch": 0.102207055974462, + "grad_norm": 0.8652095198631287, + "learning_rate": 9.942108475617256e-06, + "loss": 0.8781, + "step": 1857 + }, + { + "epoch": 0.10226209477681765, + "grad_norm": 1.0621691942214966, + "learning_rate": 9.942042686356263e-06, + "loss": 1.0276, + "step": 1858 + }, + { + "epoch": 0.10231713357917331, + "grad_norm": 1.113357424736023, + "learning_rate": 9.941976859952199e-06, + "loss": 0.8799, + "step": 1859 + }, + { + "epoch": 0.10237217238152897, + "grad_norm": 0.9153568148612976, + "learning_rate": 9.94191099640556e-06, + "loss": 0.7988, + "step": 1860 + }, + { + "epoch": 0.10242721118388463, + "grad_norm": 0.9217341542243958, + "learning_rate": 9.941845095716842e-06, + "loss": 0.7785, + "step": 1861 + }, + { + "epoch": 0.1024822499862403, + "grad_norm": 0.8702190518379211, + "learning_rate": 9.941779157886538e-06, + "loss": 0.7648, + "step": 1862 + }, + { + "epoch": 0.10253728878859596, + "grad_norm": 0.8609822988510132, + "learning_rate": 9.941713182915144e-06, + "loss": 0.9095, + "step": 1863 + }, + { + "epoch": 0.10259232759095162, + "grad_norm": 0.7766719460487366, + "learning_rate": 9.941647170803157e-06, + "loss": 0.6984, + "step": 1864 + }, + { + "epoch": 0.10264736639330728, + "grad_norm": 0.8497375249862671, + "learning_rate": 9.941581121551074e-06, + "loss": 0.9161, + "step": 1865 + }, + { + "epoch": 0.10270240519566294, + "grad_norm": 0.8007600903511047, + "learning_rate": 9.941515035159388e-06, + "loss": 0.8099, + "step": 1866 + }, + { + "epoch": 0.1027574439980186, + "grad_norm": 0.7932959794998169, + "learning_rate": 9.941448911628599e-06, + "loss": 0.8049, + "step": 1867 + }, + { + "epoch": 0.10281248280037426, + "grad_norm": 1.3169244527816772, + "learning_rate": 9.941382750959203e-06, + "loss": 0.8601, + "step": 1868 + }, + { + "epoch": 0.10286752160272992, + "grad_norm": 0.8011140823364258, + "learning_rate": 9.941316553151696e-06, + "loss": 0.8397, + "step": 1869 + }, + { + "epoch": 0.10292256040508559, + "grad_norm": 0.811210572719574, + "learning_rate": 9.941250318206577e-06, + "loss": 0.7863, + "step": 1870 + }, + { + "epoch": 0.10297759920744125, + "grad_norm": 0.8172751665115356, + "learning_rate": 9.941184046124342e-06, + "loss": 0.8114, + "step": 1871 + }, + { + "epoch": 0.10303263800979691, + "grad_norm": 0.8072887063026428, + "learning_rate": 9.941117736905493e-06, + "loss": 0.8928, + "step": 1872 + }, + { + "epoch": 0.10308767681215257, + "grad_norm": 0.9111380577087402, + "learning_rate": 9.941051390550524e-06, + "loss": 0.866, + "step": 1873 + }, + { + "epoch": 0.10314271561450823, + "grad_norm": 0.8158383369445801, + "learning_rate": 9.940985007059936e-06, + "loss": 0.7805, + "step": 1874 + }, + { + "epoch": 0.1031977544168639, + "grad_norm": 0.8858961462974548, + "learning_rate": 9.940918586434226e-06, + "loss": 0.8424, + "step": 1875 + }, + { + "epoch": 0.10325279321921955, + "grad_norm": 0.8835182189941406, + "learning_rate": 9.940852128673895e-06, + "loss": 0.7816, + "step": 1876 + }, + { + "epoch": 0.10330783202157522, + "grad_norm": 1.044227123260498, + "learning_rate": 9.940785633779444e-06, + "loss": 0.8952, + "step": 1877 + }, + { + "epoch": 0.10336287082393088, + "grad_norm": 0.8255050778388977, + "learning_rate": 9.940719101751367e-06, + "loss": 0.8215, + "step": 1878 + }, + { + "epoch": 0.10341790962628654, + "grad_norm": 0.8561689257621765, + "learning_rate": 9.940652532590172e-06, + "loss": 0.9686, + "step": 1879 + }, + { + "epoch": 0.10347294842864219, + "grad_norm": 0.8798959255218506, + "learning_rate": 9.94058592629635e-06, + "loss": 0.8993, + "step": 1880 + }, + { + "epoch": 0.10352798723099785, + "grad_norm": 0.9292098879814148, + "learning_rate": 9.940519282870411e-06, + "loss": 0.8536, + "step": 1881 + }, + { + "epoch": 0.10358302603335351, + "grad_norm": 0.8865400552749634, + "learning_rate": 9.940452602312851e-06, + "loss": 0.8024, + "step": 1882 + }, + { + "epoch": 0.10363806483570917, + "grad_norm": 0.8985510468482971, + "learning_rate": 9.94038588462417e-06, + "loss": 0.7748, + "step": 1883 + }, + { + "epoch": 0.10369310363806483, + "grad_norm": 0.9973617196083069, + "learning_rate": 9.940319129804872e-06, + "loss": 0.875, + "step": 1884 + }, + { + "epoch": 0.1037481424404205, + "grad_norm": 0.8615350723266602, + "learning_rate": 9.940252337855458e-06, + "loss": 0.904, + "step": 1885 + }, + { + "epoch": 0.10380318124277615, + "grad_norm": 0.8752412796020508, + "learning_rate": 9.940185508776429e-06, + "loss": 0.8735, + "step": 1886 + }, + { + "epoch": 0.10385822004513182, + "grad_norm": 0.8639446496963501, + "learning_rate": 9.94011864256829e-06, + "loss": 0.7952, + "step": 1887 + }, + { + "epoch": 0.10391325884748748, + "grad_norm": 0.7932116389274597, + "learning_rate": 9.94005173923154e-06, + "loss": 0.8721, + "step": 1888 + }, + { + "epoch": 0.10396829764984314, + "grad_norm": 0.8573791980743408, + "learning_rate": 9.939984798766685e-06, + "loss": 0.9271, + "step": 1889 + }, + { + "epoch": 0.1040233364521988, + "grad_norm": 0.9080122113227844, + "learning_rate": 9.939917821174225e-06, + "loss": 0.8991, + "step": 1890 + }, + { + "epoch": 0.10407837525455446, + "grad_norm": 0.7883808612823486, + "learning_rate": 9.939850806454664e-06, + "loss": 0.6895, + "step": 1891 + }, + { + "epoch": 0.10413341405691012, + "grad_norm": 0.8067768216133118, + "learning_rate": 9.93978375460851e-06, + "loss": 0.835, + "step": 1892 + }, + { + "epoch": 0.10418845285926578, + "grad_norm": 0.8756459951400757, + "learning_rate": 9.939716665636262e-06, + "loss": 0.8144, + "step": 1893 + }, + { + "epoch": 0.10424349166162145, + "grad_norm": 0.8056700825691223, + "learning_rate": 9.939649539538425e-06, + "loss": 0.7454, + "step": 1894 + }, + { + "epoch": 0.10429853046397711, + "grad_norm": 1.0756300687789917, + "learning_rate": 9.939582376315505e-06, + "loss": 0.8096, + "step": 1895 + }, + { + "epoch": 0.10435356926633277, + "grad_norm": 0.8938102126121521, + "learning_rate": 9.939515175968006e-06, + "loss": 0.7496, + "step": 1896 + }, + { + "epoch": 0.10440860806868843, + "grad_norm": 0.9371656775474548, + "learning_rate": 9.939447938496434e-06, + "loss": 0.9817, + "step": 1897 + }, + { + "epoch": 0.10446364687104409, + "grad_norm": 1.0216082334518433, + "learning_rate": 9.939380663901292e-06, + "loss": 0.8804, + "step": 1898 + }, + { + "epoch": 0.10451868567339975, + "grad_norm": 0.8791126012802124, + "learning_rate": 9.939313352183088e-06, + "loss": 0.7811, + "step": 1899 + }, + { + "epoch": 0.10457372447575541, + "grad_norm": 0.9925445914268494, + "learning_rate": 9.939246003342326e-06, + "loss": 0.8892, + "step": 1900 + }, + { + "epoch": 0.10462876327811106, + "grad_norm": 1.0459916591644287, + "learning_rate": 9.939178617379514e-06, + "loss": 0.7938, + "step": 1901 + }, + { + "epoch": 0.10468380208046672, + "grad_norm": 0.9103816747665405, + "learning_rate": 9.93911119429516e-06, + "loss": 0.8282, + "step": 1902 + }, + { + "epoch": 0.10473884088282238, + "grad_norm": 0.9602296352386475, + "learning_rate": 9.939043734089764e-06, + "loss": 0.919, + "step": 1903 + }, + { + "epoch": 0.10479387968517805, + "grad_norm": 0.9529246687889099, + "learning_rate": 9.93897623676384e-06, + "loss": 0.9469, + "step": 1904 + }, + { + "epoch": 0.10484891848753371, + "grad_norm": 0.9619705080986023, + "learning_rate": 9.938908702317893e-06, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.10490395728988937, + "grad_norm": 1.0106935501098633, + "learning_rate": 9.938841130752428e-06, + "loss": 0.7502, + "step": 1906 + }, + { + "epoch": 0.10495899609224503, + "grad_norm": 0.913985013961792, + "learning_rate": 9.938773522067957e-06, + "loss": 0.8172, + "step": 1907 + }, + { + "epoch": 0.10501403489460069, + "grad_norm": 0.9474983215332031, + "learning_rate": 9.938705876264985e-06, + "loss": 0.8999, + "step": 1908 + }, + { + "epoch": 0.10506907369695635, + "grad_norm": 0.9185097813606262, + "learning_rate": 9.938638193344024e-06, + "loss": 0.8976, + "step": 1909 + }, + { + "epoch": 0.10512411249931201, + "grad_norm": 0.7633675932884216, + "learning_rate": 9.938570473305578e-06, + "loss": 0.7777, + "step": 1910 + }, + { + "epoch": 0.10517915130166768, + "grad_norm": 0.9547691345214844, + "learning_rate": 9.938502716150159e-06, + "loss": 0.8154, + "step": 1911 + }, + { + "epoch": 0.10523419010402334, + "grad_norm": 0.8556191921234131, + "learning_rate": 9.938434921878275e-06, + "loss": 0.828, + "step": 1912 + }, + { + "epoch": 0.105289228906379, + "grad_norm": 0.9826140999794006, + "learning_rate": 9.938367090490437e-06, + "loss": 0.8085, + "step": 1913 + }, + { + "epoch": 0.10534426770873466, + "grad_norm": 0.8610432744026184, + "learning_rate": 9.938299221987154e-06, + "loss": 0.9103, + "step": 1914 + }, + { + "epoch": 0.10539930651109032, + "grad_norm": 0.8383543491363525, + "learning_rate": 9.938231316368934e-06, + "loss": 0.8182, + "step": 1915 + }, + { + "epoch": 0.10545434531344598, + "grad_norm": 0.8552964925765991, + "learning_rate": 9.93816337363629e-06, + "loss": 0.8024, + "step": 1916 + }, + { + "epoch": 0.10550938411580164, + "grad_norm": 0.9255730509757996, + "learning_rate": 9.938095393789732e-06, + "loss": 0.8566, + "step": 1917 + }, + { + "epoch": 0.1055644229181573, + "grad_norm": 0.9882987141609192, + "learning_rate": 9.938027376829774e-06, + "loss": 0.7119, + "step": 1918 + }, + { + "epoch": 0.10561946172051297, + "grad_norm": 1.139404535293579, + "learning_rate": 9.93795932275692e-06, + "loss": 0.8839, + "step": 1919 + }, + { + "epoch": 0.10567450052286863, + "grad_norm": 1.004782795906067, + "learning_rate": 9.937891231571686e-06, + "loss": 0.904, + "step": 1920 + }, + { + "epoch": 0.10572953932522429, + "grad_norm": 0.8437260389328003, + "learning_rate": 9.937823103274585e-06, + "loss": 0.7942, + "step": 1921 + }, + { + "epoch": 0.10578457812757995, + "grad_norm": 1.1388722658157349, + "learning_rate": 9.937754937866127e-06, + "loss": 0.9491, + "step": 1922 + }, + { + "epoch": 0.1058396169299356, + "grad_norm": 0.9266740083694458, + "learning_rate": 9.937686735346823e-06, + "loss": 0.9067, + "step": 1923 + }, + { + "epoch": 0.10589465573229126, + "grad_norm": 0.7536123991012573, + "learning_rate": 9.93761849571719e-06, + "loss": 0.6533, + "step": 1924 + }, + { + "epoch": 0.10594969453464692, + "grad_norm": 0.8781737089157104, + "learning_rate": 9.937550218977737e-06, + "loss": 0.8319, + "step": 1925 + }, + { + "epoch": 0.10600473333700258, + "grad_norm": 0.8577924966812134, + "learning_rate": 9.937481905128976e-06, + "loss": 0.8604, + "step": 1926 + }, + { + "epoch": 0.10605977213935824, + "grad_norm": 0.8351713418960571, + "learning_rate": 9.937413554171424e-06, + "loss": 0.946, + "step": 1927 + }, + { + "epoch": 0.1061148109417139, + "grad_norm": 0.971491813659668, + "learning_rate": 9.937345166105594e-06, + "loss": 0.7383, + "step": 1928 + }, + { + "epoch": 0.10616984974406957, + "grad_norm": 0.8020079731941223, + "learning_rate": 9.937276740932001e-06, + "loss": 0.7468, + "step": 1929 + }, + { + "epoch": 0.10622488854642523, + "grad_norm": 0.9057347178459167, + "learning_rate": 9.937208278651153e-06, + "loss": 0.8223, + "step": 1930 + }, + { + "epoch": 0.10627992734878089, + "grad_norm": 0.8384734392166138, + "learning_rate": 9.937139779263574e-06, + "loss": 0.8773, + "step": 1931 + }, + { + "epoch": 0.10633496615113655, + "grad_norm": 0.8732065558433533, + "learning_rate": 9.93707124276977e-06, + "loss": 0.8265, + "step": 1932 + }, + { + "epoch": 0.10639000495349221, + "grad_norm": 0.8744868040084839, + "learning_rate": 9.937002669170264e-06, + "loss": 0.8497, + "step": 1933 + }, + { + "epoch": 0.10644504375584787, + "grad_norm": 0.8589879870414734, + "learning_rate": 9.936934058465564e-06, + "loss": 0.8116, + "step": 1934 + }, + { + "epoch": 0.10650008255820353, + "grad_norm": 0.8614563941955566, + "learning_rate": 9.936865410656192e-06, + "loss": 0.7823, + "step": 1935 + }, + { + "epoch": 0.1065551213605592, + "grad_norm": 0.8381434082984924, + "learning_rate": 9.93679672574266e-06, + "loss": 0.7889, + "step": 1936 + }, + { + "epoch": 0.10661016016291486, + "grad_norm": 0.9834293127059937, + "learning_rate": 9.936728003725484e-06, + "loss": 0.8358, + "step": 1937 + }, + { + "epoch": 0.10666519896527052, + "grad_norm": 0.8461851477622986, + "learning_rate": 9.936659244605184e-06, + "loss": 0.8408, + "step": 1938 + }, + { + "epoch": 0.10672023776762618, + "grad_norm": 1.0186371803283691, + "learning_rate": 9.936590448382273e-06, + "loss": 0.8118, + "step": 1939 + }, + { + "epoch": 0.10677527656998184, + "grad_norm": 0.866321325302124, + "learning_rate": 9.93652161505727e-06, + "loss": 0.8696, + "step": 1940 + }, + { + "epoch": 0.1068303153723375, + "grad_norm": 0.9179622530937195, + "learning_rate": 9.936452744630692e-06, + "loss": 0.8419, + "step": 1941 + }, + { + "epoch": 0.10688535417469316, + "grad_norm": 0.8250496983528137, + "learning_rate": 9.936383837103057e-06, + "loss": 0.8511, + "step": 1942 + }, + { + "epoch": 0.10694039297704883, + "grad_norm": 0.8475700616836548, + "learning_rate": 9.936314892474883e-06, + "loss": 0.8404, + "step": 1943 + }, + { + "epoch": 0.10699543177940447, + "grad_norm": 0.774334192276001, + "learning_rate": 9.936245910746684e-06, + "loss": 0.7461, + "step": 1944 + }, + { + "epoch": 0.10705047058176013, + "grad_norm": 0.9313948154449463, + "learning_rate": 9.936176891918986e-06, + "loss": 0.8486, + "step": 1945 + }, + { + "epoch": 0.1071055093841158, + "grad_norm": 0.8784124255180359, + "learning_rate": 9.936107835992304e-06, + "loss": 0.84, + "step": 1946 + }, + { + "epoch": 0.10716054818647146, + "grad_norm": 0.9087465405464172, + "learning_rate": 9.936038742967154e-06, + "loss": 0.9012, + "step": 1947 + }, + { + "epoch": 0.10721558698882712, + "grad_norm": 0.8462012410163879, + "learning_rate": 9.93596961284406e-06, + "loss": 0.9193, + "step": 1948 + }, + { + "epoch": 0.10727062579118278, + "grad_norm": 0.8984553813934326, + "learning_rate": 9.935900445623538e-06, + "loss": 0.781, + "step": 1949 + }, + { + "epoch": 0.10732566459353844, + "grad_norm": 0.9197295308113098, + "learning_rate": 9.935831241306111e-06, + "loss": 0.8861, + "step": 1950 + }, + { + "epoch": 0.1073807033958941, + "grad_norm": 0.8452801704406738, + "learning_rate": 9.935761999892296e-06, + "loss": 0.8649, + "step": 1951 + }, + { + "epoch": 0.10743574219824976, + "grad_norm": 0.8047192096710205, + "learning_rate": 9.935692721382618e-06, + "loss": 0.8704, + "step": 1952 + }, + { + "epoch": 0.10749078100060543, + "grad_norm": 0.9536359906196594, + "learning_rate": 9.935623405777593e-06, + "loss": 0.7803, + "step": 1953 + }, + { + "epoch": 0.10754581980296109, + "grad_norm": 0.8215291500091553, + "learning_rate": 9.935554053077744e-06, + "loss": 0.8247, + "step": 1954 + }, + { + "epoch": 0.10760085860531675, + "grad_norm": 0.9261930584907532, + "learning_rate": 9.93548466328359e-06, + "loss": 0.8594, + "step": 1955 + }, + { + "epoch": 0.10765589740767241, + "grad_norm": 0.7973492741584778, + "learning_rate": 9.935415236395656e-06, + "loss": 0.7464, + "step": 1956 + }, + { + "epoch": 0.10771093621002807, + "grad_norm": 0.9328988790512085, + "learning_rate": 9.935345772414463e-06, + "loss": 0.8472, + "step": 1957 + }, + { + "epoch": 0.10776597501238373, + "grad_norm": 0.9490759968757629, + "learning_rate": 9.935276271340532e-06, + "loss": 0.806, + "step": 1958 + }, + { + "epoch": 0.1078210138147394, + "grad_norm": 0.9149925112724304, + "learning_rate": 9.935206733174385e-06, + "loss": 0.8741, + "step": 1959 + }, + { + "epoch": 0.10787605261709506, + "grad_norm": 1.0074039697647095, + "learning_rate": 9.935137157916546e-06, + "loss": 0.8493, + "step": 1960 + }, + { + "epoch": 0.10793109141945072, + "grad_norm": 0.8783678412437439, + "learning_rate": 9.935067545567535e-06, + "loss": 0.8132, + "step": 1961 + }, + { + "epoch": 0.10798613022180638, + "grad_norm": 0.8273885250091553, + "learning_rate": 9.934997896127879e-06, + "loss": 0.7448, + "step": 1962 + }, + { + "epoch": 0.10804116902416204, + "grad_norm": 0.761947512626648, + "learning_rate": 9.9349282095981e-06, + "loss": 0.7933, + "step": 1963 + }, + { + "epoch": 0.1080962078265177, + "grad_norm": 0.814809262752533, + "learning_rate": 9.934858485978722e-06, + "loss": 0.7551, + "step": 1964 + }, + { + "epoch": 0.10815124662887336, + "grad_norm": 0.8108895421028137, + "learning_rate": 9.934788725270266e-06, + "loss": 0.6787, + "step": 1965 + }, + { + "epoch": 0.10820628543122901, + "grad_norm": 0.8669139742851257, + "learning_rate": 9.934718927473262e-06, + "loss": 0.8395, + "step": 1966 + }, + { + "epoch": 0.10826132423358467, + "grad_norm": 0.9093756079673767, + "learning_rate": 9.93464909258823e-06, + "loss": 0.8341, + "step": 1967 + }, + { + "epoch": 0.10831636303594033, + "grad_norm": 0.8923841714859009, + "learning_rate": 9.934579220615697e-06, + "loss": 0.9422, + "step": 1968 + }, + { + "epoch": 0.108371401838296, + "grad_norm": 0.850429117679596, + "learning_rate": 9.934509311556186e-06, + "loss": 0.8446, + "step": 1969 + }, + { + "epoch": 0.10842644064065166, + "grad_norm": 0.8762460350990295, + "learning_rate": 9.934439365410224e-06, + "loss": 0.7788, + "step": 1970 + }, + { + "epoch": 0.10848147944300732, + "grad_norm": 0.9700387716293335, + "learning_rate": 9.934369382178338e-06, + "loss": 0.8455, + "step": 1971 + }, + { + "epoch": 0.10853651824536298, + "grad_norm": 0.8003185987472534, + "learning_rate": 9.934299361861053e-06, + "loss": 0.8026, + "step": 1972 + }, + { + "epoch": 0.10859155704771864, + "grad_norm": 0.9626984596252441, + "learning_rate": 9.934229304458893e-06, + "loss": 0.8219, + "step": 1973 + }, + { + "epoch": 0.1086465958500743, + "grad_norm": 0.8722280859947205, + "learning_rate": 9.934159209972386e-06, + "loss": 0.8866, + "step": 1974 + }, + { + "epoch": 0.10870163465242996, + "grad_norm": 0.838736355304718, + "learning_rate": 9.934089078402061e-06, + "loss": 0.7723, + "step": 1975 + }, + { + "epoch": 0.10875667345478562, + "grad_norm": 0.8373032808303833, + "learning_rate": 9.934018909748443e-06, + "loss": 0.9003, + "step": 1976 + }, + { + "epoch": 0.10881171225714129, + "grad_norm": 0.8704653978347778, + "learning_rate": 9.93394870401206e-06, + "loss": 0.8926, + "step": 1977 + }, + { + "epoch": 0.10886675105949695, + "grad_norm": 0.8088163733482361, + "learning_rate": 9.933878461193437e-06, + "loss": 0.8059, + "step": 1978 + }, + { + "epoch": 0.10892178986185261, + "grad_norm": 0.856421947479248, + "learning_rate": 9.933808181293108e-06, + "loss": 0.8447, + "step": 1979 + }, + { + "epoch": 0.10897682866420827, + "grad_norm": 0.9676237106323242, + "learning_rate": 9.933737864311595e-06, + "loss": 0.9009, + "step": 1980 + }, + { + "epoch": 0.10903186746656393, + "grad_norm": 0.7955103516578674, + "learning_rate": 9.933667510249428e-06, + "loss": 0.881, + "step": 1981 + }, + { + "epoch": 0.10908690626891959, + "grad_norm": 0.7935854196548462, + "learning_rate": 9.933597119107136e-06, + "loss": 0.8773, + "step": 1982 + }, + { + "epoch": 0.10914194507127525, + "grad_norm": 0.7726008296012878, + "learning_rate": 9.933526690885251e-06, + "loss": 0.8133, + "step": 1983 + }, + { + "epoch": 0.10919698387363092, + "grad_norm": 0.8577712178230286, + "learning_rate": 9.9334562255843e-06, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.10925202267598658, + "grad_norm": 0.9996447563171387, + "learning_rate": 9.933385723204812e-06, + "loss": 0.7312, + "step": 1985 + }, + { + "epoch": 0.10930706147834224, + "grad_norm": 0.9600629806518555, + "learning_rate": 9.933315183747318e-06, + "loss": 0.8792, + "step": 1986 + }, + { + "epoch": 0.10936210028069789, + "grad_norm": 0.9126206636428833, + "learning_rate": 9.933244607212347e-06, + "loss": 1.0023, + "step": 1987 + }, + { + "epoch": 0.10941713908305355, + "grad_norm": 0.774153470993042, + "learning_rate": 9.93317399360043e-06, + "loss": 0.7877, + "step": 1988 + }, + { + "epoch": 0.10947217788540921, + "grad_norm": 0.848495364189148, + "learning_rate": 9.933103342912096e-06, + "loss": 0.8825, + "step": 1989 + }, + { + "epoch": 0.10952721668776487, + "grad_norm": 0.806408166885376, + "learning_rate": 9.933032655147881e-06, + "loss": 0.7389, + "step": 1990 + }, + { + "epoch": 0.10958225549012053, + "grad_norm": 0.8579222559928894, + "learning_rate": 9.932961930308312e-06, + "loss": 0.8283, + "step": 1991 + }, + { + "epoch": 0.10963729429247619, + "grad_norm": 0.7548109292984009, + "learning_rate": 9.93289116839392e-06, + "loss": 0.7971, + "step": 1992 + }, + { + "epoch": 0.10969233309483185, + "grad_norm": 0.7954711318016052, + "learning_rate": 9.93282036940524e-06, + "loss": 0.849, + "step": 1993 + }, + { + "epoch": 0.10974737189718752, + "grad_norm": 0.7911425232887268, + "learning_rate": 9.932749533342802e-06, + "loss": 0.86, + "step": 1994 + }, + { + "epoch": 0.10980241069954318, + "grad_norm": 0.8505094051361084, + "learning_rate": 9.932678660207141e-06, + "loss": 0.7871, + "step": 1995 + }, + { + "epoch": 0.10985744950189884, + "grad_norm": 0.809612512588501, + "learning_rate": 9.932607749998784e-06, + "loss": 0.8337, + "step": 1996 + }, + { + "epoch": 0.1099124883042545, + "grad_norm": 0.738523006439209, + "learning_rate": 9.93253680271827e-06, + "loss": 0.7634, + "step": 1997 + }, + { + "epoch": 0.10996752710661016, + "grad_norm": 0.8434372544288635, + "learning_rate": 9.932465818366128e-06, + "loss": 0.7987, + "step": 1998 + }, + { + "epoch": 0.11002256590896582, + "grad_norm": 0.8068081140518188, + "learning_rate": 9.932394796942895e-06, + "loss": 0.9496, + "step": 1999 + }, + { + "epoch": 0.11007760471132148, + "grad_norm": 0.754342794418335, + "learning_rate": 9.932323738449103e-06, + "loss": 0.7355, + "step": 2000 + }, + { + "epoch": 0.11013264351367714, + "grad_norm": 0.8830806612968445, + "learning_rate": 9.932252642885285e-06, + "loss": 0.8458, + "step": 2001 + }, + { + "epoch": 0.1101876823160328, + "grad_norm": 0.9915485978126526, + "learning_rate": 9.932181510251977e-06, + "loss": 0.8116, + "step": 2002 + }, + { + "epoch": 0.11024272111838847, + "grad_norm": 0.858368992805481, + "learning_rate": 9.932110340549712e-06, + "loss": 0.8354, + "step": 2003 + }, + { + "epoch": 0.11029775992074413, + "grad_norm": 0.8591521382331848, + "learning_rate": 9.932039133779028e-06, + "loss": 0.8316, + "step": 2004 + }, + { + "epoch": 0.11035279872309979, + "grad_norm": 0.8714838624000549, + "learning_rate": 9.931967889940455e-06, + "loss": 0.8106, + "step": 2005 + }, + { + "epoch": 0.11040783752545545, + "grad_norm": 0.8082797527313232, + "learning_rate": 9.931896609034534e-06, + "loss": 0.7762, + "step": 2006 + }, + { + "epoch": 0.11046287632781111, + "grad_norm": 0.9226199984550476, + "learning_rate": 9.931825291061797e-06, + "loss": 0.8641, + "step": 2007 + }, + { + "epoch": 0.11051791513016677, + "grad_norm": 0.8883050680160522, + "learning_rate": 9.931753936022783e-06, + "loss": 0.9014, + "step": 2008 + }, + { + "epoch": 0.11057295393252242, + "grad_norm": 0.9024807810783386, + "learning_rate": 9.931682543918024e-06, + "loss": 0.9085, + "step": 2009 + }, + { + "epoch": 0.11062799273487808, + "grad_norm": 0.8381460905075073, + "learning_rate": 9.931611114748062e-06, + "loss": 0.8043, + "step": 2010 + }, + { + "epoch": 0.11068303153723374, + "grad_norm": 1.1222339868545532, + "learning_rate": 9.931539648513429e-06, + "loss": 0.8388, + "step": 2011 + }, + { + "epoch": 0.1107380703395894, + "grad_norm": 0.9710868000984192, + "learning_rate": 9.931468145214665e-06, + "loss": 0.8934, + "step": 2012 + }, + { + "epoch": 0.11079310914194507, + "grad_norm": 0.9821141958236694, + "learning_rate": 9.931396604852304e-06, + "loss": 0.931, + "step": 2013 + }, + { + "epoch": 0.11084814794430073, + "grad_norm": 1.0658717155456543, + "learning_rate": 9.931325027426889e-06, + "loss": 0.9032, + "step": 2014 + }, + { + "epoch": 0.11090318674665639, + "grad_norm": 0.8836946487426758, + "learning_rate": 9.931253412938956e-06, + "loss": 0.9131, + "step": 2015 + }, + { + "epoch": 0.11095822554901205, + "grad_norm": 0.8438361883163452, + "learning_rate": 9.93118176138904e-06, + "loss": 0.8674, + "step": 2016 + }, + { + "epoch": 0.11101326435136771, + "grad_norm": 0.928142786026001, + "learning_rate": 9.93111007277768e-06, + "loss": 0.8882, + "step": 2017 + }, + { + "epoch": 0.11106830315372337, + "grad_norm": 0.9176276922225952, + "learning_rate": 9.93103834710542e-06, + "loss": 0.8904, + "step": 2018 + }, + { + "epoch": 0.11112334195607904, + "grad_norm": 1.0462889671325684, + "learning_rate": 9.930966584372795e-06, + "loss": 0.8029, + "step": 2019 + }, + { + "epoch": 0.1111783807584347, + "grad_norm": 0.7627375721931458, + "learning_rate": 9.930894784580344e-06, + "loss": 0.8474, + "step": 2020 + }, + { + "epoch": 0.11123341956079036, + "grad_norm": 1.0545588731765747, + "learning_rate": 9.93082294772861e-06, + "loss": 0.7985, + "step": 2021 + }, + { + "epoch": 0.11128845836314602, + "grad_norm": 0.9752298593521118, + "learning_rate": 9.93075107381813e-06, + "loss": 0.8725, + "step": 2022 + }, + { + "epoch": 0.11134349716550168, + "grad_norm": 0.8403159379959106, + "learning_rate": 9.930679162849444e-06, + "loss": 0.8854, + "step": 2023 + }, + { + "epoch": 0.11139853596785734, + "grad_norm": 0.8879380226135254, + "learning_rate": 9.930607214823094e-06, + "loss": 0.7269, + "step": 2024 + }, + { + "epoch": 0.111453574770213, + "grad_norm": 0.907256543636322, + "learning_rate": 9.930535229739618e-06, + "loss": 0.8145, + "step": 2025 + }, + { + "epoch": 0.11150861357256867, + "grad_norm": 1.1066968441009521, + "learning_rate": 9.93046320759956e-06, + "loss": 0.9281, + "step": 2026 + }, + { + "epoch": 0.11156365237492433, + "grad_norm": 0.9226258397102356, + "learning_rate": 9.930391148403462e-06, + "loss": 0.9048, + "step": 2027 + }, + { + "epoch": 0.11161869117727999, + "grad_norm": 0.9652156829833984, + "learning_rate": 9.930319052151862e-06, + "loss": 0.9321, + "step": 2028 + }, + { + "epoch": 0.11167372997963565, + "grad_norm": 0.9102638363838196, + "learning_rate": 9.930246918845305e-06, + "loss": 0.8169, + "step": 2029 + }, + { + "epoch": 0.1117287687819913, + "grad_norm": 0.7765716314315796, + "learning_rate": 9.93017474848433e-06, + "loss": 0.7691, + "step": 2030 + }, + { + "epoch": 0.11178380758434696, + "grad_norm": 0.9053775072097778, + "learning_rate": 9.930102541069484e-06, + "loss": 0.782, + "step": 2031 + }, + { + "epoch": 0.11183884638670262, + "grad_norm": 0.8892827033996582, + "learning_rate": 9.930030296601306e-06, + "loss": 0.8575, + "step": 2032 + }, + { + "epoch": 0.11189388518905828, + "grad_norm": 0.8947604894638062, + "learning_rate": 9.929958015080339e-06, + "loss": 0.8607, + "step": 2033 + }, + { + "epoch": 0.11194892399141394, + "grad_norm": 0.8936871290206909, + "learning_rate": 9.929885696507127e-06, + "loss": 0.8111, + "step": 2034 + }, + { + "epoch": 0.1120039627937696, + "grad_norm": 0.9579165577888489, + "learning_rate": 9.929813340882214e-06, + "loss": 0.911, + "step": 2035 + }, + { + "epoch": 0.11205900159612527, + "grad_norm": 0.7885386347770691, + "learning_rate": 9.929740948206146e-06, + "loss": 0.8074, + "step": 2036 + }, + { + "epoch": 0.11211404039848093, + "grad_norm": 0.817939281463623, + "learning_rate": 9.929668518479462e-06, + "loss": 0.8451, + "step": 2037 + }, + { + "epoch": 0.11216907920083659, + "grad_norm": 0.8695761561393738, + "learning_rate": 9.92959605170271e-06, + "loss": 0.7158, + "step": 2038 + }, + { + "epoch": 0.11222411800319225, + "grad_norm": 0.8569639325141907, + "learning_rate": 9.929523547876433e-06, + "loss": 0.8568, + "step": 2039 + }, + { + "epoch": 0.11227915680554791, + "grad_norm": 0.8569897413253784, + "learning_rate": 9.929451007001176e-06, + "loss": 0.8971, + "step": 2040 + }, + { + "epoch": 0.11233419560790357, + "grad_norm": 0.8520069718360901, + "learning_rate": 9.929378429077487e-06, + "loss": 0.9027, + "step": 2041 + }, + { + "epoch": 0.11238923441025923, + "grad_norm": 0.9338961839675903, + "learning_rate": 9.929305814105907e-06, + "loss": 0.8646, + "step": 2042 + }, + { + "epoch": 0.1124442732126149, + "grad_norm": 0.8497192859649658, + "learning_rate": 9.929233162086985e-06, + "loss": 0.9068, + "step": 2043 + }, + { + "epoch": 0.11249931201497056, + "grad_norm": 0.8570863008499146, + "learning_rate": 9.929160473021267e-06, + "loss": 0.962, + "step": 2044 + }, + { + "epoch": 0.11255435081732622, + "grad_norm": 0.9072359800338745, + "learning_rate": 9.929087746909296e-06, + "loss": 0.8454, + "step": 2045 + }, + { + "epoch": 0.11260938961968188, + "grad_norm": 0.7920698523521423, + "learning_rate": 9.929014983751623e-06, + "loss": 0.8031, + "step": 2046 + }, + { + "epoch": 0.11266442842203754, + "grad_norm": 1.0180169343948364, + "learning_rate": 9.928942183548791e-06, + "loss": 0.7759, + "step": 2047 + }, + { + "epoch": 0.1127194672243932, + "grad_norm": 0.8746892809867859, + "learning_rate": 9.928869346301351e-06, + "loss": 0.9038, + "step": 2048 + }, + { + "epoch": 0.11277450602674886, + "grad_norm": 0.8283438086509705, + "learning_rate": 9.928796472009846e-06, + "loss": 0.8883, + "step": 2049 + }, + { + "epoch": 0.11282954482910452, + "grad_norm": 1.321917176246643, + "learning_rate": 9.928723560674828e-06, + "loss": 0.835, + "step": 2050 + }, + { + "epoch": 0.11288458363146017, + "grad_norm": 0.9356202483177185, + "learning_rate": 9.928650612296841e-06, + "loss": 0.8077, + "step": 2051 + }, + { + "epoch": 0.11293962243381583, + "grad_norm": 0.8493767380714417, + "learning_rate": 9.928577626876439e-06, + "loss": 0.8295, + "step": 2052 + }, + { + "epoch": 0.1129946612361715, + "grad_norm": 0.784818708896637, + "learning_rate": 9.928504604414164e-06, + "loss": 0.8322, + "step": 2053 + }, + { + "epoch": 0.11304970003852716, + "grad_norm": 0.9095364809036255, + "learning_rate": 9.928431544910567e-06, + "loss": 0.8757, + "step": 2054 + }, + { + "epoch": 0.11310473884088282, + "grad_norm": 0.8889689445495605, + "learning_rate": 9.9283584483662e-06, + "loss": 0.8583, + "step": 2055 + }, + { + "epoch": 0.11315977764323848, + "grad_norm": 0.8702652454376221, + "learning_rate": 9.928285314781607e-06, + "loss": 0.8414, + "step": 2056 + }, + { + "epoch": 0.11321481644559414, + "grad_norm": 0.8531168699264526, + "learning_rate": 9.928212144157342e-06, + "loss": 0.7844, + "step": 2057 + }, + { + "epoch": 0.1132698552479498, + "grad_norm": 1.0250271558761597, + "learning_rate": 9.928138936493956e-06, + "loss": 0.8766, + "step": 2058 + }, + { + "epoch": 0.11332489405030546, + "grad_norm": 0.7963449358940125, + "learning_rate": 9.928065691791996e-06, + "loss": 0.8166, + "step": 2059 + }, + { + "epoch": 0.11337993285266112, + "grad_norm": 1.1033011674880981, + "learning_rate": 9.927992410052013e-06, + "loss": 0.8748, + "step": 2060 + }, + { + "epoch": 0.11343497165501679, + "grad_norm": 0.8760959506034851, + "learning_rate": 9.927919091274558e-06, + "loss": 0.8623, + "step": 2061 + }, + { + "epoch": 0.11349001045737245, + "grad_norm": 1.1783028841018677, + "learning_rate": 9.927845735460182e-06, + "loss": 0.9144, + "step": 2062 + }, + { + "epoch": 0.11354504925972811, + "grad_norm": 0.8868625164031982, + "learning_rate": 9.927772342609437e-06, + "loss": 0.8614, + "step": 2063 + }, + { + "epoch": 0.11360008806208377, + "grad_norm": 0.8784704804420471, + "learning_rate": 9.927698912722874e-06, + "loss": 0.7802, + "step": 2064 + }, + { + "epoch": 0.11365512686443943, + "grad_norm": 1.0090643167495728, + "learning_rate": 9.927625445801046e-06, + "loss": 0.8876, + "step": 2065 + }, + { + "epoch": 0.1137101656667951, + "grad_norm": 0.7624390721321106, + "learning_rate": 9.927551941844502e-06, + "loss": 0.794, + "step": 2066 + }, + { + "epoch": 0.11376520446915075, + "grad_norm": 0.7814189791679382, + "learning_rate": 9.927478400853798e-06, + "loss": 0.8176, + "step": 2067 + }, + { + "epoch": 0.11382024327150642, + "grad_norm": 0.876338541507721, + "learning_rate": 9.927404822829486e-06, + "loss": 0.8634, + "step": 2068 + }, + { + "epoch": 0.11387528207386208, + "grad_norm": 0.7931430339813232, + "learning_rate": 9.927331207772117e-06, + "loss": 0.8012, + "step": 2069 + }, + { + "epoch": 0.11393032087621774, + "grad_norm": 1.0064504146575928, + "learning_rate": 9.927257555682246e-06, + "loss": 0.8321, + "step": 2070 + }, + { + "epoch": 0.1139853596785734, + "grad_norm": 0.8233053684234619, + "learning_rate": 9.927183866560425e-06, + "loss": 0.8004, + "step": 2071 + }, + { + "epoch": 0.11404039848092906, + "grad_norm": 1.0106632709503174, + "learning_rate": 9.927110140407211e-06, + "loss": 0.8627, + "step": 2072 + }, + { + "epoch": 0.11409543728328471, + "grad_norm": 0.8262843489646912, + "learning_rate": 9.927036377223155e-06, + "loss": 0.737, + "step": 2073 + }, + { + "epoch": 0.11415047608564037, + "grad_norm": 0.9349029660224915, + "learning_rate": 9.926962577008813e-06, + "loss": 0.9049, + "step": 2074 + }, + { + "epoch": 0.11420551488799603, + "grad_norm": 0.8689929842948914, + "learning_rate": 9.926888739764739e-06, + "loss": 0.7858, + "step": 2075 + }, + { + "epoch": 0.1142605536903517, + "grad_norm": 0.8442347645759583, + "learning_rate": 9.926814865491487e-06, + "loss": 0.8145, + "step": 2076 + }, + { + "epoch": 0.11431559249270735, + "grad_norm": 0.9143397212028503, + "learning_rate": 9.926740954189615e-06, + "loss": 0.8025, + "step": 2077 + }, + { + "epoch": 0.11437063129506302, + "grad_norm": 1.293251395225525, + "learning_rate": 9.926667005859676e-06, + "loss": 1.0256, + "step": 2078 + }, + { + "epoch": 0.11442567009741868, + "grad_norm": 0.9661351442337036, + "learning_rate": 9.926593020502226e-06, + "loss": 0.991, + "step": 2079 + }, + { + "epoch": 0.11448070889977434, + "grad_norm": 0.8110861778259277, + "learning_rate": 9.926518998117823e-06, + "loss": 0.7129, + "step": 2080 + }, + { + "epoch": 0.11453574770213, + "grad_norm": 0.8351119160652161, + "learning_rate": 9.92644493870702e-06, + "loss": 0.8894, + "step": 2081 + }, + { + "epoch": 0.11459078650448566, + "grad_norm": 0.8492733240127563, + "learning_rate": 9.926370842270377e-06, + "loss": 0.8039, + "step": 2082 + }, + { + "epoch": 0.11464582530684132, + "grad_norm": 0.895353376865387, + "learning_rate": 9.92629670880845e-06, + "loss": 0.8743, + "step": 2083 + }, + { + "epoch": 0.11470086410919698, + "grad_norm": 0.7871271967887878, + "learning_rate": 9.926222538321795e-06, + "loss": 0.8426, + "step": 2084 + }, + { + "epoch": 0.11475590291155265, + "grad_norm": 0.8904643058776855, + "learning_rate": 9.92614833081097e-06, + "loss": 0.8454, + "step": 2085 + }, + { + "epoch": 0.11481094171390831, + "grad_norm": 0.9166308641433716, + "learning_rate": 9.926074086276532e-06, + "loss": 0.9162, + "step": 2086 + }, + { + "epoch": 0.11486598051626397, + "grad_norm": 0.8730728626251221, + "learning_rate": 9.92599980471904e-06, + "loss": 0.8524, + "step": 2087 + }, + { + "epoch": 0.11492101931861963, + "grad_norm": 0.7932829260826111, + "learning_rate": 9.925925486139052e-06, + "loss": 0.7838, + "step": 2088 + }, + { + "epoch": 0.11497605812097529, + "grad_norm": 1.0033760070800781, + "learning_rate": 9.925851130537127e-06, + "loss": 0.8746, + "step": 2089 + }, + { + "epoch": 0.11503109692333095, + "grad_norm": 0.7783192992210388, + "learning_rate": 9.925776737913823e-06, + "loss": 0.7308, + "step": 2090 + }, + { + "epoch": 0.11508613572568661, + "grad_norm": 0.8441587686538696, + "learning_rate": 9.925702308269702e-06, + "loss": 0.7933, + "step": 2091 + }, + { + "epoch": 0.11514117452804228, + "grad_norm": 0.9433023929595947, + "learning_rate": 9.925627841605319e-06, + "loss": 0.7857, + "step": 2092 + }, + { + "epoch": 0.11519621333039794, + "grad_norm": 0.8958256244659424, + "learning_rate": 9.925553337921235e-06, + "loss": 0.9116, + "step": 2093 + }, + { + "epoch": 0.11525125213275358, + "grad_norm": 0.7610845565795898, + "learning_rate": 9.925478797218011e-06, + "loss": 0.8006, + "step": 2094 + }, + { + "epoch": 0.11530629093510925, + "grad_norm": 0.7977023720741272, + "learning_rate": 9.925404219496207e-06, + "loss": 0.8068, + "step": 2095 + }, + { + "epoch": 0.11536132973746491, + "grad_norm": 0.8087283372879028, + "learning_rate": 9.925329604756383e-06, + "loss": 0.7968, + "step": 2096 + }, + { + "epoch": 0.11541636853982057, + "grad_norm": 1.1066477298736572, + "learning_rate": 9.925254952999102e-06, + "loss": 0.8167, + "step": 2097 + }, + { + "epoch": 0.11547140734217623, + "grad_norm": 0.7806832194328308, + "learning_rate": 9.925180264224921e-06, + "loss": 0.8069, + "step": 2098 + }, + { + "epoch": 0.11552644614453189, + "grad_norm": 0.7745190858840942, + "learning_rate": 9.925105538434406e-06, + "loss": 0.7968, + "step": 2099 + }, + { + "epoch": 0.11558148494688755, + "grad_norm": 0.9045543074607849, + "learning_rate": 9.925030775628113e-06, + "loss": 0.8417, + "step": 2100 + }, + { + "epoch": 0.11563652374924321, + "grad_norm": 1.2962623834609985, + "learning_rate": 9.924955975806608e-06, + "loss": 0.8162, + "step": 2101 + }, + { + "epoch": 0.11569156255159888, + "grad_norm": 0.8571485877037048, + "learning_rate": 9.924881138970453e-06, + "loss": 0.8581, + "step": 2102 + }, + { + "epoch": 0.11574660135395454, + "grad_norm": 0.8326650857925415, + "learning_rate": 9.92480626512021e-06, + "loss": 0.8438, + "step": 2103 + }, + { + "epoch": 0.1158016401563102, + "grad_norm": 0.7973701357841492, + "learning_rate": 9.924731354256441e-06, + "loss": 0.8337, + "step": 2104 + }, + { + "epoch": 0.11585667895866586, + "grad_norm": 0.8614075779914856, + "learning_rate": 9.924656406379708e-06, + "loss": 0.8275, + "step": 2105 + }, + { + "epoch": 0.11591171776102152, + "grad_norm": 0.7911350131034851, + "learning_rate": 9.924581421490577e-06, + "loss": 0.8032, + "step": 2106 + }, + { + "epoch": 0.11596675656337718, + "grad_norm": 0.8763116598129272, + "learning_rate": 9.92450639958961e-06, + "loss": 0.8725, + "step": 2107 + }, + { + "epoch": 0.11602179536573284, + "grad_norm": 0.9754133224487305, + "learning_rate": 9.92443134067737e-06, + "loss": 0.9115, + "step": 2108 + }, + { + "epoch": 0.1160768341680885, + "grad_norm": 0.7783731818199158, + "learning_rate": 9.924356244754425e-06, + "loss": 0.8223, + "step": 2109 + }, + { + "epoch": 0.11613187297044417, + "grad_norm": 0.865301787853241, + "learning_rate": 9.924281111821335e-06, + "loss": 0.8053, + "step": 2110 + }, + { + "epoch": 0.11618691177279983, + "grad_norm": 0.8654297590255737, + "learning_rate": 9.924205941878666e-06, + "loss": 0.716, + "step": 2111 + }, + { + "epoch": 0.11624195057515549, + "grad_norm": 0.7646550536155701, + "learning_rate": 9.924130734926982e-06, + "loss": 0.8027, + "step": 2112 + }, + { + "epoch": 0.11629698937751115, + "grad_norm": 0.810587465763092, + "learning_rate": 9.924055490966851e-06, + "loss": 0.7416, + "step": 2113 + }, + { + "epoch": 0.11635202817986681, + "grad_norm": 0.8610082268714905, + "learning_rate": 9.923980209998838e-06, + "loss": 0.8527, + "step": 2114 + }, + { + "epoch": 0.11640706698222247, + "grad_norm": 0.8409233689308167, + "learning_rate": 9.923904892023506e-06, + "loss": 0.8169, + "step": 2115 + }, + { + "epoch": 0.11646210578457812, + "grad_norm": 0.7786587476730347, + "learning_rate": 9.923829537041425e-06, + "loss": 0.6897, + "step": 2116 + }, + { + "epoch": 0.11651714458693378, + "grad_norm": 0.852908730506897, + "learning_rate": 9.923754145053158e-06, + "loss": 0.7821, + "step": 2117 + }, + { + "epoch": 0.11657218338928944, + "grad_norm": 0.9130391478538513, + "learning_rate": 9.923678716059273e-06, + "loss": 1.0377, + "step": 2118 + }, + { + "epoch": 0.1166272221916451, + "grad_norm": 0.8371701240539551, + "learning_rate": 9.923603250060336e-06, + "loss": 0.8312, + "step": 2119 + }, + { + "epoch": 0.11668226099400077, + "grad_norm": 0.8045756220817566, + "learning_rate": 9.923527747056916e-06, + "loss": 0.7971, + "step": 2120 + }, + { + "epoch": 0.11673729979635643, + "grad_norm": 0.8832160234451294, + "learning_rate": 9.923452207049577e-06, + "loss": 0.7362, + "step": 2121 + }, + { + "epoch": 0.11679233859871209, + "grad_norm": 0.8253088593482971, + "learning_rate": 9.923376630038893e-06, + "loss": 0.8177, + "step": 2122 + }, + { + "epoch": 0.11684737740106775, + "grad_norm": 0.7953168749809265, + "learning_rate": 9.923301016025424e-06, + "loss": 0.7053, + "step": 2123 + }, + { + "epoch": 0.11690241620342341, + "grad_norm": 0.7256457805633545, + "learning_rate": 9.923225365009745e-06, + "loss": 0.7554, + "step": 2124 + }, + { + "epoch": 0.11695745500577907, + "grad_norm": 0.9896693229675293, + "learning_rate": 9.923149676992424e-06, + "loss": 0.8285, + "step": 2125 + }, + { + "epoch": 0.11701249380813473, + "grad_norm": 0.7846312522888184, + "learning_rate": 9.923073951974023e-06, + "loss": 0.7527, + "step": 2126 + }, + { + "epoch": 0.1170675326104904, + "grad_norm": 0.8949825167655945, + "learning_rate": 9.92299818995512e-06, + "loss": 0.8545, + "step": 2127 + }, + { + "epoch": 0.11712257141284606, + "grad_norm": 1.0023548603057861, + "learning_rate": 9.922922390936278e-06, + "loss": 0.7668, + "step": 2128 + }, + { + "epoch": 0.11717761021520172, + "grad_norm": 0.8663881421089172, + "learning_rate": 9.92284655491807e-06, + "loss": 0.8073, + "step": 2129 + }, + { + "epoch": 0.11723264901755738, + "grad_norm": 0.8274385929107666, + "learning_rate": 9.922770681901064e-06, + "loss": 0.9002, + "step": 2130 + }, + { + "epoch": 0.11728768781991304, + "grad_norm": 0.8508959412574768, + "learning_rate": 9.922694771885832e-06, + "loss": 0.9325, + "step": 2131 + }, + { + "epoch": 0.1173427266222687, + "grad_norm": 0.8176792860031128, + "learning_rate": 9.922618824872946e-06, + "loss": 0.8415, + "step": 2132 + }, + { + "epoch": 0.11739776542462436, + "grad_norm": 0.770951509475708, + "learning_rate": 9.922542840862971e-06, + "loss": 0.8051, + "step": 2133 + }, + { + "epoch": 0.11745280422698003, + "grad_norm": 0.8558167219161987, + "learning_rate": 9.922466819856484e-06, + "loss": 0.85, + "step": 2134 + }, + { + "epoch": 0.11750784302933569, + "grad_norm": 0.8288151025772095, + "learning_rate": 9.922390761854053e-06, + "loss": 0.8141, + "step": 2135 + }, + { + "epoch": 0.11756288183169135, + "grad_norm": 0.8220882415771484, + "learning_rate": 9.922314666856252e-06, + "loss": 0.8109, + "step": 2136 + }, + { + "epoch": 0.117617920634047, + "grad_norm": 0.7875000238418579, + "learning_rate": 9.92223853486365e-06, + "loss": 0.9085, + "step": 2137 + }, + { + "epoch": 0.11767295943640266, + "grad_norm": 0.8052374124526978, + "learning_rate": 9.922162365876822e-06, + "loss": 0.8785, + "step": 2138 + }, + { + "epoch": 0.11772799823875832, + "grad_norm": 1.0311180353164673, + "learning_rate": 9.922086159896338e-06, + "loss": 0.9112, + "step": 2139 + }, + { + "epoch": 0.11778303704111398, + "grad_norm": 0.943911075592041, + "learning_rate": 9.922009916922773e-06, + "loss": 0.8332, + "step": 2140 + }, + { + "epoch": 0.11783807584346964, + "grad_norm": 0.8156648278236389, + "learning_rate": 9.921933636956697e-06, + "loss": 0.8837, + "step": 2141 + }, + { + "epoch": 0.1178931146458253, + "grad_norm": 0.860292375087738, + "learning_rate": 9.921857319998688e-06, + "loss": 0.7963, + "step": 2142 + }, + { + "epoch": 0.11794815344818096, + "grad_norm": 0.8861456513404846, + "learning_rate": 9.921780966049315e-06, + "loss": 0.8335, + "step": 2143 + }, + { + "epoch": 0.11800319225053663, + "grad_norm": 0.793533205986023, + "learning_rate": 9.921704575109155e-06, + "loss": 0.7881, + "step": 2144 + }, + { + "epoch": 0.11805823105289229, + "grad_norm": 0.8039320111274719, + "learning_rate": 9.921628147178781e-06, + "loss": 0.8369, + "step": 2145 + }, + { + "epoch": 0.11811326985524795, + "grad_norm": 0.8785450458526611, + "learning_rate": 9.921551682258765e-06, + "loss": 0.7981, + "step": 2146 + }, + { + "epoch": 0.11816830865760361, + "grad_norm": 0.810251772403717, + "learning_rate": 9.921475180349687e-06, + "loss": 0.7926, + "step": 2147 + }, + { + "epoch": 0.11822334745995927, + "grad_norm": 0.8470801115036011, + "learning_rate": 9.921398641452117e-06, + "loss": 0.8061, + "step": 2148 + }, + { + "epoch": 0.11827838626231493, + "grad_norm": 0.8147469162940979, + "learning_rate": 9.921322065566633e-06, + "loss": 0.7906, + "step": 2149 + }, + { + "epoch": 0.1183334250646706, + "grad_norm": 0.8792327046394348, + "learning_rate": 9.92124545269381e-06, + "loss": 0.9025, + "step": 2150 + }, + { + "epoch": 0.11838846386702626, + "grad_norm": 0.794607400894165, + "learning_rate": 9.921168802834223e-06, + "loss": 0.8284, + "step": 2151 + }, + { + "epoch": 0.11844350266938192, + "grad_norm": 0.8601556420326233, + "learning_rate": 9.921092115988447e-06, + "loss": 0.8196, + "step": 2152 + }, + { + "epoch": 0.11849854147173758, + "grad_norm": 0.786967933177948, + "learning_rate": 9.921015392157062e-06, + "loss": 0.8744, + "step": 2153 + }, + { + "epoch": 0.11855358027409324, + "grad_norm": 0.8481432199478149, + "learning_rate": 9.920938631340641e-06, + "loss": 0.7206, + "step": 2154 + }, + { + "epoch": 0.1186086190764489, + "grad_norm": 0.8025142550468445, + "learning_rate": 9.920861833539765e-06, + "loss": 0.8126, + "step": 2155 + }, + { + "epoch": 0.11866365787880456, + "grad_norm": 0.9853057265281677, + "learning_rate": 9.920784998755006e-06, + "loss": 0.8883, + "step": 2156 + }, + { + "epoch": 0.11871869668116022, + "grad_norm": 1.0008476972579956, + "learning_rate": 9.920708126986947e-06, + "loss": 0.9326, + "step": 2157 + }, + { + "epoch": 0.11877373548351589, + "grad_norm": 0.837347686290741, + "learning_rate": 9.920631218236161e-06, + "loss": 0.9002, + "step": 2158 + }, + { + "epoch": 0.11882877428587153, + "grad_norm": 0.7866735458374023, + "learning_rate": 9.920554272503227e-06, + "loss": 0.765, + "step": 2159 + }, + { + "epoch": 0.1188838130882272, + "grad_norm": 0.8714935779571533, + "learning_rate": 9.920477289788726e-06, + "loss": 1.0294, + "step": 2160 + }, + { + "epoch": 0.11893885189058286, + "grad_norm": 1.0671826601028442, + "learning_rate": 9.920400270093234e-06, + "loss": 0.8341, + "step": 2161 + }, + { + "epoch": 0.11899389069293852, + "grad_norm": 0.8594604134559631, + "learning_rate": 9.92032321341733e-06, + "loss": 0.8731, + "step": 2162 + }, + { + "epoch": 0.11904892949529418, + "grad_norm": 0.8387738466262817, + "learning_rate": 9.920246119761597e-06, + "loss": 0.7898, + "step": 2163 + }, + { + "epoch": 0.11910396829764984, + "grad_norm": 0.8957195281982422, + "learning_rate": 9.920168989126608e-06, + "loss": 0.8475, + "step": 2164 + }, + { + "epoch": 0.1191590071000055, + "grad_norm": 0.8224207162857056, + "learning_rate": 9.920091821512948e-06, + "loss": 0.7944, + "step": 2165 + }, + { + "epoch": 0.11921404590236116, + "grad_norm": 1.0309031009674072, + "learning_rate": 9.920014616921192e-06, + "loss": 0.8992, + "step": 2166 + }, + { + "epoch": 0.11926908470471682, + "grad_norm": 0.7300832271575928, + "learning_rate": 9.919937375351925e-06, + "loss": 0.7016, + "step": 2167 + }, + { + "epoch": 0.11932412350707249, + "grad_norm": 0.7565537691116333, + "learning_rate": 9.919860096805724e-06, + "loss": 0.8113, + "step": 2168 + }, + { + "epoch": 0.11937916230942815, + "grad_norm": 1.0101505517959595, + "learning_rate": 9.919782781283174e-06, + "loss": 0.8765, + "step": 2169 + }, + { + "epoch": 0.11943420111178381, + "grad_norm": 0.8369461894035339, + "learning_rate": 9.919705428784852e-06, + "loss": 0.8248, + "step": 2170 + }, + { + "epoch": 0.11948923991413947, + "grad_norm": 0.8106105327606201, + "learning_rate": 9.919628039311342e-06, + "loss": 0.8585, + "step": 2171 + }, + { + "epoch": 0.11954427871649513, + "grad_norm": 0.7863745093345642, + "learning_rate": 9.919550612863224e-06, + "loss": 0.8393, + "step": 2172 + }, + { + "epoch": 0.11959931751885079, + "grad_norm": 0.8664719462394714, + "learning_rate": 9.919473149441081e-06, + "loss": 0.8882, + "step": 2173 + }, + { + "epoch": 0.11965435632120645, + "grad_norm": 0.6977574825286865, + "learning_rate": 9.919395649045494e-06, + "loss": 0.7264, + "step": 2174 + }, + { + "epoch": 0.11970939512356212, + "grad_norm": 0.8000102639198303, + "learning_rate": 9.919318111677045e-06, + "loss": 0.7828, + "step": 2175 + }, + { + "epoch": 0.11976443392591778, + "grad_norm": 0.868228018283844, + "learning_rate": 9.91924053733632e-06, + "loss": 0.7904, + "step": 2176 + }, + { + "epoch": 0.11981947272827344, + "grad_norm": 0.839080274105072, + "learning_rate": 9.9191629260239e-06, + "loss": 0.7663, + "step": 2177 + }, + { + "epoch": 0.1198745115306291, + "grad_norm": 0.8222747445106506, + "learning_rate": 9.919085277740366e-06, + "loss": 0.7208, + "step": 2178 + }, + { + "epoch": 0.11992955033298476, + "grad_norm": 1.4550986289978027, + "learning_rate": 9.919007592486304e-06, + "loss": 0.8154, + "step": 2179 + }, + { + "epoch": 0.11998458913534041, + "grad_norm": 0.9110257625579834, + "learning_rate": 9.9189298702623e-06, + "loss": 0.8134, + "step": 2180 + }, + { + "epoch": 0.12003962793769607, + "grad_norm": 0.84796142578125, + "learning_rate": 9.918852111068935e-06, + "loss": 0.8074, + "step": 2181 + }, + { + "epoch": 0.12009466674005173, + "grad_norm": 0.8134179711341858, + "learning_rate": 9.918774314906793e-06, + "loss": 0.6335, + "step": 2182 + }, + { + "epoch": 0.12014970554240739, + "grad_norm": 0.8481448888778687, + "learning_rate": 9.918696481776461e-06, + "loss": 0.8804, + "step": 2183 + }, + { + "epoch": 0.12020474434476305, + "grad_norm": 0.88057941198349, + "learning_rate": 9.918618611678523e-06, + "loss": 0.9326, + "step": 2184 + }, + { + "epoch": 0.12025978314711872, + "grad_norm": 0.8435977697372437, + "learning_rate": 9.918540704613564e-06, + "loss": 0.8141, + "step": 2185 + }, + { + "epoch": 0.12031482194947438, + "grad_norm": 0.8186982870101929, + "learning_rate": 9.918462760582169e-06, + "loss": 0.837, + "step": 2186 + }, + { + "epoch": 0.12036986075183004, + "grad_norm": 0.887783944606781, + "learning_rate": 9.918384779584924e-06, + "loss": 0.8062, + "step": 2187 + }, + { + "epoch": 0.1204248995541857, + "grad_norm": 0.9368415474891663, + "learning_rate": 9.918306761622417e-06, + "loss": 1.0098, + "step": 2188 + }, + { + "epoch": 0.12047993835654136, + "grad_norm": 0.8443986773490906, + "learning_rate": 9.918228706695232e-06, + "loss": 0.8178, + "step": 2189 + }, + { + "epoch": 0.12053497715889702, + "grad_norm": 0.7897284626960754, + "learning_rate": 9.918150614803956e-06, + "loss": 0.8013, + "step": 2190 + }, + { + "epoch": 0.12059001596125268, + "grad_norm": 0.886012077331543, + "learning_rate": 9.91807248594918e-06, + "loss": 0.8141, + "step": 2191 + }, + { + "epoch": 0.12064505476360834, + "grad_norm": 0.8585757613182068, + "learning_rate": 9.917994320131484e-06, + "loss": 0.8381, + "step": 2192 + }, + { + "epoch": 0.120700093565964, + "grad_norm": 1.6192269325256348, + "learning_rate": 9.917916117351459e-06, + "loss": 0.9082, + "step": 2193 + }, + { + "epoch": 0.12075513236831967, + "grad_norm": 1.160414457321167, + "learning_rate": 9.917837877609695e-06, + "loss": 0.8673, + "step": 2194 + }, + { + "epoch": 0.12081017117067533, + "grad_norm": 0.8363412022590637, + "learning_rate": 9.917759600906775e-06, + "loss": 0.816, + "step": 2195 + }, + { + "epoch": 0.12086520997303099, + "grad_norm": 0.8344097137451172, + "learning_rate": 9.917681287243292e-06, + "loss": 0.8629, + "step": 2196 + }, + { + "epoch": 0.12092024877538665, + "grad_norm": 0.9817582368850708, + "learning_rate": 9.917602936619834e-06, + "loss": 0.8106, + "step": 2197 + }, + { + "epoch": 0.12097528757774231, + "grad_norm": 0.8828088641166687, + "learning_rate": 9.917524549036987e-06, + "loss": 0.8465, + "step": 2198 + }, + { + "epoch": 0.12103032638009797, + "grad_norm": 0.8428277969360352, + "learning_rate": 9.917446124495344e-06, + "loss": 0.7721, + "step": 2199 + }, + { + "epoch": 0.12108536518245364, + "grad_norm": 0.8748664855957031, + "learning_rate": 9.917367662995489e-06, + "loss": 0.8679, + "step": 2200 + }, + { + "epoch": 0.1211404039848093, + "grad_norm": 0.8652347922325134, + "learning_rate": 9.917289164538018e-06, + "loss": 0.8906, + "step": 2201 + }, + { + "epoch": 0.12119544278716494, + "grad_norm": 1.157142162322998, + "learning_rate": 9.917210629123518e-06, + "loss": 0.9046, + "step": 2202 + }, + { + "epoch": 0.1212504815895206, + "grad_norm": 0.8186333179473877, + "learning_rate": 9.917132056752576e-06, + "loss": 0.8494, + "step": 2203 + }, + { + "epoch": 0.12130552039187627, + "grad_norm": 0.7769078612327576, + "learning_rate": 9.917053447425788e-06, + "loss": 0.8018, + "step": 2204 + }, + { + "epoch": 0.12136055919423193, + "grad_norm": 0.9190469980239868, + "learning_rate": 9.916974801143742e-06, + "loss": 0.8206, + "step": 2205 + }, + { + "epoch": 0.12141559799658759, + "grad_norm": 1.2200725078582764, + "learning_rate": 9.91689611790703e-06, + "loss": 0.9109, + "step": 2206 + }, + { + "epoch": 0.12147063679894325, + "grad_norm": 0.7902093529701233, + "learning_rate": 9.916817397716243e-06, + "loss": 0.8314, + "step": 2207 + }, + { + "epoch": 0.12152567560129891, + "grad_norm": 0.8160610198974609, + "learning_rate": 9.91673864057197e-06, + "loss": 0.8605, + "step": 2208 + }, + { + "epoch": 0.12158071440365457, + "grad_norm": 0.833163857460022, + "learning_rate": 9.916659846474807e-06, + "loss": 0.8125, + "step": 2209 + }, + { + "epoch": 0.12163575320601024, + "grad_norm": 0.776314377784729, + "learning_rate": 9.916581015425346e-06, + "loss": 0.8137, + "step": 2210 + }, + { + "epoch": 0.1216907920083659, + "grad_norm": 0.8525915145874023, + "learning_rate": 9.916502147424178e-06, + "loss": 0.8703, + "step": 2211 + }, + { + "epoch": 0.12174583081072156, + "grad_norm": 0.8268684148788452, + "learning_rate": 9.916423242471895e-06, + "loss": 0.7775, + "step": 2212 + }, + { + "epoch": 0.12180086961307722, + "grad_norm": 0.8717706799507141, + "learning_rate": 9.916344300569091e-06, + "loss": 0.8002, + "step": 2213 + }, + { + "epoch": 0.12185590841543288, + "grad_norm": 0.9499961137771606, + "learning_rate": 9.91626532171636e-06, + "loss": 0.8861, + "step": 2214 + }, + { + "epoch": 0.12191094721778854, + "grad_norm": 0.9521885514259338, + "learning_rate": 9.916186305914296e-06, + "loss": 0.7602, + "step": 2215 + }, + { + "epoch": 0.1219659860201442, + "grad_norm": 0.8945447206497192, + "learning_rate": 9.916107253163488e-06, + "loss": 0.8603, + "step": 2216 + }, + { + "epoch": 0.12202102482249987, + "grad_norm": 0.8232392072677612, + "learning_rate": 9.916028163464536e-06, + "loss": 0.8419, + "step": 2217 + }, + { + "epoch": 0.12207606362485553, + "grad_norm": 0.8183467984199524, + "learning_rate": 9.915949036818032e-06, + "loss": 0.9038, + "step": 2218 + }, + { + "epoch": 0.12213110242721119, + "grad_norm": 0.7805467247962952, + "learning_rate": 9.915869873224571e-06, + "loss": 0.7313, + "step": 2219 + }, + { + "epoch": 0.12218614122956685, + "grad_norm": 0.838101327419281, + "learning_rate": 9.915790672684749e-06, + "loss": 0.7973, + "step": 2220 + }, + { + "epoch": 0.12224118003192251, + "grad_norm": 0.7795171141624451, + "learning_rate": 9.915711435199158e-06, + "loss": 0.7796, + "step": 2221 + }, + { + "epoch": 0.12229621883427817, + "grad_norm": 0.7971234917640686, + "learning_rate": 9.915632160768398e-06, + "loss": 0.8309, + "step": 2222 + }, + { + "epoch": 0.12235125763663382, + "grad_norm": 0.8543851375579834, + "learning_rate": 9.915552849393061e-06, + "loss": 0.7826, + "step": 2223 + }, + { + "epoch": 0.12240629643898948, + "grad_norm": 0.9315086007118225, + "learning_rate": 9.915473501073744e-06, + "loss": 0.9294, + "step": 2224 + }, + { + "epoch": 0.12246133524134514, + "grad_norm": 0.8794427514076233, + "learning_rate": 9.915394115811046e-06, + "loss": 0.8968, + "step": 2225 + }, + { + "epoch": 0.1225163740437008, + "grad_norm": 0.9499204754829407, + "learning_rate": 9.91531469360556e-06, + "loss": 0.9841, + "step": 2226 + }, + { + "epoch": 0.12257141284605647, + "grad_norm": 0.9233788251876831, + "learning_rate": 9.915235234457885e-06, + "loss": 0.7794, + "step": 2227 + }, + { + "epoch": 0.12262645164841213, + "grad_norm": 0.8971870541572571, + "learning_rate": 9.915155738368618e-06, + "loss": 0.919, + "step": 2228 + }, + { + "epoch": 0.12268149045076779, + "grad_norm": 0.8122105002403259, + "learning_rate": 9.915076205338356e-06, + "loss": 0.8227, + "step": 2229 + }, + { + "epoch": 0.12273652925312345, + "grad_norm": 0.7878004908561707, + "learning_rate": 9.914996635367696e-06, + "loss": 0.7622, + "step": 2230 + }, + { + "epoch": 0.12279156805547911, + "grad_norm": 0.8229606747627258, + "learning_rate": 9.914917028457238e-06, + "loss": 0.8265, + "step": 2231 + }, + { + "epoch": 0.12284660685783477, + "grad_norm": 0.8972312808036804, + "learning_rate": 9.914837384607578e-06, + "loss": 0.8914, + "step": 2232 + }, + { + "epoch": 0.12290164566019043, + "grad_norm": 0.762922465801239, + "learning_rate": 9.914757703819318e-06, + "loss": 0.6853, + "step": 2233 + }, + { + "epoch": 0.1229566844625461, + "grad_norm": 0.8949442505836487, + "learning_rate": 9.914677986093054e-06, + "loss": 0.8303, + "step": 2234 + }, + { + "epoch": 0.12301172326490176, + "grad_norm": 1.0220820903778076, + "learning_rate": 9.914598231429384e-06, + "loss": 1.0027, + "step": 2235 + }, + { + "epoch": 0.12306676206725742, + "grad_norm": 0.8265436887741089, + "learning_rate": 9.914518439828911e-06, + "loss": 0.8317, + "step": 2236 + }, + { + "epoch": 0.12312180086961308, + "grad_norm": 0.780444324016571, + "learning_rate": 9.914438611292231e-06, + "loss": 0.756, + "step": 2237 + }, + { + "epoch": 0.12317683967196874, + "grad_norm": 0.8569482564926147, + "learning_rate": 9.914358745819948e-06, + "loss": 0.8126, + "step": 2238 + }, + { + "epoch": 0.1232318784743244, + "grad_norm": 0.8167145848274231, + "learning_rate": 9.91427884341266e-06, + "loss": 0.8345, + "step": 2239 + }, + { + "epoch": 0.12328691727668006, + "grad_norm": 0.7915990948677063, + "learning_rate": 9.914198904070967e-06, + "loss": 0.7416, + "step": 2240 + }, + { + "epoch": 0.12334195607903573, + "grad_norm": 0.8568083047866821, + "learning_rate": 9.91411892779547e-06, + "loss": 0.8329, + "step": 2241 + }, + { + "epoch": 0.12339699488139139, + "grad_norm": 1.1727303266525269, + "learning_rate": 9.914038914586772e-06, + "loss": 0.8421, + "step": 2242 + }, + { + "epoch": 0.12345203368374705, + "grad_norm": 0.8706398010253906, + "learning_rate": 9.913958864445472e-06, + "loss": 0.9013, + "step": 2243 + }, + { + "epoch": 0.12350707248610271, + "grad_norm": 0.8376144170761108, + "learning_rate": 9.913878777372173e-06, + "loss": 0.8456, + "step": 2244 + }, + { + "epoch": 0.12356211128845836, + "grad_norm": 0.8388974070549011, + "learning_rate": 9.913798653367478e-06, + "loss": 0.787, + "step": 2245 + }, + { + "epoch": 0.12361715009081402, + "grad_norm": 0.8625446557998657, + "learning_rate": 9.913718492431984e-06, + "loss": 0.7758, + "step": 2246 + }, + { + "epoch": 0.12367218889316968, + "grad_norm": 0.8805570006370544, + "learning_rate": 9.913638294566299e-06, + "loss": 0.8755, + "step": 2247 + }, + { + "epoch": 0.12372722769552534, + "grad_norm": 0.8102611899375916, + "learning_rate": 9.913558059771025e-06, + "loss": 0.8495, + "step": 2248 + }, + { + "epoch": 0.123782266497881, + "grad_norm": 0.8506311774253845, + "learning_rate": 9.913477788046762e-06, + "loss": 0.7413, + "step": 2249 + }, + { + "epoch": 0.12383730530023666, + "grad_norm": 1.0789196491241455, + "learning_rate": 9.913397479394116e-06, + "loss": 0.8993, + "step": 2250 + }, + { + "epoch": 0.12389234410259232, + "grad_norm": 1.5664849281311035, + "learning_rate": 9.91331713381369e-06, + "loss": 0.8322, + "step": 2251 + }, + { + "epoch": 0.12394738290494799, + "grad_norm": 1.1347390413284302, + "learning_rate": 9.913236751306085e-06, + "loss": 0.8756, + "step": 2252 + }, + { + "epoch": 0.12400242170730365, + "grad_norm": 0.8111063241958618, + "learning_rate": 9.913156331871911e-06, + "loss": 0.831, + "step": 2253 + }, + { + "epoch": 0.12405746050965931, + "grad_norm": 0.817812979221344, + "learning_rate": 9.913075875511769e-06, + "loss": 0.8531, + "step": 2254 + }, + { + "epoch": 0.12411249931201497, + "grad_norm": 0.7678318619728088, + "learning_rate": 9.912995382226263e-06, + "loss": 0.8028, + "step": 2255 + }, + { + "epoch": 0.12416753811437063, + "grad_norm": 0.8207805156707764, + "learning_rate": 9.912914852015998e-06, + "loss": 0.8856, + "step": 2256 + }, + { + "epoch": 0.1242225769167263, + "grad_norm": 0.978484570980072, + "learning_rate": 9.912834284881582e-06, + "loss": 0.933, + "step": 2257 + }, + { + "epoch": 0.12427761571908195, + "grad_norm": 0.9215858578681946, + "learning_rate": 9.912753680823617e-06, + "loss": 0.7771, + "step": 2258 + }, + { + "epoch": 0.12433265452143762, + "grad_norm": 0.8542179465293884, + "learning_rate": 9.91267303984271e-06, + "loss": 0.8652, + "step": 2259 + }, + { + "epoch": 0.12438769332379328, + "grad_norm": 0.7985575199127197, + "learning_rate": 9.912592361939469e-06, + "loss": 0.7011, + "step": 2260 + }, + { + "epoch": 0.12444273212614894, + "grad_norm": 0.8868670463562012, + "learning_rate": 9.912511647114498e-06, + "loss": 0.8222, + "step": 2261 + }, + { + "epoch": 0.1244977709285046, + "grad_norm": 0.7966209650039673, + "learning_rate": 9.912430895368405e-06, + "loss": 0.776, + "step": 2262 + }, + { + "epoch": 0.12455280973086026, + "grad_norm": 0.7844830751419067, + "learning_rate": 9.912350106701796e-06, + "loss": 0.7513, + "step": 2263 + }, + { + "epoch": 0.12460784853321592, + "grad_norm": 0.7788559794425964, + "learning_rate": 9.912269281115278e-06, + "loss": 0.8517, + "step": 2264 + }, + { + "epoch": 0.12466288733557158, + "grad_norm": 0.778225839138031, + "learning_rate": 9.912188418609461e-06, + "loss": 0.7504, + "step": 2265 + }, + { + "epoch": 0.12471792613792723, + "grad_norm": 0.7955968976020813, + "learning_rate": 9.912107519184947e-06, + "loss": 0.8152, + "step": 2266 + }, + { + "epoch": 0.1247729649402829, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.912026582842352e-06, + "loss": 0.9325, + "step": 2267 + }, + { + "epoch": 0.12482800374263855, + "grad_norm": 0.9762749671936035, + "learning_rate": 9.911945609582279e-06, + "loss": 0.9027, + "step": 2268 + }, + { + "epoch": 0.12488304254499422, + "grad_norm": 0.8311051726341248, + "learning_rate": 9.911864599405336e-06, + "loss": 0.838, + "step": 2269 + }, + { + "epoch": 0.12493808134734988, + "grad_norm": 1.0136815309524536, + "learning_rate": 9.911783552312134e-06, + "loss": 0.9288, + "step": 2270 + }, + { + "epoch": 0.12499312014970554, + "grad_norm": 0.7960494160652161, + "learning_rate": 9.911702468303282e-06, + "loss": 0.8007, + "step": 2271 + }, + { + "epoch": 0.1250481589520612, + "grad_norm": 0.9980880618095398, + "learning_rate": 9.911621347379388e-06, + "loss": 0.8613, + "step": 2272 + }, + { + "epoch": 0.12510319775441686, + "grad_norm": 0.8916807770729065, + "learning_rate": 9.911540189541065e-06, + "loss": 0.8783, + "step": 2273 + }, + { + "epoch": 0.12515823655677252, + "grad_norm": 0.9455892443656921, + "learning_rate": 9.911458994788919e-06, + "loss": 0.8676, + "step": 2274 + }, + { + "epoch": 0.12521327535912818, + "grad_norm": 0.7649906277656555, + "learning_rate": 9.911377763123561e-06, + "loss": 0.7763, + "step": 2275 + }, + { + "epoch": 0.12526831416148385, + "grad_norm": 0.8971202373504639, + "learning_rate": 9.911296494545604e-06, + "loss": 0.9022, + "step": 2276 + }, + { + "epoch": 0.1253233529638395, + "grad_norm": 0.833678126335144, + "learning_rate": 9.911215189055657e-06, + "loss": 0.8401, + "step": 2277 + }, + { + "epoch": 0.12537839176619517, + "grad_norm": 0.8967958688735962, + "learning_rate": 9.911133846654331e-06, + "loss": 0.8678, + "step": 2278 + }, + { + "epoch": 0.12543343056855083, + "grad_norm": 0.8195546865463257, + "learning_rate": 9.911052467342239e-06, + "loss": 0.842, + "step": 2279 + }, + { + "epoch": 0.1254884693709065, + "grad_norm": 1.095815896987915, + "learning_rate": 9.910971051119988e-06, + "loss": 0.845, + "step": 2280 + }, + { + "epoch": 0.12554350817326215, + "grad_norm": 0.9452629685401917, + "learning_rate": 9.910889597988197e-06, + "loss": 0.8971, + "step": 2281 + }, + { + "epoch": 0.12559854697561781, + "grad_norm": 0.9872332215309143, + "learning_rate": 9.910808107947471e-06, + "loss": 0.7994, + "step": 2282 + }, + { + "epoch": 0.12565358577797348, + "grad_norm": 0.7761966586112976, + "learning_rate": 9.910726580998427e-06, + "loss": 0.7791, + "step": 2283 + }, + { + "epoch": 0.12570862458032914, + "grad_norm": 0.8950315713882446, + "learning_rate": 9.910645017141678e-06, + "loss": 0.8499, + "step": 2284 + }, + { + "epoch": 0.1257636633826848, + "grad_norm": 0.8796371221542358, + "learning_rate": 9.910563416377834e-06, + "loss": 0.8587, + "step": 2285 + }, + { + "epoch": 0.12581870218504046, + "grad_norm": 0.8291982412338257, + "learning_rate": 9.91048177870751e-06, + "loss": 0.9166, + "step": 2286 + }, + { + "epoch": 0.12587374098739612, + "grad_norm": 0.758369505405426, + "learning_rate": 9.91040010413132e-06, + "loss": 0.8305, + "step": 2287 + }, + { + "epoch": 0.12592877978975178, + "grad_norm": 0.8775640726089478, + "learning_rate": 9.910318392649876e-06, + "loss": 0.8513, + "step": 2288 + }, + { + "epoch": 0.12598381859210744, + "grad_norm": 0.8581671118736267, + "learning_rate": 9.910236644263796e-06, + "loss": 0.8134, + "step": 2289 + }, + { + "epoch": 0.1260388573944631, + "grad_norm": 0.8570736050605774, + "learning_rate": 9.910154858973689e-06, + "loss": 0.826, + "step": 2290 + }, + { + "epoch": 0.12609389619681877, + "grad_norm": 0.8712487816810608, + "learning_rate": 9.910073036780173e-06, + "loss": 0.8042, + "step": 2291 + }, + { + "epoch": 0.12614893499917443, + "grad_norm": 0.7584837675094604, + "learning_rate": 9.909991177683862e-06, + "loss": 0.7715, + "step": 2292 + }, + { + "epoch": 0.1262039738015301, + "grad_norm": 0.8618917465209961, + "learning_rate": 9.909909281685373e-06, + "loss": 0.8755, + "step": 2293 + }, + { + "epoch": 0.12625901260388575, + "grad_norm": 0.9530277848243713, + "learning_rate": 9.90982734878532e-06, + "loss": 0.8538, + "step": 2294 + }, + { + "epoch": 0.1263140514062414, + "grad_norm": 0.8394436836242676, + "learning_rate": 9.909745378984319e-06, + "loss": 0.8401, + "step": 2295 + }, + { + "epoch": 0.12636909020859707, + "grad_norm": 0.8224034309387207, + "learning_rate": 9.909663372282984e-06, + "loss": 0.7201, + "step": 2296 + }, + { + "epoch": 0.12642412901095273, + "grad_norm": 0.8215349912643433, + "learning_rate": 9.909581328681934e-06, + "loss": 0.8824, + "step": 2297 + }, + { + "epoch": 0.12647916781330837, + "grad_norm": 0.839389443397522, + "learning_rate": 9.909499248181786e-06, + "loss": 0.8056, + "step": 2298 + }, + { + "epoch": 0.12653420661566403, + "grad_norm": 0.9440048933029175, + "learning_rate": 9.909417130783156e-06, + "loss": 0.908, + "step": 2299 + }, + { + "epoch": 0.1265892454180197, + "grad_norm": 0.8336486220359802, + "learning_rate": 9.90933497648666e-06, + "loss": 0.8382, + "step": 2300 + }, + { + "epoch": 0.12664428422037535, + "grad_norm": 1.1541366577148438, + "learning_rate": 9.909252785292918e-06, + "loss": 0.8782, + "step": 2301 + }, + { + "epoch": 0.12669932302273101, + "grad_norm": 0.8730320334434509, + "learning_rate": 9.909170557202545e-06, + "loss": 0.7687, + "step": 2302 + }, + { + "epoch": 0.12675436182508668, + "grad_norm": 0.9927527904510498, + "learning_rate": 9.90908829221616e-06, + "loss": 0.8134, + "step": 2303 + }, + { + "epoch": 0.12680940062744234, + "grad_norm": 0.9521791338920593, + "learning_rate": 9.909005990334381e-06, + "loss": 0.9187, + "step": 2304 + }, + { + "epoch": 0.126864439429798, + "grad_norm": 0.8012455701828003, + "learning_rate": 9.908923651557828e-06, + "loss": 0.8581, + "step": 2305 + }, + { + "epoch": 0.12691947823215366, + "grad_norm": 0.8882689476013184, + "learning_rate": 9.90884127588712e-06, + "loss": 0.9317, + "step": 2306 + }, + { + "epoch": 0.12697451703450932, + "grad_norm": 0.8408340215682983, + "learning_rate": 9.908758863322872e-06, + "loss": 0.8444, + "step": 2307 + }, + { + "epoch": 0.12702955583686498, + "grad_norm": 0.7856307029724121, + "learning_rate": 9.908676413865709e-06, + "loss": 0.8457, + "step": 2308 + }, + { + "epoch": 0.12708459463922064, + "grad_norm": 0.9459167718887329, + "learning_rate": 9.908593927516247e-06, + "loss": 0.8153, + "step": 2309 + }, + { + "epoch": 0.1271396334415763, + "grad_norm": 0.8629655838012695, + "learning_rate": 9.908511404275107e-06, + "loss": 0.8279, + "step": 2310 + }, + { + "epoch": 0.12719467224393197, + "grad_norm": 1.2012875080108643, + "learning_rate": 9.90842884414291e-06, + "loss": 1.4388, + "step": 2311 + }, + { + "epoch": 0.12724971104628763, + "grad_norm": 1.20725417137146, + "learning_rate": 9.908346247120274e-06, + "loss": 0.8704, + "step": 2312 + }, + { + "epoch": 0.1273047498486433, + "grad_norm": 0.8152929544448853, + "learning_rate": 9.908263613207822e-06, + "loss": 0.8618, + "step": 2313 + }, + { + "epoch": 0.12735978865099895, + "grad_norm": 0.8400965332984924, + "learning_rate": 9.908180942406175e-06, + "loss": 0.7881, + "step": 2314 + }, + { + "epoch": 0.1274148274533546, + "grad_norm": 0.8856974840164185, + "learning_rate": 9.908098234715956e-06, + "loss": 0.9073, + "step": 2315 + }, + { + "epoch": 0.12746986625571027, + "grad_norm": 0.8708439469337463, + "learning_rate": 9.908015490137782e-06, + "loss": 0.8099, + "step": 2316 + }, + { + "epoch": 0.12752490505806593, + "grad_norm": 0.8632444143295288, + "learning_rate": 9.907932708672277e-06, + "loss": 0.8472, + "step": 2317 + }, + { + "epoch": 0.1275799438604216, + "grad_norm": 0.8977149128913879, + "learning_rate": 9.907849890320062e-06, + "loss": 0.8878, + "step": 2318 + }, + { + "epoch": 0.12763498266277726, + "grad_norm": 0.8589425086975098, + "learning_rate": 9.907767035081765e-06, + "loss": 0.7905, + "step": 2319 + }, + { + "epoch": 0.12769002146513292, + "grad_norm": 0.9873501062393188, + "learning_rate": 9.907684142958002e-06, + "loss": 0.9002, + "step": 2320 + }, + { + "epoch": 0.12774506026748858, + "grad_norm": 0.8963840007781982, + "learning_rate": 9.9076012139494e-06, + "loss": 0.92, + "step": 2321 + }, + { + "epoch": 0.12780009906984424, + "grad_norm": 0.7933574318885803, + "learning_rate": 9.90751824805658e-06, + "loss": 0.7664, + "step": 2322 + }, + { + "epoch": 0.1278551378721999, + "grad_norm": 0.9660933017730713, + "learning_rate": 9.907435245280167e-06, + "loss": 0.9162, + "step": 2323 + }, + { + "epoch": 0.12791017667455556, + "grad_norm": 0.8698949217796326, + "learning_rate": 9.907352205620783e-06, + "loss": 0.7988, + "step": 2324 + }, + { + "epoch": 0.12796521547691123, + "grad_norm": 0.9077615141868591, + "learning_rate": 9.907269129079055e-06, + "loss": 0.8581, + "step": 2325 + }, + { + "epoch": 0.1280202542792669, + "grad_norm": 0.9128179550170898, + "learning_rate": 9.907186015655607e-06, + "loss": 0.8552, + "step": 2326 + }, + { + "epoch": 0.12807529308162255, + "grad_norm": 0.9321265816688538, + "learning_rate": 9.907102865351062e-06, + "loss": 0.889, + "step": 2327 + }, + { + "epoch": 0.1281303318839782, + "grad_norm": 0.9687464833259583, + "learning_rate": 9.907019678166044e-06, + "loss": 0.7944, + "step": 2328 + }, + { + "epoch": 0.12818537068633387, + "grad_norm": 0.862223207950592, + "learning_rate": 9.90693645410118e-06, + "loss": 0.7699, + "step": 2329 + }, + { + "epoch": 0.12824040948868953, + "grad_norm": 0.9662127494812012, + "learning_rate": 9.906853193157095e-06, + "loss": 0.7818, + "step": 2330 + }, + { + "epoch": 0.1282954482910452, + "grad_norm": 0.8008295297622681, + "learning_rate": 9.906769895334413e-06, + "loss": 0.8443, + "step": 2331 + }, + { + "epoch": 0.12835048709340086, + "grad_norm": 0.8638464212417603, + "learning_rate": 9.906686560633765e-06, + "loss": 0.8438, + "step": 2332 + }, + { + "epoch": 0.12840552589575652, + "grad_norm": 0.9215866327285767, + "learning_rate": 9.906603189055773e-06, + "loss": 0.7481, + "step": 2333 + }, + { + "epoch": 0.12846056469811218, + "grad_norm": 0.7926739454269409, + "learning_rate": 9.906519780601066e-06, + "loss": 0.7404, + "step": 2334 + }, + { + "epoch": 0.12851560350046784, + "grad_norm": 0.9590242505073547, + "learning_rate": 9.906436335270268e-06, + "loss": 0.8319, + "step": 2335 + }, + { + "epoch": 0.1285706423028235, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.906352853064009e-06, + "loss": 0.8635, + "step": 2336 + }, + { + "epoch": 0.12862568110517916, + "grad_norm": 0.8401443958282471, + "learning_rate": 9.906269333982915e-06, + "loss": 0.9584, + "step": 2337 + }, + { + "epoch": 0.12868071990753482, + "grad_norm": 0.8144069910049438, + "learning_rate": 9.906185778027613e-06, + "loss": 0.7375, + "step": 2338 + }, + { + "epoch": 0.12873575870989049, + "grad_norm": 0.8513948917388916, + "learning_rate": 9.906102185198733e-06, + "loss": 0.8353, + "step": 2339 + }, + { + "epoch": 0.12879079751224615, + "grad_norm": 0.8243077397346497, + "learning_rate": 9.906018555496903e-06, + "loss": 0.8665, + "step": 2340 + }, + { + "epoch": 0.12884583631460178, + "grad_norm": 0.8699066042900085, + "learning_rate": 9.905934888922749e-06, + "loss": 0.8537, + "step": 2341 + }, + { + "epoch": 0.12890087511695744, + "grad_norm": 1.0980210304260254, + "learning_rate": 9.905851185476902e-06, + "loss": 0.8887, + "step": 2342 + }, + { + "epoch": 0.1289559139193131, + "grad_norm": 0.8189190030097961, + "learning_rate": 9.905767445159992e-06, + "loss": 0.8467, + "step": 2343 + }, + { + "epoch": 0.12901095272166876, + "grad_norm": 0.8273541331291199, + "learning_rate": 9.905683667972645e-06, + "loss": 0.8701, + "step": 2344 + }, + { + "epoch": 0.12906599152402443, + "grad_norm": 0.8987969160079956, + "learning_rate": 9.905599853915496e-06, + "loss": 0.909, + "step": 2345 + }, + { + "epoch": 0.1291210303263801, + "grad_norm": 0.818268895149231, + "learning_rate": 9.905516002989168e-06, + "loss": 0.7946, + "step": 2346 + }, + { + "epoch": 0.12917606912873575, + "grad_norm": 0.7401725053787231, + "learning_rate": 9.905432115194296e-06, + "loss": 0.7006, + "step": 2347 + }, + { + "epoch": 0.1292311079310914, + "grad_norm": 0.8263179659843445, + "learning_rate": 9.905348190531511e-06, + "loss": 0.7768, + "step": 2348 + }, + { + "epoch": 0.12928614673344707, + "grad_norm": 0.9241918921470642, + "learning_rate": 9.90526422900144e-06, + "loss": 0.8593, + "step": 2349 + }, + { + "epoch": 0.12934118553580273, + "grad_norm": 0.7804501056671143, + "learning_rate": 9.905180230604718e-06, + "loss": 0.7607, + "step": 2350 + }, + { + "epoch": 0.1293962243381584, + "grad_norm": 0.9408491253852844, + "learning_rate": 9.905096195341973e-06, + "loss": 0.8906, + "step": 2351 + }, + { + "epoch": 0.12945126314051406, + "grad_norm": 1.0356301069259644, + "learning_rate": 9.905012123213838e-06, + "loss": 0.8051, + "step": 2352 + }, + { + "epoch": 0.12950630194286972, + "grad_norm": 0.8546886444091797, + "learning_rate": 9.904928014220945e-06, + "loss": 0.7543, + "step": 2353 + }, + { + "epoch": 0.12956134074522538, + "grad_norm": 0.9229897856712341, + "learning_rate": 9.904843868363927e-06, + "loss": 0.8823, + "step": 2354 + }, + { + "epoch": 0.12961637954758104, + "grad_norm": 0.8364199995994568, + "learning_rate": 9.904759685643414e-06, + "loss": 0.8825, + "step": 2355 + }, + { + "epoch": 0.1296714183499367, + "grad_norm": 0.9092077016830444, + "learning_rate": 9.90467546606004e-06, + "loss": 0.8721, + "step": 2356 + }, + { + "epoch": 0.12972645715229236, + "grad_norm": 1.042973518371582, + "learning_rate": 9.904591209614441e-06, + "loss": 0.7984, + "step": 2357 + }, + { + "epoch": 0.12978149595464802, + "grad_norm": 0.7262618541717529, + "learning_rate": 9.904506916307243e-06, + "loss": 0.6721, + "step": 2358 + }, + { + "epoch": 0.12983653475700369, + "grad_norm": 0.7562826871871948, + "learning_rate": 9.904422586139086e-06, + "loss": 0.7702, + "step": 2359 + }, + { + "epoch": 0.12989157355935935, + "grad_norm": 0.8821595907211304, + "learning_rate": 9.904338219110603e-06, + "loss": 0.8555, + "step": 2360 + }, + { + "epoch": 0.129946612361715, + "grad_norm": 1.0340098142623901, + "learning_rate": 9.904253815222424e-06, + "loss": 0.9004, + "step": 2361 + }, + { + "epoch": 0.13000165116407067, + "grad_norm": 0.8533693552017212, + "learning_rate": 9.904169374475188e-06, + "loss": 0.836, + "step": 2362 + }, + { + "epoch": 0.13005668996642633, + "grad_norm": 0.8564199805259705, + "learning_rate": 9.904084896869528e-06, + "loss": 0.9281, + "step": 2363 + }, + { + "epoch": 0.130111728768782, + "grad_norm": 0.7817538976669312, + "learning_rate": 9.904000382406079e-06, + "loss": 0.7444, + "step": 2364 + }, + { + "epoch": 0.13016676757113765, + "grad_norm": 1.1420893669128418, + "learning_rate": 9.903915831085473e-06, + "loss": 0.9116, + "step": 2365 + }, + { + "epoch": 0.13022180637349332, + "grad_norm": 0.9671920537948608, + "learning_rate": 9.903831242908351e-06, + "loss": 0.899, + "step": 2366 + }, + { + "epoch": 0.13027684517584898, + "grad_norm": 0.8528717756271362, + "learning_rate": 9.903746617875345e-06, + "loss": 0.7231, + "step": 2367 + }, + { + "epoch": 0.13033188397820464, + "grad_norm": 0.786960244178772, + "learning_rate": 9.903661955987091e-06, + "loss": 0.7997, + "step": 2368 + }, + { + "epoch": 0.1303869227805603, + "grad_norm": 0.941683292388916, + "learning_rate": 9.903577257244228e-06, + "loss": 0.9127, + "step": 2369 + }, + { + "epoch": 0.13044196158291596, + "grad_norm": 0.886900007724762, + "learning_rate": 9.903492521647391e-06, + "loss": 0.9086, + "step": 2370 + }, + { + "epoch": 0.13049700038527162, + "grad_norm": 0.9924801588058472, + "learning_rate": 9.903407749197216e-06, + "loss": 0.9055, + "step": 2371 + }, + { + "epoch": 0.13055203918762728, + "grad_norm": 0.6998724341392517, + "learning_rate": 9.903322939894342e-06, + "loss": 0.6972, + "step": 2372 + }, + { + "epoch": 0.13060707798998294, + "grad_norm": 0.8448702096939087, + "learning_rate": 9.903238093739404e-06, + "loss": 0.7862, + "step": 2373 + }, + { + "epoch": 0.1306621167923386, + "grad_norm": 0.8557441830635071, + "learning_rate": 9.90315321073304e-06, + "loss": 0.8364, + "step": 2374 + }, + { + "epoch": 0.13071715559469427, + "grad_norm": 0.7978441119194031, + "learning_rate": 9.903068290875892e-06, + "loss": 0.7671, + "step": 2375 + }, + { + "epoch": 0.13077219439704993, + "grad_norm": 0.781315803527832, + "learning_rate": 9.902983334168594e-06, + "loss": 0.7963, + "step": 2376 + }, + { + "epoch": 0.1308272331994056, + "grad_norm": 0.7326155304908752, + "learning_rate": 9.902898340611785e-06, + "loss": 0.8, + "step": 2377 + }, + { + "epoch": 0.13088227200176125, + "grad_norm": 0.7693139314651489, + "learning_rate": 9.902813310206105e-06, + "loss": 0.8459, + "step": 2378 + }, + { + "epoch": 0.1309373108041169, + "grad_norm": 0.9441308975219727, + "learning_rate": 9.902728242952191e-06, + "loss": 0.8519, + "step": 2379 + }, + { + "epoch": 0.13099234960647257, + "grad_norm": 0.8350616693496704, + "learning_rate": 9.902643138850686e-06, + "loss": 0.876, + "step": 2380 + }, + { + "epoch": 0.13104738840882824, + "grad_norm": 0.8675554394721985, + "learning_rate": 9.902557997902227e-06, + "loss": 0.8172, + "step": 2381 + }, + { + "epoch": 0.1311024272111839, + "grad_norm": 0.9618930220603943, + "learning_rate": 9.902472820107454e-06, + "loss": 0.8852, + "step": 2382 + }, + { + "epoch": 0.13115746601353956, + "grad_norm": 0.862341046333313, + "learning_rate": 9.902387605467007e-06, + "loss": 0.9256, + "step": 2383 + }, + { + "epoch": 0.1312125048158952, + "grad_norm": 0.8749859929084778, + "learning_rate": 9.902302353981527e-06, + "loss": 0.8809, + "step": 2384 + }, + { + "epoch": 0.13126754361825085, + "grad_norm": 0.9061958193778992, + "learning_rate": 9.902217065651657e-06, + "loss": 0.779, + "step": 2385 + }, + { + "epoch": 0.13132258242060652, + "grad_norm": 0.8909298777580261, + "learning_rate": 9.902131740478033e-06, + "loss": 0.8203, + "step": 2386 + }, + { + "epoch": 0.13137762122296218, + "grad_norm": 0.8507269024848938, + "learning_rate": 9.902046378461302e-06, + "loss": 0.776, + "step": 2387 + }, + { + "epoch": 0.13143266002531784, + "grad_norm": 0.9577299356460571, + "learning_rate": 9.901960979602101e-06, + "loss": 0.8104, + "step": 2388 + }, + { + "epoch": 0.1314876988276735, + "grad_norm": 0.9244948625564575, + "learning_rate": 9.901875543901074e-06, + "loss": 0.9035, + "step": 2389 + }, + { + "epoch": 0.13154273763002916, + "grad_norm": 0.7534334063529968, + "learning_rate": 9.901790071358861e-06, + "loss": 0.7262, + "step": 2390 + }, + { + "epoch": 0.13159777643238482, + "grad_norm": 0.8920090198516846, + "learning_rate": 9.901704561976106e-06, + "loss": 0.932, + "step": 2391 + }, + { + "epoch": 0.13165281523474048, + "grad_norm": 0.8524243235588074, + "learning_rate": 9.901619015753455e-06, + "loss": 0.8107, + "step": 2392 + }, + { + "epoch": 0.13170785403709614, + "grad_norm": 0.8170381784439087, + "learning_rate": 9.901533432691543e-06, + "loss": 0.8814, + "step": 2393 + }, + { + "epoch": 0.1317628928394518, + "grad_norm": 0.8281697034835815, + "learning_rate": 9.90144781279102e-06, + "loss": 0.8221, + "step": 2394 + }, + { + "epoch": 0.13181793164180747, + "grad_norm": 0.9283351302146912, + "learning_rate": 9.901362156052528e-06, + "loss": 0.8346, + "step": 2395 + }, + { + "epoch": 0.13187297044416313, + "grad_norm": 0.8331275582313538, + "learning_rate": 9.901276462476708e-06, + "loss": 0.7498, + "step": 2396 + }, + { + "epoch": 0.1319280092465188, + "grad_norm": 0.8427191972732544, + "learning_rate": 9.901190732064207e-06, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.13198304804887445, + "grad_norm": 0.8510351777076721, + "learning_rate": 9.901104964815669e-06, + "loss": 0.8369, + "step": 2398 + }, + { + "epoch": 0.1320380868512301, + "grad_norm": 0.8468914031982422, + "learning_rate": 9.901019160731738e-06, + "loss": 0.8585, + "step": 2399 + }, + { + "epoch": 0.13209312565358577, + "grad_norm": 0.8302182555198669, + "learning_rate": 9.900933319813058e-06, + "loss": 0.8611, + "step": 2400 + }, + { + "epoch": 0.13214816445594144, + "grad_norm": 0.8527448773384094, + "learning_rate": 9.900847442060277e-06, + "loss": 0.899, + "step": 2401 + }, + { + "epoch": 0.1322032032582971, + "grad_norm": 0.8354688286781311, + "learning_rate": 9.900761527474037e-06, + "loss": 0.8083, + "step": 2402 + }, + { + "epoch": 0.13225824206065276, + "grad_norm": 0.8612173795700073, + "learning_rate": 9.900675576054986e-06, + "loss": 0.8124, + "step": 2403 + }, + { + "epoch": 0.13231328086300842, + "grad_norm": 0.7424876689910889, + "learning_rate": 9.900589587803767e-06, + "loss": 0.6884, + "step": 2404 + }, + { + "epoch": 0.13236831966536408, + "grad_norm": 0.8431115746498108, + "learning_rate": 9.90050356272103e-06, + "loss": 0.9575, + "step": 2405 + }, + { + "epoch": 0.13242335846771974, + "grad_norm": 0.7958092093467712, + "learning_rate": 9.90041750080742e-06, + "loss": 0.7608, + "step": 2406 + }, + { + "epoch": 0.1324783972700754, + "grad_norm": 0.926258385181427, + "learning_rate": 9.900331402063583e-06, + "loss": 0.9072, + "step": 2407 + }, + { + "epoch": 0.13253343607243107, + "grad_norm": 0.7952526807785034, + "learning_rate": 9.900245266490169e-06, + "loss": 0.8001, + "step": 2408 + }, + { + "epoch": 0.13258847487478673, + "grad_norm": 0.8309933543205261, + "learning_rate": 9.900159094087822e-06, + "loss": 0.9154, + "step": 2409 + }, + { + "epoch": 0.1326435136771424, + "grad_norm": 0.858007550239563, + "learning_rate": 9.90007288485719e-06, + "loss": 0.855, + "step": 2410 + }, + { + "epoch": 0.13269855247949805, + "grad_norm": 0.9513822197914124, + "learning_rate": 9.899986638798923e-06, + "loss": 0.8162, + "step": 2411 + }, + { + "epoch": 0.1327535912818537, + "grad_norm": 0.8387427926063538, + "learning_rate": 9.899900355913668e-06, + "loss": 0.8955, + "step": 2412 + }, + { + "epoch": 0.13280863008420937, + "grad_norm": 0.7727940678596497, + "learning_rate": 9.899814036202073e-06, + "loss": 0.6765, + "step": 2413 + }, + { + "epoch": 0.13286366888656503, + "grad_norm": 0.7760928869247437, + "learning_rate": 9.899727679664788e-06, + "loss": 0.7179, + "step": 2414 + }, + { + "epoch": 0.1329187076889207, + "grad_norm": 0.7798073887825012, + "learning_rate": 9.899641286302462e-06, + "loss": 0.8541, + "step": 2415 + }, + { + "epoch": 0.13297374649127636, + "grad_norm": 0.8302769660949707, + "learning_rate": 9.899554856115743e-06, + "loss": 0.8925, + "step": 2416 + }, + { + "epoch": 0.13302878529363202, + "grad_norm": 0.8300751447677612, + "learning_rate": 9.89946838910528e-06, + "loss": 0.7489, + "step": 2417 + }, + { + "epoch": 0.13308382409598768, + "grad_norm": 0.8032094240188599, + "learning_rate": 9.899381885271725e-06, + "loss": 0.811, + "step": 2418 + }, + { + "epoch": 0.13313886289834334, + "grad_norm": 5.237870216369629, + "learning_rate": 9.899295344615727e-06, + "loss": 0.7609, + "step": 2419 + }, + { + "epoch": 0.133193901700699, + "grad_norm": 0.8145740628242493, + "learning_rate": 9.899208767137935e-06, + "loss": 0.8435, + "step": 2420 + }, + { + "epoch": 0.13324894050305466, + "grad_norm": 0.9716018438339233, + "learning_rate": 9.899122152839004e-06, + "loss": 0.7924, + "step": 2421 + }, + { + "epoch": 0.13330397930541033, + "grad_norm": 0.7846183776855469, + "learning_rate": 9.899035501719582e-06, + "loss": 0.8941, + "step": 2422 + }, + { + "epoch": 0.133359018107766, + "grad_norm": 0.7653689980506897, + "learning_rate": 9.89894881378032e-06, + "loss": 0.811, + "step": 2423 + }, + { + "epoch": 0.13341405691012165, + "grad_norm": 0.8221875429153442, + "learning_rate": 9.89886208902187e-06, + "loss": 0.8131, + "step": 2424 + }, + { + "epoch": 0.1334690957124773, + "grad_norm": 0.7422335147857666, + "learning_rate": 9.898775327444885e-06, + "loss": 0.6366, + "step": 2425 + }, + { + "epoch": 0.13352413451483297, + "grad_norm": 0.8072695136070251, + "learning_rate": 9.898688529050014e-06, + "loss": 0.7989, + "step": 2426 + }, + { + "epoch": 0.1335791733171886, + "grad_norm": 0.7717600464820862, + "learning_rate": 9.898601693837911e-06, + "loss": 0.7524, + "step": 2427 + }, + { + "epoch": 0.13363421211954427, + "grad_norm": 0.8070919513702393, + "learning_rate": 9.898514821809231e-06, + "loss": 0.7724, + "step": 2428 + }, + { + "epoch": 0.13368925092189993, + "grad_norm": 0.8184726238250732, + "learning_rate": 9.898427912964624e-06, + "loss": 0.845, + "step": 2429 + }, + { + "epoch": 0.1337442897242556, + "grad_norm": 0.8168759346008301, + "learning_rate": 9.898340967304744e-06, + "loss": 0.8377, + "step": 2430 + }, + { + "epoch": 0.13379932852661125, + "grad_norm": 0.8701872825622559, + "learning_rate": 9.898253984830244e-06, + "loss": 0.908, + "step": 2431 + }, + { + "epoch": 0.1338543673289669, + "grad_norm": 0.8092133402824402, + "learning_rate": 9.898166965541779e-06, + "loss": 0.866, + "step": 2432 + }, + { + "epoch": 0.13390940613132257, + "grad_norm": 0.8337095975875854, + "learning_rate": 9.898079909440002e-06, + "loss": 0.8622, + "step": 2433 + }, + { + "epoch": 0.13396444493367823, + "grad_norm": 1.1016209125518799, + "learning_rate": 9.897992816525567e-06, + "loss": 0.8486, + "step": 2434 + }, + { + "epoch": 0.1340194837360339, + "grad_norm": 0.8136518597602844, + "learning_rate": 9.89790568679913e-06, + "loss": 0.8681, + "step": 2435 + }, + { + "epoch": 0.13407452253838956, + "grad_norm": 0.8202341794967651, + "learning_rate": 9.897818520261344e-06, + "loss": 0.9144, + "step": 2436 + }, + { + "epoch": 0.13412956134074522, + "grad_norm": 0.8836861848831177, + "learning_rate": 9.897731316912866e-06, + "loss": 0.8643, + "step": 2437 + }, + { + "epoch": 0.13418460014310088, + "grad_norm": 0.9040210247039795, + "learning_rate": 9.89764407675435e-06, + "loss": 0.7681, + "step": 2438 + }, + { + "epoch": 0.13423963894545654, + "grad_norm": 0.8762359619140625, + "learning_rate": 9.897556799786452e-06, + "loss": 0.8765, + "step": 2439 + }, + { + "epoch": 0.1342946777478122, + "grad_norm": 0.8859462738037109, + "learning_rate": 9.897469486009827e-06, + "loss": 0.9051, + "step": 2440 + }, + { + "epoch": 0.13434971655016786, + "grad_norm": 0.7727539539337158, + "learning_rate": 9.897382135425134e-06, + "loss": 0.7397, + "step": 2441 + }, + { + "epoch": 0.13440475535252353, + "grad_norm": 0.9018967151641846, + "learning_rate": 9.897294748033028e-06, + "loss": 0.8542, + "step": 2442 + }, + { + "epoch": 0.1344597941548792, + "grad_norm": 0.8228337168693542, + "learning_rate": 9.897207323834165e-06, + "loss": 0.7585, + "step": 2443 + }, + { + "epoch": 0.13451483295723485, + "grad_norm": 0.7509974241256714, + "learning_rate": 9.897119862829203e-06, + "loss": 0.7285, + "step": 2444 + }, + { + "epoch": 0.1345698717595905, + "grad_norm": 0.9225835800170898, + "learning_rate": 9.897032365018797e-06, + "loss": 0.8352, + "step": 2445 + }, + { + "epoch": 0.13462491056194617, + "grad_norm": 0.800981879234314, + "learning_rate": 9.896944830403609e-06, + "loss": 0.7352, + "step": 2446 + }, + { + "epoch": 0.13467994936430183, + "grad_norm": 0.8263673186302185, + "learning_rate": 9.896857258984294e-06, + "loss": 0.8426, + "step": 2447 + }, + { + "epoch": 0.1347349881666575, + "grad_norm": 0.8857110738754272, + "learning_rate": 9.89676965076151e-06, + "loss": 0.8078, + "step": 2448 + }, + { + "epoch": 0.13479002696901315, + "grad_norm": 0.8637158274650574, + "learning_rate": 9.896682005735916e-06, + "loss": 0.8688, + "step": 2449 + }, + { + "epoch": 0.13484506577136882, + "grad_norm": 0.9050095081329346, + "learning_rate": 9.89659432390817e-06, + "loss": 0.831, + "step": 2450 + }, + { + "epoch": 0.13490010457372448, + "grad_norm": 0.829757034778595, + "learning_rate": 9.896506605278933e-06, + "loss": 0.8095, + "step": 2451 + }, + { + "epoch": 0.13495514337608014, + "grad_norm": 0.8910449743270874, + "learning_rate": 9.896418849848864e-06, + "loss": 0.9134, + "step": 2452 + }, + { + "epoch": 0.1350101821784358, + "grad_norm": 0.8856307864189148, + "learning_rate": 9.89633105761862e-06, + "loss": 0.8171, + "step": 2453 + }, + { + "epoch": 0.13506522098079146, + "grad_norm": 0.8159938454627991, + "learning_rate": 9.896243228588864e-06, + "loss": 0.8205, + "step": 2454 + }, + { + "epoch": 0.13512025978314712, + "grad_norm": 0.8200929760932922, + "learning_rate": 9.896155362760254e-06, + "loss": 0.7529, + "step": 2455 + }, + { + "epoch": 0.13517529858550278, + "grad_norm": 0.7591279149055481, + "learning_rate": 9.89606746013345e-06, + "loss": 0.8205, + "step": 2456 + }, + { + "epoch": 0.13523033738785845, + "grad_norm": 0.8598676323890686, + "learning_rate": 9.895979520709114e-06, + "loss": 0.8212, + "step": 2457 + }, + { + "epoch": 0.1352853761902141, + "grad_norm": 0.7290365099906921, + "learning_rate": 9.895891544487905e-06, + "loss": 0.7893, + "step": 2458 + }, + { + "epoch": 0.13534041499256977, + "grad_norm": 0.8040594458580017, + "learning_rate": 9.895803531470487e-06, + "loss": 0.8358, + "step": 2459 + }, + { + "epoch": 0.13539545379492543, + "grad_norm": 0.9286525249481201, + "learning_rate": 9.895715481657522e-06, + "loss": 0.8104, + "step": 2460 + }, + { + "epoch": 0.1354504925972811, + "grad_norm": 0.843054473400116, + "learning_rate": 9.895627395049668e-06, + "loss": 0.7872, + "step": 2461 + }, + { + "epoch": 0.13550553139963675, + "grad_norm": 0.7894387245178223, + "learning_rate": 9.895539271647588e-06, + "loss": 0.8615, + "step": 2462 + }, + { + "epoch": 0.13556057020199241, + "grad_norm": 0.9185294508934021, + "learning_rate": 9.895451111451948e-06, + "loss": 0.8732, + "step": 2463 + }, + { + "epoch": 0.13561560900434808, + "grad_norm": 0.8586474657058716, + "learning_rate": 9.895362914463405e-06, + "loss": 0.9658, + "step": 2464 + }, + { + "epoch": 0.13567064780670374, + "grad_norm": 0.8810474276542664, + "learning_rate": 9.895274680682628e-06, + "loss": 0.8622, + "step": 2465 + }, + { + "epoch": 0.1357256866090594, + "grad_norm": 0.8862990736961365, + "learning_rate": 9.895186410110273e-06, + "loss": 0.916, + "step": 2466 + }, + { + "epoch": 0.13578072541141506, + "grad_norm": 0.7916743159294128, + "learning_rate": 9.89509810274701e-06, + "loss": 0.837, + "step": 2467 + }, + { + "epoch": 0.13583576421377072, + "grad_norm": 0.9063515663146973, + "learning_rate": 9.8950097585935e-06, + "loss": 0.8065, + "step": 2468 + }, + { + "epoch": 0.13589080301612638, + "grad_norm": 0.7656043767929077, + "learning_rate": 9.894921377650405e-06, + "loss": 0.7064, + "step": 2469 + }, + { + "epoch": 0.13594584181848202, + "grad_norm": 1.0630278587341309, + "learning_rate": 9.894832959918392e-06, + "loss": 0.8168, + "step": 2470 + }, + { + "epoch": 0.13600088062083768, + "grad_norm": 0.9118956923484802, + "learning_rate": 9.894744505398126e-06, + "loss": 0.8972, + "step": 2471 + }, + { + "epoch": 0.13605591942319334, + "grad_norm": 0.8989213705062866, + "learning_rate": 9.89465601409027e-06, + "loss": 0.8374, + "step": 2472 + }, + { + "epoch": 0.136110958225549, + "grad_norm": 0.9398229718208313, + "learning_rate": 9.894567485995489e-06, + "loss": 0.8956, + "step": 2473 + }, + { + "epoch": 0.13616599702790466, + "grad_norm": 0.7980280518531799, + "learning_rate": 9.894478921114449e-06, + "loss": 0.8055, + "step": 2474 + }, + { + "epoch": 0.13622103583026032, + "grad_norm": 0.8910034894943237, + "learning_rate": 9.894390319447816e-06, + "loss": 0.8371, + "step": 2475 + }, + { + "epoch": 0.13627607463261598, + "grad_norm": 0.7848070859909058, + "learning_rate": 9.894301680996255e-06, + "loss": 0.8024, + "step": 2476 + }, + { + "epoch": 0.13633111343497165, + "grad_norm": 0.8538175821304321, + "learning_rate": 9.894213005760434e-06, + "loss": 0.8819, + "step": 2477 + }, + { + "epoch": 0.1363861522373273, + "grad_norm": 0.7885367274284363, + "learning_rate": 9.894124293741017e-06, + "loss": 0.7916, + "step": 2478 + }, + { + "epoch": 0.13644119103968297, + "grad_norm": 0.8555673956871033, + "learning_rate": 9.894035544938672e-06, + "loss": 0.8521, + "step": 2479 + }, + { + "epoch": 0.13649622984203863, + "grad_norm": 0.8104771971702576, + "learning_rate": 9.893946759354066e-06, + "loss": 0.8437, + "step": 2480 + }, + { + "epoch": 0.1365512686443943, + "grad_norm": 0.9131864309310913, + "learning_rate": 9.893857936987866e-06, + "loss": 0.8123, + "step": 2481 + }, + { + "epoch": 0.13660630744674995, + "grad_norm": 0.9414293766021729, + "learning_rate": 9.893769077840739e-06, + "loss": 0.7897, + "step": 2482 + }, + { + "epoch": 0.13666134624910561, + "grad_norm": 0.823265016078949, + "learning_rate": 9.893680181913355e-06, + "loss": 0.847, + "step": 2483 + }, + { + "epoch": 0.13671638505146128, + "grad_norm": 0.82098788022995, + "learning_rate": 9.89359124920638e-06, + "loss": 0.7823, + "step": 2484 + }, + { + "epoch": 0.13677142385381694, + "grad_norm": 0.817551851272583, + "learning_rate": 9.893502279720483e-06, + "loss": 0.8084, + "step": 2485 + }, + { + "epoch": 0.1368264626561726, + "grad_norm": 1.0722150802612305, + "learning_rate": 9.893413273456333e-06, + "loss": 0.7394, + "step": 2486 + }, + { + "epoch": 0.13688150145852826, + "grad_norm": 0.8045433759689331, + "learning_rate": 9.893324230414598e-06, + "loss": 0.7528, + "step": 2487 + }, + { + "epoch": 0.13693654026088392, + "grad_norm": 0.8694071173667908, + "learning_rate": 9.893235150595949e-06, + "loss": 0.803, + "step": 2488 + }, + { + "epoch": 0.13699157906323958, + "grad_norm": 0.8238615989685059, + "learning_rate": 9.893146034001054e-06, + "loss": 0.7909, + "step": 2489 + }, + { + "epoch": 0.13704661786559524, + "grad_norm": 0.7782405018806458, + "learning_rate": 9.893056880630583e-06, + "loss": 0.6859, + "step": 2490 + }, + { + "epoch": 0.1371016566679509, + "grad_norm": 0.7865599989891052, + "learning_rate": 9.892967690485207e-06, + "loss": 0.7982, + "step": 2491 + }, + { + "epoch": 0.13715669547030657, + "grad_norm": 0.768120288848877, + "learning_rate": 9.892878463565595e-06, + "loss": 0.8234, + "step": 2492 + }, + { + "epoch": 0.13721173427266223, + "grad_norm": 0.812493085861206, + "learning_rate": 9.89278919987242e-06, + "loss": 0.9152, + "step": 2493 + }, + { + "epoch": 0.1372667730750179, + "grad_norm": 0.7256335616111755, + "learning_rate": 9.892699899406348e-06, + "loss": 0.6703, + "step": 2494 + }, + { + "epoch": 0.13732181187737355, + "grad_norm": 0.8022804260253906, + "learning_rate": 9.892610562168054e-06, + "loss": 0.7918, + "step": 2495 + }, + { + "epoch": 0.1373768506797292, + "grad_norm": 0.8204907774925232, + "learning_rate": 9.89252118815821e-06, + "loss": 0.9094, + "step": 2496 + }, + { + "epoch": 0.13743188948208487, + "grad_norm": 0.9986788630485535, + "learning_rate": 9.892431777377484e-06, + "loss": 0.8921, + "step": 2497 + }, + { + "epoch": 0.13748692828444053, + "grad_norm": 0.7937983870506287, + "learning_rate": 9.892342329826554e-06, + "loss": 0.8048, + "step": 2498 + }, + { + "epoch": 0.1375419670867962, + "grad_norm": 0.9295744895935059, + "learning_rate": 9.892252845506086e-06, + "loss": 0.755, + "step": 2499 + }, + { + "epoch": 0.13759700588915186, + "grad_norm": 0.7920984625816345, + "learning_rate": 9.892163324416757e-06, + "loss": 0.7603, + "step": 2500 + }, + { + "epoch": 0.13765204469150752, + "grad_norm": 0.9229464530944824, + "learning_rate": 9.892073766559236e-06, + "loss": 0.8115, + "step": 2501 + }, + { + "epoch": 0.13770708349386318, + "grad_norm": 0.8205353021621704, + "learning_rate": 9.8919841719342e-06, + "loss": 0.8357, + "step": 2502 + }, + { + "epoch": 0.13776212229621884, + "grad_norm": 0.86461341381073, + "learning_rate": 9.891894540542318e-06, + "loss": 0.748, + "step": 2503 + }, + { + "epoch": 0.1378171610985745, + "grad_norm": 0.767145574092865, + "learning_rate": 9.891804872384267e-06, + "loss": 0.7404, + "step": 2504 + }, + { + "epoch": 0.13787219990093016, + "grad_norm": 0.7492040991783142, + "learning_rate": 9.891715167460721e-06, + "loss": 0.6958, + "step": 2505 + }, + { + "epoch": 0.13792723870328583, + "grad_norm": 0.8643150329589844, + "learning_rate": 9.891625425772353e-06, + "loss": 0.8408, + "step": 2506 + }, + { + "epoch": 0.1379822775056415, + "grad_norm": 0.8026981353759766, + "learning_rate": 9.891535647319838e-06, + "loss": 0.7895, + "step": 2507 + }, + { + "epoch": 0.13803731630799715, + "grad_norm": 1.2780394554138184, + "learning_rate": 9.89144583210385e-06, + "loss": 0.9113, + "step": 2508 + }, + { + "epoch": 0.1380923551103528, + "grad_norm": 0.8476191163063049, + "learning_rate": 9.891355980125064e-06, + "loss": 0.8224, + "step": 2509 + }, + { + "epoch": 0.13814739391270847, + "grad_norm": 1.048682689666748, + "learning_rate": 9.891266091384157e-06, + "loss": 0.8913, + "step": 2510 + }, + { + "epoch": 0.13820243271506413, + "grad_norm": 1.0314993858337402, + "learning_rate": 9.891176165881801e-06, + "loss": 0.8315, + "step": 2511 + }, + { + "epoch": 0.1382574715174198, + "grad_norm": 0.9500058889389038, + "learning_rate": 9.891086203618676e-06, + "loss": 0.9185, + "step": 2512 + }, + { + "epoch": 0.13831251031977543, + "grad_norm": 0.7860653400421143, + "learning_rate": 9.890996204595457e-06, + "loss": 0.804, + "step": 2513 + }, + { + "epoch": 0.1383675491221311, + "grad_norm": 0.8354741930961609, + "learning_rate": 9.89090616881282e-06, + "loss": 0.8214, + "step": 2514 + }, + { + "epoch": 0.13842258792448675, + "grad_norm": 0.9115905165672302, + "learning_rate": 9.890816096271438e-06, + "loss": 0.8801, + "step": 2515 + }, + { + "epoch": 0.1384776267268424, + "grad_norm": 0.8852075338363647, + "learning_rate": 9.890725986971994e-06, + "loss": 0.8821, + "step": 2516 + }, + { + "epoch": 0.13853266552919807, + "grad_norm": 0.804314374923706, + "learning_rate": 9.890635840915164e-06, + "loss": 0.8412, + "step": 2517 + }, + { + "epoch": 0.13858770433155373, + "grad_norm": 0.8242805600166321, + "learning_rate": 9.890545658101623e-06, + "loss": 0.8447, + "step": 2518 + }, + { + "epoch": 0.1386427431339094, + "grad_norm": 0.8385655879974365, + "learning_rate": 9.890455438532048e-06, + "loss": 0.8161, + "step": 2519 + }, + { + "epoch": 0.13869778193626506, + "grad_norm": 0.7950524687767029, + "learning_rate": 9.89036518220712e-06, + "loss": 0.8024, + "step": 2520 + }, + { + "epoch": 0.13875282073862072, + "grad_norm": 1.0031861066818237, + "learning_rate": 9.890274889127518e-06, + "loss": 0.8399, + "step": 2521 + }, + { + "epoch": 0.13880785954097638, + "grad_norm": 0.8403242230415344, + "learning_rate": 9.890184559293917e-06, + "loss": 0.8115, + "step": 2522 + }, + { + "epoch": 0.13886289834333204, + "grad_norm": 0.8389976024627686, + "learning_rate": 9.890094192706998e-06, + "loss": 0.9573, + "step": 2523 + }, + { + "epoch": 0.1389179371456877, + "grad_norm": 0.8408516645431519, + "learning_rate": 9.890003789367442e-06, + "loss": 0.8572, + "step": 2524 + }, + { + "epoch": 0.13897297594804336, + "grad_norm": 0.7607787251472473, + "learning_rate": 9.889913349275925e-06, + "loss": 0.8119, + "step": 2525 + }, + { + "epoch": 0.13902801475039903, + "grad_norm": 0.7696373462677002, + "learning_rate": 9.889822872433127e-06, + "loss": 0.8287, + "step": 2526 + }, + { + "epoch": 0.1390830535527547, + "grad_norm": 0.8518380522727966, + "learning_rate": 9.889732358839732e-06, + "loss": 0.9008, + "step": 2527 + }, + { + "epoch": 0.13913809235511035, + "grad_norm": 0.8851314783096313, + "learning_rate": 9.889641808496416e-06, + "loss": 0.8148, + "step": 2528 + }, + { + "epoch": 0.139193131157466, + "grad_norm": 0.9245797395706177, + "learning_rate": 9.889551221403862e-06, + "loss": 0.846, + "step": 2529 + }, + { + "epoch": 0.13924816995982167, + "grad_norm": 0.8445762991905212, + "learning_rate": 9.889460597562748e-06, + "loss": 0.8306, + "step": 2530 + }, + { + "epoch": 0.13930320876217733, + "grad_norm": 0.9149277806282043, + "learning_rate": 9.88936993697376e-06, + "loss": 0.8033, + "step": 2531 + }, + { + "epoch": 0.139358247564533, + "grad_norm": 0.894666850566864, + "learning_rate": 9.889279239637572e-06, + "loss": 0.8299, + "step": 2532 + }, + { + "epoch": 0.13941328636688866, + "grad_norm": 1.2897371053695679, + "learning_rate": 9.889188505554871e-06, + "loss": 0.7776, + "step": 2533 + }, + { + "epoch": 0.13946832516924432, + "grad_norm": 0.8927022218704224, + "learning_rate": 9.889097734726341e-06, + "loss": 0.8706, + "step": 2534 + }, + { + "epoch": 0.13952336397159998, + "grad_norm": 0.7688571214675903, + "learning_rate": 9.889006927152658e-06, + "loss": 0.8191, + "step": 2535 + }, + { + "epoch": 0.13957840277395564, + "grad_norm": 0.926671028137207, + "learning_rate": 9.88891608283451e-06, + "loss": 0.7489, + "step": 2536 + }, + { + "epoch": 0.1396334415763113, + "grad_norm": 0.8316965699195862, + "learning_rate": 9.888825201772577e-06, + "loss": 0.7783, + "step": 2537 + }, + { + "epoch": 0.13968848037866696, + "grad_norm": 0.8619750738143921, + "learning_rate": 9.88873428396754e-06, + "loss": 0.8269, + "step": 2538 + }, + { + "epoch": 0.13974351918102262, + "grad_norm": 0.8588540554046631, + "learning_rate": 9.888643329420086e-06, + "loss": 0.8133, + "step": 2539 + }, + { + "epoch": 0.13979855798337829, + "grad_norm": 0.7947841882705688, + "learning_rate": 9.8885523381309e-06, + "loss": 0.8041, + "step": 2540 + }, + { + "epoch": 0.13985359678573395, + "grad_norm": 0.8440257906913757, + "learning_rate": 9.888461310100661e-06, + "loss": 0.8324, + "step": 2541 + }, + { + "epoch": 0.1399086355880896, + "grad_norm": 0.7842260003089905, + "learning_rate": 9.888370245330055e-06, + "loss": 0.8031, + "step": 2542 + }, + { + "epoch": 0.13996367439044527, + "grad_norm": 0.8108223080635071, + "learning_rate": 9.888279143819768e-06, + "loss": 0.7998, + "step": 2543 + }, + { + "epoch": 0.14001871319280093, + "grad_norm": 0.9748625159263611, + "learning_rate": 9.888188005570482e-06, + "loss": 0.9553, + "step": 2544 + }, + { + "epoch": 0.1400737519951566, + "grad_norm": 0.8465562462806702, + "learning_rate": 9.888096830582883e-06, + "loss": 0.7884, + "step": 2545 + }, + { + "epoch": 0.14012879079751225, + "grad_norm": 0.9339833855628967, + "learning_rate": 9.88800561885766e-06, + "loss": 0.8135, + "step": 2546 + }, + { + "epoch": 0.14018382959986792, + "grad_norm": 0.7749297022819519, + "learning_rate": 9.887914370395492e-06, + "loss": 0.8411, + "step": 2547 + }, + { + "epoch": 0.14023886840222358, + "grad_norm": 0.862606942653656, + "learning_rate": 9.887823085197068e-06, + "loss": 0.7631, + "step": 2548 + }, + { + "epoch": 0.14029390720457924, + "grad_norm": 1.3383793830871582, + "learning_rate": 9.887731763263076e-06, + "loss": 0.7979, + "step": 2549 + }, + { + "epoch": 0.1403489460069349, + "grad_norm": 0.8092008233070374, + "learning_rate": 9.887640404594199e-06, + "loss": 0.7566, + "step": 2550 + }, + { + "epoch": 0.14040398480929056, + "grad_norm": 0.9233745336532593, + "learning_rate": 9.887549009191126e-06, + "loss": 0.8954, + "step": 2551 + }, + { + "epoch": 0.14045902361164622, + "grad_norm": 0.8533664345741272, + "learning_rate": 9.887457577054542e-06, + "loss": 0.8311, + "step": 2552 + }, + { + "epoch": 0.14051406241400188, + "grad_norm": 0.7679287791252136, + "learning_rate": 9.887366108185135e-06, + "loss": 0.7641, + "step": 2553 + }, + { + "epoch": 0.14056910121635754, + "grad_norm": 0.7998354434967041, + "learning_rate": 9.887274602583594e-06, + "loss": 0.7759, + "step": 2554 + }, + { + "epoch": 0.1406241400187132, + "grad_norm": 0.8877138495445251, + "learning_rate": 9.887183060250605e-06, + "loss": 0.8928, + "step": 2555 + }, + { + "epoch": 0.14067917882106884, + "grad_norm": 0.8022066354751587, + "learning_rate": 9.887091481186855e-06, + "loss": 0.8233, + "step": 2556 + }, + { + "epoch": 0.1407342176234245, + "grad_norm": 0.8419097065925598, + "learning_rate": 9.886999865393035e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.14078925642578016, + "grad_norm": 0.9581286311149597, + "learning_rate": 9.88690821286983e-06, + "loss": 0.8531, + "step": 2558 + }, + { + "epoch": 0.14084429522813582, + "grad_norm": 0.894851803779602, + "learning_rate": 9.886816523617933e-06, + "loss": 0.8594, + "step": 2559 + }, + { + "epoch": 0.14089933403049149, + "grad_norm": 0.7813432812690735, + "learning_rate": 9.886724797638032e-06, + "loss": 0.7311, + "step": 2560 + }, + { + "epoch": 0.14095437283284715, + "grad_norm": 0.8194118142127991, + "learning_rate": 9.886633034930814e-06, + "loss": 0.8067, + "step": 2561 + }, + { + "epoch": 0.1410094116352028, + "grad_norm": 0.8091121912002563, + "learning_rate": 9.88654123549697e-06, + "loss": 0.7558, + "step": 2562 + }, + { + "epoch": 0.14106445043755847, + "grad_norm": 0.8334764242172241, + "learning_rate": 9.88644939933719e-06, + "loss": 0.8375, + "step": 2563 + }, + { + "epoch": 0.14111948923991413, + "grad_norm": 0.8283817768096924, + "learning_rate": 9.886357526452166e-06, + "loss": 0.7839, + "step": 2564 + }, + { + "epoch": 0.1411745280422698, + "grad_norm": 0.8708772659301758, + "learning_rate": 9.886265616842585e-06, + "loss": 0.8193, + "step": 2565 + }, + { + "epoch": 0.14122956684462545, + "grad_norm": 0.9883641600608826, + "learning_rate": 9.886173670509141e-06, + "loss": 0.9409, + "step": 2566 + }, + { + "epoch": 0.14128460564698112, + "grad_norm": 0.8601766228675842, + "learning_rate": 9.886081687452523e-06, + "loss": 0.9391, + "step": 2567 + }, + { + "epoch": 0.14133964444933678, + "grad_norm": 0.8729620575904846, + "learning_rate": 9.885989667673422e-06, + "loss": 0.8372, + "step": 2568 + }, + { + "epoch": 0.14139468325169244, + "grad_norm": 0.7899564504623413, + "learning_rate": 9.885897611172532e-06, + "loss": 0.7773, + "step": 2569 + }, + { + "epoch": 0.1414497220540481, + "grad_norm": 0.8120512962341309, + "learning_rate": 9.885805517950542e-06, + "loss": 0.887, + "step": 2570 + }, + { + "epoch": 0.14150476085640376, + "grad_norm": 0.8475256562232971, + "learning_rate": 9.885713388008148e-06, + "loss": 0.7935, + "step": 2571 + }, + { + "epoch": 0.14155979965875942, + "grad_norm": 0.7669919729232788, + "learning_rate": 9.885621221346038e-06, + "loss": 0.7728, + "step": 2572 + }, + { + "epoch": 0.14161483846111508, + "grad_norm": 0.8298916220664978, + "learning_rate": 9.885529017964906e-06, + "loss": 0.7723, + "step": 2573 + }, + { + "epoch": 0.14166987726347074, + "grad_norm": 0.8630721569061279, + "learning_rate": 9.885436777865447e-06, + "loss": 0.8395, + "step": 2574 + }, + { + "epoch": 0.1417249160658264, + "grad_norm": 0.7566008567810059, + "learning_rate": 9.885344501048352e-06, + "loss": 0.806, + "step": 2575 + }, + { + "epoch": 0.14177995486818207, + "grad_norm": 0.7870769500732422, + "learning_rate": 9.885252187514316e-06, + "loss": 0.7683, + "step": 2576 + }, + { + "epoch": 0.14183499367053773, + "grad_norm": 0.879648745059967, + "learning_rate": 9.885159837264033e-06, + "loss": 0.8472, + "step": 2577 + }, + { + "epoch": 0.1418900324728934, + "grad_norm": 0.76839280128479, + "learning_rate": 9.885067450298196e-06, + "loss": 0.8534, + "step": 2578 + }, + { + "epoch": 0.14194507127524905, + "grad_norm": 0.8268701434135437, + "learning_rate": 9.884975026617498e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.1420001100776047, + "grad_norm": 0.8226090669631958, + "learning_rate": 9.884882566222638e-06, + "loss": 0.6756, + "step": 2580 + }, + { + "epoch": 0.14205514887996037, + "grad_norm": 0.8299756050109863, + "learning_rate": 9.884790069114307e-06, + "loss": 0.734, + "step": 2581 + }, + { + "epoch": 0.14211018768231604, + "grad_norm": 0.8241812586784363, + "learning_rate": 9.8846975352932e-06, + "loss": 0.8335, + "step": 2582 + }, + { + "epoch": 0.1421652264846717, + "grad_norm": 0.8458926677703857, + "learning_rate": 9.884604964760016e-06, + "loss": 0.7376, + "step": 2583 + }, + { + "epoch": 0.14222026528702736, + "grad_norm": 0.876966655254364, + "learning_rate": 9.884512357515447e-06, + "loss": 0.9414, + "step": 2584 + }, + { + "epoch": 0.14227530408938302, + "grad_norm": 0.770252525806427, + "learning_rate": 9.88441971356019e-06, + "loss": 0.8312, + "step": 2585 + }, + { + "epoch": 0.14233034289173868, + "grad_norm": 0.7883023023605347, + "learning_rate": 9.884327032894945e-06, + "loss": 0.8568, + "step": 2586 + }, + { + "epoch": 0.14238538169409434, + "grad_norm": 0.9092289209365845, + "learning_rate": 9.884234315520405e-06, + "loss": 0.9078, + "step": 2587 + }, + { + "epoch": 0.14244042049645, + "grad_norm": 0.7946531176567078, + "learning_rate": 9.884141561437266e-06, + "loss": 0.6895, + "step": 2588 + }, + { + "epoch": 0.14249545929880567, + "grad_norm": 0.7791070342063904, + "learning_rate": 9.884048770646227e-06, + "loss": 0.6984, + "step": 2589 + }, + { + "epoch": 0.14255049810116133, + "grad_norm": 0.7775537371635437, + "learning_rate": 9.883955943147982e-06, + "loss": 0.7568, + "step": 2590 + }, + { + "epoch": 0.142605536903517, + "grad_norm": 0.7735158801078796, + "learning_rate": 9.883863078943234e-06, + "loss": 0.8215, + "step": 2591 + }, + { + "epoch": 0.14266057570587265, + "grad_norm": 0.881365180015564, + "learning_rate": 9.88377017803268e-06, + "loss": 0.8817, + "step": 2592 + }, + { + "epoch": 0.1427156145082283, + "grad_norm": 0.8643443584442139, + "learning_rate": 9.883677240417014e-06, + "loss": 0.8024, + "step": 2593 + }, + { + "epoch": 0.14277065331058397, + "grad_norm": 0.885713517665863, + "learning_rate": 9.883584266096938e-06, + "loss": 0.7612, + "step": 2594 + }, + { + "epoch": 0.14282569211293963, + "grad_norm": 0.771340012550354, + "learning_rate": 9.88349125507315e-06, + "loss": 0.8293, + "step": 2595 + }, + { + "epoch": 0.1428807309152953, + "grad_norm": 0.8284093737602234, + "learning_rate": 9.88339820734635e-06, + "loss": 0.8539, + "step": 2596 + }, + { + "epoch": 0.14293576971765096, + "grad_norm": 0.9597725868225098, + "learning_rate": 9.883305122917233e-06, + "loss": 0.9054, + "step": 2597 + }, + { + "epoch": 0.14299080852000662, + "grad_norm": 0.7552937269210815, + "learning_rate": 9.883212001786504e-06, + "loss": 0.8047, + "step": 2598 + }, + { + "epoch": 0.14304584732236225, + "grad_norm": 0.8008492588996887, + "learning_rate": 9.883118843954861e-06, + "loss": 0.802, + "step": 2599 + }, + { + "epoch": 0.1431008861247179, + "grad_norm": 0.8169753551483154, + "learning_rate": 9.883025649423003e-06, + "loss": 0.8837, + "step": 2600 + }, + { + "epoch": 0.14315592492707357, + "grad_norm": 0.8521036505699158, + "learning_rate": 9.882932418191632e-06, + "loss": 0.8266, + "step": 2601 + }, + { + "epoch": 0.14321096372942924, + "grad_norm": 0.8647341728210449, + "learning_rate": 9.882839150261449e-06, + "loss": 0.8949, + "step": 2602 + }, + { + "epoch": 0.1432660025317849, + "grad_norm": 0.9236162304878235, + "learning_rate": 9.882745845633153e-06, + "loss": 0.8474, + "step": 2603 + }, + { + "epoch": 0.14332104133414056, + "grad_norm": 0.8422677516937256, + "learning_rate": 9.882652504307445e-06, + "loss": 0.8396, + "step": 2604 + }, + { + "epoch": 0.14337608013649622, + "grad_norm": 0.902036190032959, + "learning_rate": 9.88255912628503e-06, + "loss": 0.8075, + "step": 2605 + }, + { + "epoch": 0.14343111893885188, + "grad_norm": 0.8972339630126953, + "learning_rate": 9.882465711566605e-06, + "loss": 0.8143, + "step": 2606 + }, + { + "epoch": 0.14348615774120754, + "grad_norm": 0.8025243282318115, + "learning_rate": 9.882372260152877e-06, + "loss": 0.771, + "step": 2607 + }, + { + "epoch": 0.1435411965435632, + "grad_norm": 0.8260911107063293, + "learning_rate": 9.882278772044545e-06, + "loss": 0.7679, + "step": 2608 + }, + { + "epoch": 0.14359623534591887, + "grad_norm": 0.8069774508476257, + "learning_rate": 9.882185247242313e-06, + "loss": 0.8489, + "step": 2609 + }, + { + "epoch": 0.14365127414827453, + "grad_norm": 0.8702567219734192, + "learning_rate": 9.882091685746883e-06, + "loss": 0.9258, + "step": 2610 + }, + { + "epoch": 0.1437063129506302, + "grad_norm": 0.8841683268547058, + "learning_rate": 9.881998087558959e-06, + "loss": 0.7858, + "step": 2611 + }, + { + "epoch": 0.14376135175298585, + "grad_norm": 0.7302986979484558, + "learning_rate": 9.881904452679246e-06, + "loss": 0.7339, + "step": 2612 + }, + { + "epoch": 0.1438163905553415, + "grad_norm": 0.7852466106414795, + "learning_rate": 9.881810781108442e-06, + "loss": 0.8397, + "step": 2613 + }, + { + "epoch": 0.14387142935769717, + "grad_norm": 0.7986249327659607, + "learning_rate": 9.881717072847258e-06, + "loss": 0.7573, + "step": 2614 + }, + { + "epoch": 0.14392646816005283, + "grad_norm": 0.750000536441803, + "learning_rate": 9.881623327896395e-06, + "loss": 0.7128, + "step": 2615 + }, + { + "epoch": 0.1439815069624085, + "grad_norm": 0.8796436786651611, + "learning_rate": 9.881529546256557e-06, + "loss": 0.9364, + "step": 2616 + }, + { + "epoch": 0.14403654576476416, + "grad_norm": 0.8621297478675842, + "learning_rate": 9.881435727928449e-06, + "loss": 0.9323, + "step": 2617 + }, + { + "epoch": 0.14409158456711982, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.881341872912777e-06, + "loss": 0.7746, + "step": 2618 + }, + { + "epoch": 0.14414662336947548, + "grad_norm": 0.7761938571929932, + "learning_rate": 9.881247981210247e-06, + "loss": 0.8065, + "step": 2619 + }, + { + "epoch": 0.14420166217183114, + "grad_norm": 0.8333988785743713, + "learning_rate": 9.881154052821564e-06, + "loss": 0.8727, + "step": 2620 + }, + { + "epoch": 0.1442567009741868, + "grad_norm": 0.7263909578323364, + "learning_rate": 9.881060087747433e-06, + "loss": 0.8194, + "step": 2621 + }, + { + "epoch": 0.14431173977654246, + "grad_norm": 0.7472667098045349, + "learning_rate": 9.880966085988562e-06, + "loss": 0.77, + "step": 2622 + }, + { + "epoch": 0.14436677857889813, + "grad_norm": 0.7999943494796753, + "learning_rate": 9.880872047545656e-06, + "loss": 0.7936, + "step": 2623 + }, + { + "epoch": 0.1444218173812538, + "grad_norm": 0.8359610438346863, + "learning_rate": 9.88077797241942e-06, + "loss": 0.7946, + "step": 2624 + }, + { + "epoch": 0.14447685618360945, + "grad_norm": 0.8666403889656067, + "learning_rate": 9.880683860610566e-06, + "loss": 0.8152, + "step": 2625 + }, + { + "epoch": 0.1445318949859651, + "grad_norm": 0.7883741855621338, + "learning_rate": 9.880589712119797e-06, + "loss": 0.7972, + "step": 2626 + }, + { + "epoch": 0.14458693378832077, + "grad_norm": 0.8048827648162842, + "learning_rate": 9.880495526947824e-06, + "loss": 0.8221, + "step": 2627 + }, + { + "epoch": 0.14464197259067643, + "grad_norm": 0.718292236328125, + "learning_rate": 9.88040130509535e-06, + "loss": 0.7648, + "step": 2628 + }, + { + "epoch": 0.1446970113930321, + "grad_norm": 0.7748421430587769, + "learning_rate": 9.880307046563088e-06, + "loss": 0.8146, + "step": 2629 + }, + { + "epoch": 0.14475205019538775, + "grad_norm": 0.8015987873077393, + "learning_rate": 9.880212751351745e-06, + "loss": 0.7935, + "step": 2630 + }, + { + "epoch": 0.14480708899774342, + "grad_norm": 0.7628459930419922, + "learning_rate": 9.88011841946203e-06, + "loss": 0.7469, + "step": 2631 + }, + { + "epoch": 0.14486212780009908, + "grad_norm": 0.7152888774871826, + "learning_rate": 9.88002405089465e-06, + "loss": 0.7721, + "step": 2632 + }, + { + "epoch": 0.14491716660245474, + "grad_norm": 0.8075545430183411, + "learning_rate": 9.879929645650315e-06, + "loss": 0.8799, + "step": 2633 + }, + { + "epoch": 0.1449722054048104, + "grad_norm": 0.7981964945793152, + "learning_rate": 9.879835203729736e-06, + "loss": 0.8265, + "step": 2634 + }, + { + "epoch": 0.14502724420716606, + "grad_norm": 0.7699866890907288, + "learning_rate": 9.879740725133623e-06, + "loss": 0.8489, + "step": 2635 + }, + { + "epoch": 0.14508228300952172, + "grad_norm": 0.7991634011268616, + "learning_rate": 9.879646209862682e-06, + "loss": 0.8754, + "step": 2636 + }, + { + "epoch": 0.14513732181187738, + "grad_norm": 0.8284991383552551, + "learning_rate": 9.879551657917628e-06, + "loss": 0.811, + "step": 2637 + }, + { + "epoch": 0.14519236061423305, + "grad_norm": 0.9189227819442749, + "learning_rate": 9.87945706929917e-06, + "loss": 0.8486, + "step": 2638 + }, + { + "epoch": 0.1452473994165887, + "grad_norm": 0.8599026799201965, + "learning_rate": 9.879362444008018e-06, + "loss": 0.8383, + "step": 2639 + }, + { + "epoch": 0.14530243821894437, + "grad_norm": 0.8764603137969971, + "learning_rate": 9.879267782044885e-06, + "loss": 0.7918, + "step": 2640 + }, + { + "epoch": 0.14535747702130003, + "grad_norm": 0.8061341047286987, + "learning_rate": 9.87917308341048e-06, + "loss": 0.8292, + "step": 2641 + }, + { + "epoch": 0.14541251582365566, + "grad_norm": 1.031220555305481, + "learning_rate": 9.879078348105518e-06, + "loss": 0.6612, + "step": 2642 + }, + { + "epoch": 0.14546755462601133, + "grad_norm": 1.014491319656372, + "learning_rate": 9.878983576130708e-06, + "loss": 0.8512, + "step": 2643 + }, + { + "epoch": 0.145522593428367, + "grad_norm": 0.8365896940231323, + "learning_rate": 9.878888767486764e-06, + "loss": 0.7995, + "step": 2644 + }, + { + "epoch": 0.14557763223072265, + "grad_norm": 0.8086197972297668, + "learning_rate": 9.878793922174397e-06, + "loss": 0.8069, + "step": 2645 + }, + { + "epoch": 0.1456326710330783, + "grad_norm": 0.8075234889984131, + "learning_rate": 9.878699040194322e-06, + "loss": 0.8415, + "step": 2646 + }, + { + "epoch": 0.14568770983543397, + "grad_norm": 0.9413748979568481, + "learning_rate": 9.87860412154725e-06, + "loss": 0.7811, + "step": 2647 + }, + { + "epoch": 0.14574274863778963, + "grad_norm": 0.7744552493095398, + "learning_rate": 9.878509166233895e-06, + "loss": 0.7983, + "step": 2648 + }, + { + "epoch": 0.1457977874401453, + "grad_norm": 0.8184664845466614, + "learning_rate": 9.878414174254974e-06, + "loss": 0.8052, + "step": 2649 + }, + { + "epoch": 0.14585282624250095, + "grad_norm": 0.928814172744751, + "learning_rate": 9.878319145611195e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.14590786504485662, + "grad_norm": 0.9623318314552307, + "learning_rate": 9.878224080303276e-06, + "loss": 0.9025, + "step": 2651 + }, + { + "epoch": 0.14596290384721228, + "grad_norm": 0.866538405418396, + "learning_rate": 9.87812897833193e-06, + "loss": 0.7895, + "step": 2652 + }, + { + "epoch": 0.14601794264956794, + "grad_norm": 0.9248599410057068, + "learning_rate": 9.878033839697874e-06, + "loss": 0.8532, + "step": 2653 + }, + { + "epoch": 0.1460729814519236, + "grad_norm": 0.7866301536560059, + "learning_rate": 9.87793866440182e-06, + "loss": 0.8724, + "step": 2654 + }, + { + "epoch": 0.14612802025427926, + "grad_norm": 0.8471634387969971, + "learning_rate": 9.877843452444485e-06, + "loss": 0.9184, + "step": 2655 + }, + { + "epoch": 0.14618305905663492, + "grad_norm": 0.7367103695869446, + "learning_rate": 9.877748203826585e-06, + "loss": 0.7328, + "step": 2656 + }, + { + "epoch": 0.14623809785899058, + "grad_norm": 0.95980304479599, + "learning_rate": 9.877652918548834e-06, + "loss": 0.9274, + "step": 2657 + }, + { + "epoch": 0.14629313666134625, + "grad_norm": 1.0511064529418945, + "learning_rate": 9.87755759661195e-06, + "loss": 0.8223, + "step": 2658 + }, + { + "epoch": 0.1463481754637019, + "grad_norm": 0.7616510391235352, + "learning_rate": 9.877462238016649e-06, + "loss": 0.7473, + "step": 2659 + }, + { + "epoch": 0.14640321426605757, + "grad_norm": 0.7814056873321533, + "learning_rate": 9.877366842763647e-06, + "loss": 0.8898, + "step": 2660 + }, + { + "epoch": 0.14645825306841323, + "grad_norm": 0.8707298636436462, + "learning_rate": 9.877271410853662e-06, + "loss": 0.8792, + "step": 2661 + }, + { + "epoch": 0.1465132918707689, + "grad_norm": 0.8618701696395874, + "learning_rate": 9.877175942287409e-06, + "loss": 0.8761, + "step": 2662 + }, + { + "epoch": 0.14656833067312455, + "grad_norm": 0.9437732100486755, + "learning_rate": 9.877080437065609e-06, + "loss": 0.7922, + "step": 2663 + }, + { + "epoch": 0.14662336947548021, + "grad_norm": 0.9465780258178711, + "learning_rate": 9.876984895188976e-06, + "loss": 0.8449, + "step": 2664 + }, + { + "epoch": 0.14667840827783588, + "grad_norm": 0.7149911522865295, + "learning_rate": 9.876889316658233e-06, + "loss": 0.6408, + "step": 2665 + }, + { + "epoch": 0.14673344708019154, + "grad_norm": 0.9996811151504517, + "learning_rate": 9.876793701474092e-06, + "loss": 0.9324, + "step": 2666 + }, + { + "epoch": 0.1467884858825472, + "grad_norm": 0.7941329479217529, + "learning_rate": 9.876698049637277e-06, + "loss": 0.8115, + "step": 2667 + }, + { + "epoch": 0.14684352468490286, + "grad_norm": 0.754175066947937, + "learning_rate": 9.876602361148504e-06, + "loss": 0.7709, + "step": 2668 + }, + { + "epoch": 0.14689856348725852, + "grad_norm": 0.7867946624755859, + "learning_rate": 9.876506636008494e-06, + "loss": 0.8578, + "step": 2669 + }, + { + "epoch": 0.14695360228961418, + "grad_norm": 0.7441185116767883, + "learning_rate": 9.876410874217965e-06, + "loss": 0.8491, + "step": 2670 + }, + { + "epoch": 0.14700864109196984, + "grad_norm": 0.8414027690887451, + "learning_rate": 9.876315075777638e-06, + "loss": 0.8404, + "step": 2671 + }, + { + "epoch": 0.1470636798943255, + "grad_norm": 0.7911489009857178, + "learning_rate": 9.876219240688231e-06, + "loss": 0.8606, + "step": 2672 + }, + { + "epoch": 0.14711871869668117, + "grad_norm": 0.8601381778717041, + "learning_rate": 9.876123368950465e-06, + "loss": 0.7753, + "step": 2673 + }, + { + "epoch": 0.14717375749903683, + "grad_norm": 0.8672378659248352, + "learning_rate": 9.876027460565062e-06, + "loss": 0.7763, + "step": 2674 + }, + { + "epoch": 0.1472287963013925, + "grad_norm": 0.7192933559417725, + "learning_rate": 9.875931515532742e-06, + "loss": 0.7681, + "step": 2675 + }, + { + "epoch": 0.14728383510374815, + "grad_norm": 0.7483426332473755, + "learning_rate": 9.875835533854226e-06, + "loss": 0.8129, + "step": 2676 + }, + { + "epoch": 0.1473388739061038, + "grad_norm": 0.8883694410324097, + "learning_rate": 9.875739515530235e-06, + "loss": 0.8912, + "step": 2677 + }, + { + "epoch": 0.14739391270845947, + "grad_norm": 0.8440148234367371, + "learning_rate": 9.87564346056149e-06, + "loss": 0.8411, + "step": 2678 + }, + { + "epoch": 0.14744895151081513, + "grad_norm": 0.8916668891906738, + "learning_rate": 9.875547368948715e-06, + "loss": 0.8484, + "step": 2679 + }, + { + "epoch": 0.1475039903131708, + "grad_norm": 0.805258572101593, + "learning_rate": 9.875451240692631e-06, + "loss": 0.8172, + "step": 2680 + }, + { + "epoch": 0.14755902911552646, + "grad_norm": 0.8322305679321289, + "learning_rate": 9.87535507579396e-06, + "loss": 0.809, + "step": 2681 + }, + { + "epoch": 0.14761406791788212, + "grad_norm": 0.7320597767829895, + "learning_rate": 9.875258874253424e-06, + "loss": 0.7346, + "step": 2682 + }, + { + "epoch": 0.14766910672023778, + "grad_norm": 1.018036127090454, + "learning_rate": 9.875162636071749e-06, + "loss": 0.931, + "step": 2683 + }, + { + "epoch": 0.14772414552259344, + "grad_norm": 0.8601503968238831, + "learning_rate": 9.875066361249657e-06, + "loss": 0.7689, + "step": 2684 + }, + { + "epoch": 0.14777918432494908, + "grad_norm": 0.8478472232818604, + "learning_rate": 9.87497004978787e-06, + "loss": 0.9545, + "step": 2685 + }, + { + "epoch": 0.14783422312730474, + "grad_norm": 0.7510890364646912, + "learning_rate": 9.874873701687115e-06, + "loss": 0.7794, + "step": 2686 + }, + { + "epoch": 0.1478892619296604, + "grad_norm": 0.8226999044418335, + "learning_rate": 9.874777316948112e-06, + "loss": 0.8477, + "step": 2687 + }, + { + "epoch": 0.14794430073201606, + "grad_norm": 0.8284991979598999, + "learning_rate": 9.874680895571588e-06, + "loss": 0.8498, + "step": 2688 + }, + { + "epoch": 0.14799933953437172, + "grad_norm": 0.9007356762886047, + "learning_rate": 9.874584437558267e-06, + "loss": 0.8526, + "step": 2689 + }, + { + "epoch": 0.14805437833672738, + "grad_norm": 0.8770126104354858, + "learning_rate": 9.874487942908877e-06, + "loss": 0.844, + "step": 2690 + }, + { + "epoch": 0.14810941713908304, + "grad_norm": 1.1561466455459595, + "learning_rate": 9.874391411624138e-06, + "loss": 0.976, + "step": 2691 + }, + { + "epoch": 0.1481644559414387, + "grad_norm": 0.8162640929222107, + "learning_rate": 9.874294843704777e-06, + "loss": 0.8581, + "step": 2692 + }, + { + "epoch": 0.14821949474379437, + "grad_norm": 0.8308132290840149, + "learning_rate": 9.874198239151522e-06, + "loss": 0.8303, + "step": 2693 + }, + { + "epoch": 0.14827453354615003, + "grad_norm": 0.771024227142334, + "learning_rate": 9.874101597965098e-06, + "loss": 0.8351, + "step": 2694 + }, + { + "epoch": 0.1483295723485057, + "grad_norm": 0.7588162422180176, + "learning_rate": 9.874004920146232e-06, + "loss": 0.7858, + "step": 2695 + }, + { + "epoch": 0.14838461115086135, + "grad_norm": 0.8282446265220642, + "learning_rate": 9.873908205695648e-06, + "loss": 0.8465, + "step": 2696 + }, + { + "epoch": 0.148439649953217, + "grad_norm": 0.8342786431312561, + "learning_rate": 9.873811454614076e-06, + "loss": 0.8688, + "step": 2697 + }, + { + "epoch": 0.14849468875557267, + "grad_norm": 0.7957108020782471, + "learning_rate": 9.87371466690224e-06, + "loss": 0.8381, + "step": 2698 + }, + { + "epoch": 0.14854972755792833, + "grad_norm": 0.8763726353645325, + "learning_rate": 9.87361784256087e-06, + "loss": 0.8922, + "step": 2699 + }, + { + "epoch": 0.148604766360284, + "grad_norm": 0.7760055661201477, + "learning_rate": 9.873520981590693e-06, + "loss": 0.8384, + "step": 2700 + }, + { + "epoch": 0.14865980516263966, + "grad_norm": 0.9691097736358643, + "learning_rate": 9.873424083992436e-06, + "loss": 0.8581, + "step": 2701 + }, + { + "epoch": 0.14871484396499532, + "grad_norm": 0.9072558879852295, + "learning_rate": 9.87332714976683e-06, + "loss": 0.8942, + "step": 2702 + }, + { + "epoch": 0.14876988276735098, + "grad_norm": 0.8961714506149292, + "learning_rate": 9.8732301789146e-06, + "loss": 0.8062, + "step": 2703 + }, + { + "epoch": 0.14882492156970664, + "grad_norm": 1.4835050106048584, + "learning_rate": 9.873133171436477e-06, + "loss": 0.886, + "step": 2704 + }, + { + "epoch": 0.1488799603720623, + "grad_norm": 0.8153702616691589, + "learning_rate": 9.87303612733319e-06, + "loss": 0.8369, + "step": 2705 + }, + { + "epoch": 0.14893499917441796, + "grad_norm": 0.8755800724029541, + "learning_rate": 9.872939046605467e-06, + "loss": 0.7591, + "step": 2706 + }, + { + "epoch": 0.14899003797677363, + "grad_norm": 0.8173243403434753, + "learning_rate": 9.872841929254038e-06, + "loss": 0.8626, + "step": 2707 + }, + { + "epoch": 0.1490450767791293, + "grad_norm": 0.7891639471054077, + "learning_rate": 9.872744775279634e-06, + "loss": 0.737, + "step": 2708 + }, + { + "epoch": 0.14910011558148495, + "grad_norm": 1.0270631313323975, + "learning_rate": 9.872647584682985e-06, + "loss": 0.9202, + "step": 2709 + }, + { + "epoch": 0.1491551543838406, + "grad_norm": 0.7736123204231262, + "learning_rate": 9.872550357464822e-06, + "loss": 0.7835, + "step": 2710 + }, + { + "epoch": 0.14921019318619627, + "grad_norm": 0.7791550159454346, + "learning_rate": 9.872453093625873e-06, + "loss": 0.8375, + "step": 2711 + }, + { + "epoch": 0.14926523198855193, + "grad_norm": 0.8410583734512329, + "learning_rate": 9.872355793166872e-06, + "loss": 0.877, + "step": 2712 + }, + { + "epoch": 0.1493202707909076, + "grad_norm": 0.8277738094329834, + "learning_rate": 9.87225845608855e-06, + "loss": 0.7255, + "step": 2713 + }, + { + "epoch": 0.14937530959326326, + "grad_norm": 0.8617290258407593, + "learning_rate": 9.872161082391635e-06, + "loss": 0.7885, + "step": 2714 + }, + { + "epoch": 0.14943034839561892, + "grad_norm": 0.8866406679153442, + "learning_rate": 9.872063672076864e-06, + "loss": 0.8621, + "step": 2715 + }, + { + "epoch": 0.14948538719797458, + "grad_norm": 0.7581049799919128, + "learning_rate": 9.871966225144964e-06, + "loss": 0.8177, + "step": 2716 + }, + { + "epoch": 0.14954042600033024, + "grad_norm": 0.833696722984314, + "learning_rate": 9.871868741596673e-06, + "loss": 0.8382, + "step": 2717 + }, + { + "epoch": 0.1495954648026859, + "grad_norm": 1.0857365131378174, + "learning_rate": 9.871771221432718e-06, + "loss": 0.9254, + "step": 2718 + }, + { + "epoch": 0.14965050360504156, + "grad_norm": 0.7622446417808533, + "learning_rate": 9.871673664653837e-06, + "loss": 0.832, + "step": 2719 + }, + { + "epoch": 0.14970554240739722, + "grad_norm": 0.7436832785606384, + "learning_rate": 9.871576071260758e-06, + "loss": 0.7642, + "step": 2720 + }, + { + "epoch": 0.14976058120975289, + "grad_norm": 0.8547641634941101, + "learning_rate": 9.87147844125422e-06, + "loss": 0.7584, + "step": 2721 + }, + { + "epoch": 0.14981562001210855, + "grad_norm": 0.7634096145629883, + "learning_rate": 9.871380774634953e-06, + "loss": 0.8332, + "step": 2722 + }, + { + "epoch": 0.1498706588144642, + "grad_norm": 0.7949081063270569, + "learning_rate": 9.871283071403692e-06, + "loss": 0.7812, + "step": 2723 + }, + { + "epoch": 0.14992569761681987, + "grad_norm": 0.8089914321899414, + "learning_rate": 9.871185331561171e-06, + "loss": 0.8503, + "step": 2724 + }, + { + "epoch": 0.14998073641917553, + "grad_norm": 0.8451627492904663, + "learning_rate": 9.871087555108125e-06, + "loss": 0.9101, + "step": 2725 + }, + { + "epoch": 0.1500357752215312, + "grad_norm": 0.8399865627288818, + "learning_rate": 9.87098974204529e-06, + "loss": 0.8222, + "step": 2726 + }, + { + "epoch": 0.15009081402388685, + "grad_norm": 0.7786773443222046, + "learning_rate": 9.870891892373397e-06, + "loss": 0.8069, + "step": 2727 + }, + { + "epoch": 0.1501458528262425, + "grad_norm": 0.8530564308166504, + "learning_rate": 9.870794006093188e-06, + "loss": 0.9229, + "step": 2728 + }, + { + "epoch": 0.15020089162859815, + "grad_norm": 0.7640067934989929, + "learning_rate": 9.870696083205394e-06, + "loss": 0.761, + "step": 2729 + }, + { + "epoch": 0.1502559304309538, + "grad_norm": 0.8953121900558472, + "learning_rate": 9.87059812371075e-06, + "loss": 0.8537, + "step": 2730 + }, + { + "epoch": 0.15031096923330947, + "grad_norm": 0.7779926657676697, + "learning_rate": 9.870500127609996e-06, + "loss": 0.8184, + "step": 2731 + }, + { + "epoch": 0.15036600803566513, + "grad_norm": 0.9181544184684753, + "learning_rate": 9.870402094903865e-06, + "loss": 0.8583, + "step": 2732 + }, + { + "epoch": 0.1504210468380208, + "grad_norm": 0.7629374861717224, + "learning_rate": 9.870304025593097e-06, + "loss": 0.6741, + "step": 2733 + }, + { + "epoch": 0.15047608564037646, + "grad_norm": 1.1455601453781128, + "learning_rate": 9.87020591967843e-06, + "loss": 0.8602, + "step": 2734 + }, + { + "epoch": 0.15053112444273212, + "grad_norm": 0.83924800157547, + "learning_rate": 9.870107777160596e-06, + "loss": 0.8847, + "step": 2735 + }, + { + "epoch": 0.15058616324508778, + "grad_norm": 0.9293402433395386, + "learning_rate": 9.870009598040336e-06, + "loss": 0.9008, + "step": 2736 + }, + { + "epoch": 0.15064120204744344, + "grad_norm": 0.8198057413101196, + "learning_rate": 9.869911382318389e-06, + "loss": 0.8004, + "step": 2737 + }, + { + "epoch": 0.1506962408497991, + "grad_norm": 0.8139753341674805, + "learning_rate": 9.86981312999549e-06, + "loss": 0.7316, + "step": 2738 + }, + { + "epoch": 0.15075127965215476, + "grad_norm": 0.854184091091156, + "learning_rate": 9.86971484107238e-06, + "loss": 0.9424, + "step": 2739 + }, + { + "epoch": 0.15080631845451042, + "grad_norm": 0.8626797199249268, + "learning_rate": 9.869616515549797e-06, + "loss": 0.8882, + "step": 2740 + }, + { + "epoch": 0.15086135725686609, + "grad_norm": 0.8447514176368713, + "learning_rate": 9.869518153428479e-06, + "loss": 0.7762, + "step": 2741 + }, + { + "epoch": 0.15091639605922175, + "grad_norm": 1.1359349489212036, + "learning_rate": 9.869419754709166e-06, + "loss": 0.9233, + "step": 2742 + }, + { + "epoch": 0.1509714348615774, + "grad_norm": 0.8095758557319641, + "learning_rate": 9.869321319392597e-06, + "loss": 0.8833, + "step": 2743 + }, + { + "epoch": 0.15102647366393307, + "grad_norm": 0.8364169001579285, + "learning_rate": 9.869222847479514e-06, + "loss": 0.833, + "step": 2744 + }, + { + "epoch": 0.15108151246628873, + "grad_norm": 0.7664803266525269, + "learning_rate": 9.869124338970653e-06, + "loss": 0.8125, + "step": 2745 + }, + { + "epoch": 0.1511365512686444, + "grad_norm": 0.8129634857177734, + "learning_rate": 9.86902579386676e-06, + "loss": 0.8277, + "step": 2746 + }, + { + "epoch": 0.15119159007100005, + "grad_norm": 0.8195592164993286, + "learning_rate": 9.86892721216857e-06, + "loss": 0.8489, + "step": 2747 + }, + { + "epoch": 0.15124662887335572, + "grad_norm": 0.8116651177406311, + "learning_rate": 9.868828593876827e-06, + "loss": 0.7831, + "step": 2748 + }, + { + "epoch": 0.15130166767571138, + "grad_norm": 0.8200114369392395, + "learning_rate": 9.868729938992272e-06, + "loss": 0.8956, + "step": 2749 + }, + { + "epoch": 0.15135670647806704, + "grad_norm": 0.8521816730499268, + "learning_rate": 9.868631247515645e-06, + "loss": 0.804, + "step": 2750 + }, + { + "epoch": 0.1514117452804227, + "grad_norm": 1.0386497974395752, + "learning_rate": 9.868532519447691e-06, + "loss": 0.8563, + "step": 2751 + }, + { + "epoch": 0.15146678408277836, + "grad_norm": 0.8345486521720886, + "learning_rate": 9.868433754789149e-06, + "loss": 0.9838, + "step": 2752 + }, + { + "epoch": 0.15152182288513402, + "grad_norm": 0.7207526564598083, + "learning_rate": 9.868334953540762e-06, + "loss": 0.6711, + "step": 2753 + }, + { + "epoch": 0.15157686168748968, + "grad_norm": 0.8159164786338806, + "learning_rate": 9.86823611570327e-06, + "loss": 0.7591, + "step": 2754 + }, + { + "epoch": 0.15163190048984534, + "grad_norm": 0.9062225818634033, + "learning_rate": 9.868137241277422e-06, + "loss": 0.8009, + "step": 2755 + }, + { + "epoch": 0.151686939292201, + "grad_norm": 0.8136696219444275, + "learning_rate": 9.868038330263957e-06, + "loss": 0.7014, + "step": 2756 + }, + { + "epoch": 0.15174197809455667, + "grad_norm": 0.7237691283226013, + "learning_rate": 9.867939382663618e-06, + "loss": 0.7766, + "step": 2757 + }, + { + "epoch": 0.15179701689691233, + "grad_norm": 0.8913742303848267, + "learning_rate": 9.86784039847715e-06, + "loss": 0.9362, + "step": 2758 + }, + { + "epoch": 0.151852055699268, + "grad_norm": 0.7763763070106506, + "learning_rate": 9.867741377705296e-06, + "loss": 0.7843, + "step": 2759 + }, + { + "epoch": 0.15190709450162365, + "grad_norm": 0.8973854780197144, + "learning_rate": 9.867642320348803e-06, + "loss": 0.911, + "step": 2760 + }, + { + "epoch": 0.1519621333039793, + "grad_norm": 0.7979685664176941, + "learning_rate": 9.86754322640841e-06, + "loss": 0.81, + "step": 2761 + }, + { + "epoch": 0.15201717210633497, + "grad_norm": 0.7740911841392517, + "learning_rate": 9.867444095884867e-06, + "loss": 0.8197, + "step": 2762 + }, + { + "epoch": 0.15207221090869064, + "grad_norm": 0.8400475978851318, + "learning_rate": 9.867344928778916e-06, + "loss": 0.8809, + "step": 2763 + }, + { + "epoch": 0.1521272497110463, + "grad_norm": 0.8995040655136108, + "learning_rate": 9.867245725091305e-06, + "loss": 0.8382, + "step": 2764 + }, + { + "epoch": 0.15218228851340196, + "grad_norm": 0.8162381052970886, + "learning_rate": 9.867146484822779e-06, + "loss": 0.9238, + "step": 2765 + }, + { + "epoch": 0.15223732731575762, + "grad_norm": 0.7668827176094055, + "learning_rate": 9.867047207974079e-06, + "loss": 0.8345, + "step": 2766 + }, + { + "epoch": 0.15229236611811328, + "grad_norm": 0.8719204664230347, + "learning_rate": 9.866947894545957e-06, + "loss": 0.7899, + "step": 2767 + }, + { + "epoch": 0.15234740492046894, + "grad_norm": 0.9043570756912231, + "learning_rate": 9.866848544539159e-06, + "loss": 0.8783, + "step": 2768 + }, + { + "epoch": 0.1524024437228246, + "grad_norm": 0.8859694004058838, + "learning_rate": 9.866749157954428e-06, + "loss": 0.862, + "step": 2769 + }, + { + "epoch": 0.15245748252518027, + "grad_norm": 1.022719144821167, + "learning_rate": 9.866649734792514e-06, + "loss": 0.8943, + "step": 2770 + }, + { + "epoch": 0.1525125213275359, + "grad_norm": 0.8710635900497437, + "learning_rate": 9.866550275054163e-06, + "loss": 0.7002, + "step": 2771 + }, + { + "epoch": 0.15256756012989156, + "grad_norm": 0.8482942581176758, + "learning_rate": 9.866450778740122e-06, + "loss": 0.7529, + "step": 2772 + }, + { + "epoch": 0.15262259893224722, + "grad_norm": 0.9637784361839294, + "learning_rate": 9.866351245851142e-06, + "loss": 0.8147, + "step": 2773 + }, + { + "epoch": 0.15267763773460288, + "grad_norm": 1.0472246408462524, + "learning_rate": 9.866251676387967e-06, + "loss": 0.8019, + "step": 2774 + }, + { + "epoch": 0.15273267653695854, + "grad_norm": 0.7916847467422485, + "learning_rate": 9.866152070351347e-06, + "loss": 0.7698, + "step": 2775 + }, + { + "epoch": 0.1527877153393142, + "grad_norm": 0.8421853184700012, + "learning_rate": 9.86605242774203e-06, + "loss": 0.8085, + "step": 2776 + }, + { + "epoch": 0.15284275414166987, + "grad_norm": 0.7990233898162842, + "learning_rate": 9.865952748560768e-06, + "loss": 0.8878, + "step": 2777 + }, + { + "epoch": 0.15289779294402553, + "grad_norm": 0.8017451167106628, + "learning_rate": 9.865853032808305e-06, + "loss": 0.8707, + "step": 2778 + }, + { + "epoch": 0.1529528317463812, + "grad_norm": 0.739850640296936, + "learning_rate": 9.865753280485393e-06, + "loss": 0.7884, + "step": 2779 + }, + { + "epoch": 0.15300787054873685, + "grad_norm": 1.0682430267333984, + "learning_rate": 9.865653491592784e-06, + "loss": 0.8548, + "step": 2780 + }, + { + "epoch": 0.1530629093510925, + "grad_norm": 0.7766296863555908, + "learning_rate": 9.865553666131225e-06, + "loss": 0.7786, + "step": 2781 + }, + { + "epoch": 0.15311794815344817, + "grad_norm": 0.8903290629386902, + "learning_rate": 9.865453804101466e-06, + "loss": 0.8978, + "step": 2782 + }, + { + "epoch": 0.15317298695580384, + "grad_norm": 0.8624514937400818, + "learning_rate": 9.86535390550426e-06, + "loss": 0.8472, + "step": 2783 + }, + { + "epoch": 0.1532280257581595, + "grad_norm": 0.7765294909477234, + "learning_rate": 9.865253970340356e-06, + "loss": 0.7702, + "step": 2784 + }, + { + "epoch": 0.15328306456051516, + "grad_norm": 0.9349095225334167, + "learning_rate": 9.865153998610504e-06, + "loss": 0.9154, + "step": 2785 + }, + { + "epoch": 0.15333810336287082, + "grad_norm": 0.8435478210449219, + "learning_rate": 9.865053990315458e-06, + "loss": 0.8986, + "step": 2786 + }, + { + "epoch": 0.15339314216522648, + "grad_norm": 0.8003486394882202, + "learning_rate": 9.864953945455968e-06, + "loss": 0.767, + "step": 2787 + }, + { + "epoch": 0.15344818096758214, + "grad_norm": 0.8060823678970337, + "learning_rate": 9.86485386403279e-06, + "loss": 0.8332, + "step": 2788 + }, + { + "epoch": 0.1535032197699378, + "grad_norm": 0.7914995551109314, + "learning_rate": 9.864753746046668e-06, + "loss": 0.6706, + "step": 2789 + }, + { + "epoch": 0.15355825857229347, + "grad_norm": 0.7792215943336487, + "learning_rate": 9.86465359149836e-06, + "loss": 0.8721, + "step": 2790 + }, + { + "epoch": 0.15361329737464913, + "grad_norm": 0.8572536110877991, + "learning_rate": 9.864553400388619e-06, + "loss": 0.8378, + "step": 2791 + }, + { + "epoch": 0.1536683361770048, + "grad_norm": 0.7645615339279175, + "learning_rate": 9.864453172718195e-06, + "loss": 0.6909, + "step": 2792 + }, + { + "epoch": 0.15372337497936045, + "grad_norm": 0.7627308964729309, + "learning_rate": 9.864352908487846e-06, + "loss": 0.7918, + "step": 2793 + }, + { + "epoch": 0.1537784137817161, + "grad_norm": 1.0830100774765015, + "learning_rate": 9.86425260769832e-06, + "loss": 0.9007, + "step": 2794 + }, + { + "epoch": 0.15383345258407177, + "grad_norm": 0.7667998671531677, + "learning_rate": 9.864152270350374e-06, + "loss": 0.832, + "step": 2795 + }, + { + "epoch": 0.15388849138642743, + "grad_norm": 0.9967591762542725, + "learning_rate": 9.864051896444764e-06, + "loss": 0.8917, + "step": 2796 + }, + { + "epoch": 0.1539435301887831, + "grad_norm": 0.8948462605476379, + "learning_rate": 9.86395148598224e-06, + "loss": 0.983, + "step": 2797 + }, + { + "epoch": 0.15399856899113876, + "grad_norm": 0.7857423424720764, + "learning_rate": 9.863851038963556e-06, + "loss": 0.7826, + "step": 2798 + }, + { + "epoch": 0.15405360779349442, + "grad_norm": 0.8821337223052979, + "learning_rate": 9.863750555389473e-06, + "loss": 0.8918, + "step": 2799 + }, + { + "epoch": 0.15410864659585008, + "grad_norm": 0.7896875143051147, + "learning_rate": 9.863650035260742e-06, + "loss": 0.8199, + "step": 2800 + }, + { + "epoch": 0.15416368539820574, + "grad_norm": 0.8046941161155701, + "learning_rate": 9.86354947857812e-06, + "loss": 0.8572, + "step": 2801 + }, + { + "epoch": 0.1542187242005614, + "grad_norm": 0.7266830205917358, + "learning_rate": 9.863448885342361e-06, + "loss": 0.8315, + "step": 2802 + }, + { + "epoch": 0.15427376300291706, + "grad_norm": 0.9009475708007812, + "learning_rate": 9.863348255554222e-06, + "loss": 0.7928, + "step": 2803 + }, + { + "epoch": 0.15432880180527273, + "grad_norm": 0.963364839553833, + "learning_rate": 9.863247589214459e-06, + "loss": 0.918, + "step": 2804 + }, + { + "epoch": 0.1543838406076284, + "grad_norm": 0.8278035521507263, + "learning_rate": 9.863146886323829e-06, + "loss": 0.8497, + "step": 2805 + }, + { + "epoch": 0.15443887940998405, + "grad_norm": 0.7360561490058899, + "learning_rate": 9.86304614688309e-06, + "loss": 0.676, + "step": 2806 + }, + { + "epoch": 0.1544939182123397, + "grad_norm": 0.7679837346076965, + "learning_rate": 9.862945370892996e-06, + "loss": 0.8114, + "step": 2807 + }, + { + "epoch": 0.15454895701469537, + "grad_norm": 0.8550567030906677, + "learning_rate": 9.862844558354309e-06, + "loss": 0.8222, + "step": 2808 + }, + { + "epoch": 0.15460399581705103, + "grad_norm": 0.7852397561073303, + "learning_rate": 9.86274370926778e-06, + "loss": 0.7449, + "step": 2809 + }, + { + "epoch": 0.1546590346194067, + "grad_norm": 0.9120833277702332, + "learning_rate": 9.862642823634175e-06, + "loss": 0.8702, + "step": 2810 + }, + { + "epoch": 0.15471407342176235, + "grad_norm": 0.8729703426361084, + "learning_rate": 9.862541901454246e-06, + "loss": 0.8064, + "step": 2811 + }, + { + "epoch": 0.15476911222411802, + "grad_norm": 0.7935470342636108, + "learning_rate": 9.862440942728754e-06, + "loss": 0.8502, + "step": 2812 + }, + { + "epoch": 0.15482415102647368, + "grad_norm": 0.8640689849853516, + "learning_rate": 9.86233994745846e-06, + "loss": 0.8159, + "step": 2813 + }, + { + "epoch": 0.1548791898288293, + "grad_norm": 0.9959222078323364, + "learning_rate": 9.862238915644116e-06, + "loss": 0.7767, + "step": 2814 + }, + { + "epoch": 0.15493422863118497, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.862137847286487e-06, + "loss": 0.8293, + "step": 2815 + }, + { + "epoch": 0.15498926743354063, + "grad_norm": 0.8764606714248657, + "learning_rate": 9.862036742386335e-06, + "loss": 0.856, + "step": 2816 + }, + { + "epoch": 0.1550443062358963, + "grad_norm": 0.743727445602417, + "learning_rate": 9.861935600944413e-06, + "loss": 0.7099, + "step": 2817 + }, + { + "epoch": 0.15509934503825196, + "grad_norm": 0.7866224050521851, + "learning_rate": 9.861834422961485e-06, + "loss": 0.8805, + "step": 2818 + }, + { + "epoch": 0.15515438384060762, + "grad_norm": 0.8333723545074463, + "learning_rate": 9.861733208438311e-06, + "loss": 0.8486, + "step": 2819 + }, + { + "epoch": 0.15520942264296328, + "grad_norm": 0.8261659741401672, + "learning_rate": 9.861631957375652e-06, + "loss": 0.8896, + "step": 2820 + }, + { + "epoch": 0.15526446144531894, + "grad_norm": 0.8381538987159729, + "learning_rate": 9.861530669774268e-06, + "loss": 0.8686, + "step": 2821 + }, + { + "epoch": 0.1553195002476746, + "grad_norm": 0.9184440970420837, + "learning_rate": 9.861429345634923e-06, + "loss": 0.9702, + "step": 2822 + }, + { + "epoch": 0.15537453905003026, + "grad_norm": 0.8170294165611267, + "learning_rate": 9.861327984958374e-06, + "loss": 0.8298, + "step": 2823 + }, + { + "epoch": 0.15542957785238593, + "grad_norm": 0.8361968398094177, + "learning_rate": 9.861226587745385e-06, + "loss": 0.8232, + "step": 2824 + }, + { + "epoch": 0.1554846166547416, + "grad_norm": 0.7437820434570312, + "learning_rate": 9.861125153996718e-06, + "loss": 0.8271, + "step": 2825 + }, + { + "epoch": 0.15553965545709725, + "grad_norm": 0.715887188911438, + "learning_rate": 9.861023683713137e-06, + "loss": 0.7726, + "step": 2826 + }, + { + "epoch": 0.1555946942594529, + "grad_norm": 0.8358462452888489, + "learning_rate": 9.860922176895403e-06, + "loss": 0.8247, + "step": 2827 + }, + { + "epoch": 0.15564973306180857, + "grad_norm": 0.8620158433914185, + "learning_rate": 9.860820633544278e-06, + "loss": 0.8804, + "step": 2828 + }, + { + "epoch": 0.15570477186416423, + "grad_norm": 0.9035346508026123, + "learning_rate": 9.860719053660527e-06, + "loss": 0.7973, + "step": 2829 + }, + { + "epoch": 0.1557598106665199, + "grad_norm": 0.8014782071113586, + "learning_rate": 9.860617437244914e-06, + "loss": 0.7914, + "step": 2830 + }, + { + "epoch": 0.15581484946887555, + "grad_norm": 0.7788864970207214, + "learning_rate": 9.8605157842982e-06, + "loss": 0.7377, + "step": 2831 + }, + { + "epoch": 0.15586988827123122, + "grad_norm": 0.7475222945213318, + "learning_rate": 9.860414094821152e-06, + "loss": 0.7173, + "step": 2832 + }, + { + "epoch": 0.15592492707358688, + "grad_norm": 0.8866652846336365, + "learning_rate": 9.86031236881453e-06, + "loss": 0.8231, + "step": 2833 + }, + { + "epoch": 0.15597996587594254, + "grad_norm": 0.8725677728652954, + "learning_rate": 9.860210606279102e-06, + "loss": 0.9025, + "step": 2834 + }, + { + "epoch": 0.1560350046782982, + "grad_norm": 0.7608423233032227, + "learning_rate": 9.860108807215634e-06, + "loss": 0.8385, + "step": 2835 + }, + { + "epoch": 0.15609004348065386, + "grad_norm": 0.8237566351890564, + "learning_rate": 9.860006971624887e-06, + "loss": 0.8635, + "step": 2836 + }, + { + "epoch": 0.15614508228300952, + "grad_norm": 0.8078347444534302, + "learning_rate": 9.859905099507629e-06, + "loss": 0.7916, + "step": 2837 + }, + { + "epoch": 0.15620012108536518, + "grad_norm": 0.8282070755958557, + "learning_rate": 9.859803190864626e-06, + "loss": 0.8141, + "step": 2838 + }, + { + "epoch": 0.15625515988772085, + "grad_norm": 0.7639191150665283, + "learning_rate": 9.859701245696642e-06, + "loss": 0.7457, + "step": 2839 + }, + { + "epoch": 0.1563101986900765, + "grad_norm": 0.8429144620895386, + "learning_rate": 9.859599264004446e-06, + "loss": 0.9176, + "step": 2840 + }, + { + "epoch": 0.15636523749243217, + "grad_norm": 0.7792791724205017, + "learning_rate": 9.859497245788801e-06, + "loss": 0.8738, + "step": 2841 + }, + { + "epoch": 0.15642027629478783, + "grad_norm": 0.9018417596817017, + "learning_rate": 9.859395191050476e-06, + "loss": 0.841, + "step": 2842 + }, + { + "epoch": 0.1564753150971435, + "grad_norm": 0.7113705277442932, + "learning_rate": 9.859293099790239e-06, + "loss": 0.6576, + "step": 2843 + }, + { + "epoch": 0.15653035389949915, + "grad_norm": 0.8376311659812927, + "learning_rate": 9.859190972008853e-06, + "loss": 0.8559, + "step": 2844 + }, + { + "epoch": 0.15658539270185481, + "grad_norm": 0.7689141035079956, + "learning_rate": 9.859088807707092e-06, + "loss": 0.7844, + "step": 2845 + }, + { + "epoch": 0.15664043150421048, + "grad_norm": 0.7559483647346497, + "learning_rate": 9.858986606885717e-06, + "loss": 0.8676, + "step": 2846 + }, + { + "epoch": 0.15669547030656614, + "grad_norm": 0.7743827700614929, + "learning_rate": 9.8588843695455e-06, + "loss": 0.7995, + "step": 2847 + }, + { + "epoch": 0.1567505091089218, + "grad_norm": 0.8631327152252197, + "learning_rate": 9.85878209568721e-06, + "loss": 0.801, + "step": 2848 + }, + { + "epoch": 0.15680554791127746, + "grad_norm": 0.7454009056091309, + "learning_rate": 9.858679785311613e-06, + "loss": 0.8172, + "step": 2849 + }, + { + "epoch": 0.15686058671363312, + "grad_norm": 0.7915313839912415, + "learning_rate": 9.858577438419479e-06, + "loss": 0.833, + "step": 2850 + }, + { + "epoch": 0.15691562551598878, + "grad_norm": 0.8472526669502258, + "learning_rate": 9.858475055011578e-06, + "loss": 0.8249, + "step": 2851 + }, + { + "epoch": 0.15697066431834444, + "grad_norm": 0.7967580556869507, + "learning_rate": 9.85837263508868e-06, + "loss": 0.7533, + "step": 2852 + }, + { + "epoch": 0.1570257031207001, + "grad_norm": 0.7476257085800171, + "learning_rate": 9.858270178651554e-06, + "loss": 0.7918, + "step": 2853 + }, + { + "epoch": 0.15708074192305577, + "grad_norm": 0.8736184239387512, + "learning_rate": 9.858167685700968e-06, + "loss": 0.8254, + "step": 2854 + }, + { + "epoch": 0.15713578072541143, + "grad_norm": 0.8734819889068604, + "learning_rate": 9.858065156237694e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.1571908195277671, + "grad_norm": 1.0344874858856201, + "learning_rate": 9.857962590262506e-06, + "loss": 0.9578, + "step": 2856 + }, + { + "epoch": 0.15724585833012272, + "grad_norm": 0.81183922290802, + "learning_rate": 9.857859987776168e-06, + "loss": 0.8845, + "step": 2857 + }, + { + "epoch": 0.15730089713247838, + "grad_norm": 0.8252540230751038, + "learning_rate": 9.857757348779456e-06, + "loss": 0.7862, + "step": 2858 + }, + { + "epoch": 0.15735593593483405, + "grad_norm": 0.7468119859695435, + "learning_rate": 9.85765467327314e-06, + "loss": 0.7587, + "step": 2859 + }, + { + "epoch": 0.1574109747371897, + "grad_norm": 0.8095998167991638, + "learning_rate": 9.857551961257993e-06, + "loss": 0.7467, + "step": 2860 + }, + { + "epoch": 0.15746601353954537, + "grad_norm": 0.8908564448356628, + "learning_rate": 9.857449212734785e-06, + "loss": 0.8199, + "step": 2861 + }, + { + "epoch": 0.15752105234190103, + "grad_norm": 0.7605593204498291, + "learning_rate": 9.857346427704288e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 0.1575760911442567, + "grad_norm": 0.9250784516334534, + "learning_rate": 9.857243606167276e-06, + "loss": 0.7366, + "step": 2863 + }, + { + "epoch": 0.15763112994661235, + "grad_norm": 0.882796585559845, + "learning_rate": 9.85714074812452e-06, + "loss": 0.8422, + "step": 2864 + }, + { + "epoch": 0.15768616874896801, + "grad_norm": 1.0014574527740479, + "learning_rate": 9.857037853576797e-06, + "loss": 0.8762, + "step": 2865 + }, + { + "epoch": 0.15774120755132368, + "grad_norm": 0.86713045835495, + "learning_rate": 9.856934922524877e-06, + "loss": 0.9282, + "step": 2866 + }, + { + "epoch": 0.15779624635367934, + "grad_norm": 1.1457390785217285, + "learning_rate": 9.856831954969532e-06, + "loss": 0.7947, + "step": 2867 + }, + { + "epoch": 0.157851285156035, + "grad_norm": 0.8902556896209717, + "learning_rate": 9.85672895091154e-06, + "loss": 0.928, + "step": 2868 + }, + { + "epoch": 0.15790632395839066, + "grad_norm": 0.7978467345237732, + "learning_rate": 9.856625910351674e-06, + "loss": 0.7382, + "step": 2869 + }, + { + "epoch": 0.15796136276074632, + "grad_norm": 0.741457462310791, + "learning_rate": 9.856522833290705e-06, + "loss": 0.7736, + "step": 2870 + }, + { + "epoch": 0.15801640156310198, + "grad_norm": 0.8330628871917725, + "learning_rate": 9.856419719729413e-06, + "loss": 0.8396, + "step": 2871 + }, + { + "epoch": 0.15807144036545764, + "grad_norm": 0.8771876692771912, + "learning_rate": 9.85631656966857e-06, + "loss": 0.6669, + "step": 2872 + }, + { + "epoch": 0.1581264791678133, + "grad_norm": 0.8073394298553467, + "learning_rate": 9.85621338310895e-06, + "loss": 0.8206, + "step": 2873 + }, + { + "epoch": 0.15818151797016897, + "grad_norm": 1.1058349609375, + "learning_rate": 9.85611016005133e-06, + "loss": 0.9526, + "step": 2874 + }, + { + "epoch": 0.15823655677252463, + "grad_norm": 0.7734992504119873, + "learning_rate": 9.856006900496488e-06, + "loss": 0.7477, + "step": 2875 + }, + { + "epoch": 0.1582915955748803, + "grad_norm": 0.9053219556808472, + "learning_rate": 9.855903604445196e-06, + "loss": 0.8009, + "step": 2876 + }, + { + "epoch": 0.15834663437723595, + "grad_norm": 0.8774041533470154, + "learning_rate": 9.855800271898233e-06, + "loss": 0.854, + "step": 2877 + }, + { + "epoch": 0.1584016731795916, + "grad_norm": 0.8346550464630127, + "learning_rate": 9.855696902856376e-06, + "loss": 0.7976, + "step": 2878 + }, + { + "epoch": 0.15845671198194727, + "grad_norm": 0.7781139016151428, + "learning_rate": 9.855593497320401e-06, + "loss": 0.7693, + "step": 2879 + }, + { + "epoch": 0.15851175078430293, + "grad_norm": 0.8707864880561829, + "learning_rate": 9.855490055291084e-06, + "loss": 0.882, + "step": 2880 + }, + { + "epoch": 0.1585667895866586, + "grad_norm": 0.7982275485992432, + "learning_rate": 9.855386576769203e-06, + "loss": 0.8457, + "step": 2881 + }, + { + "epoch": 0.15862182838901426, + "grad_norm": 0.7577090263366699, + "learning_rate": 9.855283061755536e-06, + "loss": 0.754, + "step": 2882 + }, + { + "epoch": 0.15867686719136992, + "grad_norm": 0.7422069311141968, + "learning_rate": 9.855179510250863e-06, + "loss": 0.673, + "step": 2883 + }, + { + "epoch": 0.15873190599372558, + "grad_norm": 0.7730041742324829, + "learning_rate": 9.85507592225596e-06, + "loss": 0.7888, + "step": 2884 + }, + { + "epoch": 0.15878694479608124, + "grad_norm": 0.7370560169219971, + "learning_rate": 9.854972297771605e-06, + "loss": 0.7762, + "step": 2885 + }, + { + "epoch": 0.1588419835984369, + "grad_norm": 0.725074291229248, + "learning_rate": 9.854868636798577e-06, + "loss": 0.7951, + "step": 2886 + }, + { + "epoch": 0.15889702240079256, + "grad_norm": 0.8088375926017761, + "learning_rate": 9.854764939337657e-06, + "loss": 0.8557, + "step": 2887 + }, + { + "epoch": 0.15895206120314823, + "grad_norm": 0.8268256187438965, + "learning_rate": 9.854661205389624e-06, + "loss": 0.7641, + "step": 2888 + }, + { + "epoch": 0.1590071000055039, + "grad_norm": 0.8079462051391602, + "learning_rate": 9.854557434955257e-06, + "loss": 0.7947, + "step": 2889 + }, + { + "epoch": 0.15906213880785955, + "grad_norm": 0.8257912993431091, + "learning_rate": 9.854453628035335e-06, + "loss": 0.771, + "step": 2890 + }, + { + "epoch": 0.1591171776102152, + "grad_norm": 0.8901774287223816, + "learning_rate": 9.85434978463064e-06, + "loss": 0.9415, + "step": 2891 + }, + { + "epoch": 0.15917221641257087, + "grad_norm": 0.8283013105392456, + "learning_rate": 9.854245904741948e-06, + "loss": 0.7267, + "step": 2892 + }, + { + "epoch": 0.15922725521492653, + "grad_norm": 0.8665382266044617, + "learning_rate": 9.854141988370045e-06, + "loss": 0.8681, + "step": 2893 + }, + { + "epoch": 0.1592822940172822, + "grad_norm": 0.786494255065918, + "learning_rate": 9.854038035515712e-06, + "loss": 0.7614, + "step": 2894 + }, + { + "epoch": 0.15933733281963786, + "grad_norm": 1.0548759698867798, + "learning_rate": 9.853934046179727e-06, + "loss": 0.861, + "step": 2895 + }, + { + "epoch": 0.15939237162199352, + "grad_norm": 0.8565425276756287, + "learning_rate": 9.853830020362873e-06, + "loss": 0.7858, + "step": 2896 + }, + { + "epoch": 0.15944741042434918, + "grad_norm": 0.7982691526412964, + "learning_rate": 9.853725958065933e-06, + "loss": 0.8797, + "step": 2897 + }, + { + "epoch": 0.15950244922670484, + "grad_norm": 0.8613169193267822, + "learning_rate": 9.853621859289686e-06, + "loss": 0.9217, + "step": 2898 + }, + { + "epoch": 0.1595574880290605, + "grad_norm": 0.950639009475708, + "learning_rate": 9.853517724034918e-06, + "loss": 0.8315, + "step": 2899 + }, + { + "epoch": 0.15961252683141613, + "grad_norm": 0.7940176129341125, + "learning_rate": 9.853413552302409e-06, + "loss": 0.7713, + "step": 2900 + }, + { + "epoch": 0.1596675656337718, + "grad_norm": 0.7716153264045715, + "learning_rate": 9.853309344092944e-06, + "loss": 0.7922, + "step": 2901 + }, + { + "epoch": 0.15972260443612746, + "grad_norm": 0.7626190781593323, + "learning_rate": 9.853205099407303e-06, + "loss": 0.7278, + "step": 2902 + }, + { + "epoch": 0.15977764323848312, + "grad_norm": 0.8523116707801819, + "learning_rate": 9.853100818246274e-06, + "loss": 0.8136, + "step": 2903 + }, + { + "epoch": 0.15983268204083878, + "grad_norm": 0.7636643052101135, + "learning_rate": 9.852996500610637e-06, + "loss": 0.6984, + "step": 2904 + }, + { + "epoch": 0.15988772084319444, + "grad_norm": 0.799201250076294, + "learning_rate": 9.852892146501179e-06, + "loss": 0.8319, + "step": 2905 + }, + { + "epoch": 0.1599427596455501, + "grad_norm": 0.7743694186210632, + "learning_rate": 9.85278775591868e-06, + "loss": 0.81, + "step": 2906 + }, + { + "epoch": 0.15999779844790576, + "grad_norm": 0.8964856863021851, + "learning_rate": 9.85268332886393e-06, + "loss": 0.9227, + "step": 2907 + }, + { + "epoch": 0.16005283725026143, + "grad_norm": 0.8809369802474976, + "learning_rate": 9.852578865337708e-06, + "loss": 0.9285, + "step": 2908 + }, + { + "epoch": 0.1601078760526171, + "grad_norm": 0.8960002064704895, + "learning_rate": 9.852474365340806e-06, + "loss": 0.8611, + "step": 2909 + }, + { + "epoch": 0.16016291485497275, + "grad_norm": 0.7539754509925842, + "learning_rate": 9.852369828874002e-06, + "loss": 0.7455, + "step": 2910 + }, + { + "epoch": 0.1602179536573284, + "grad_norm": 0.8189692497253418, + "learning_rate": 9.852265255938088e-06, + "loss": 0.8321, + "step": 2911 + }, + { + "epoch": 0.16027299245968407, + "grad_norm": 0.8708549737930298, + "learning_rate": 9.852160646533844e-06, + "loss": 0.8373, + "step": 2912 + }, + { + "epoch": 0.16032803126203973, + "grad_norm": 0.7701451778411865, + "learning_rate": 9.852056000662063e-06, + "loss": 0.805, + "step": 2913 + }, + { + "epoch": 0.1603830700643954, + "grad_norm": 0.9111948609352112, + "learning_rate": 9.851951318323526e-06, + "loss": 0.8513, + "step": 2914 + }, + { + "epoch": 0.16043810886675106, + "grad_norm": 0.7863909602165222, + "learning_rate": 9.85184659951902e-06, + "loss": 0.7856, + "step": 2915 + }, + { + "epoch": 0.16049314766910672, + "grad_norm": 0.9000817537307739, + "learning_rate": 9.851741844249336e-06, + "loss": 0.9172, + "step": 2916 + }, + { + "epoch": 0.16054818647146238, + "grad_norm": 1.0953118801116943, + "learning_rate": 9.851637052515259e-06, + "loss": 0.8564, + "step": 2917 + }, + { + "epoch": 0.16060322527381804, + "grad_norm": 0.8405389785766602, + "learning_rate": 9.851532224317575e-06, + "loss": 0.8317, + "step": 2918 + }, + { + "epoch": 0.1606582640761737, + "grad_norm": 0.8524565100669861, + "learning_rate": 9.851427359657075e-06, + "loss": 0.8765, + "step": 2919 + }, + { + "epoch": 0.16071330287852936, + "grad_norm": 0.8234089016914368, + "learning_rate": 9.851322458534546e-06, + "loss": 0.7873, + "step": 2920 + }, + { + "epoch": 0.16076834168088502, + "grad_norm": 0.7879638671875, + "learning_rate": 9.851217520950775e-06, + "loss": 0.8394, + "step": 2921 + }, + { + "epoch": 0.16082338048324069, + "grad_norm": 0.8168820738792419, + "learning_rate": 9.851112546906552e-06, + "loss": 0.8223, + "step": 2922 + }, + { + "epoch": 0.16087841928559635, + "grad_norm": 0.9423845410346985, + "learning_rate": 9.851007536402666e-06, + "loss": 0.9256, + "step": 2923 + }, + { + "epoch": 0.160933458087952, + "grad_norm": 0.7875099778175354, + "learning_rate": 9.850902489439906e-06, + "loss": 0.8199, + "step": 2924 + }, + { + "epoch": 0.16098849689030767, + "grad_norm": 0.6934793591499329, + "learning_rate": 9.85079740601906e-06, + "loss": 0.671, + "step": 2925 + }, + { + "epoch": 0.16104353569266333, + "grad_norm": 0.8172206282615662, + "learning_rate": 9.85069228614092e-06, + "loss": 0.7633, + "step": 2926 + }, + { + "epoch": 0.161098574495019, + "grad_norm": 0.72749263048172, + "learning_rate": 9.850587129806274e-06, + "loss": 0.8719, + "step": 2927 + }, + { + "epoch": 0.16115361329737465, + "grad_norm": 0.8416743874549866, + "learning_rate": 9.850481937015917e-06, + "loss": 0.8438, + "step": 2928 + }, + { + "epoch": 0.16120865209973032, + "grad_norm": 0.7415444850921631, + "learning_rate": 9.850376707770633e-06, + "loss": 0.7673, + "step": 2929 + }, + { + "epoch": 0.16126369090208598, + "grad_norm": 0.9364289045333862, + "learning_rate": 9.850271442071217e-06, + "loss": 0.7224, + "step": 2930 + }, + { + "epoch": 0.16131872970444164, + "grad_norm": 0.7314212918281555, + "learning_rate": 9.85016613991846e-06, + "loss": 0.7759, + "step": 2931 + }, + { + "epoch": 0.1613737685067973, + "grad_norm": 0.8940219283103943, + "learning_rate": 9.850060801313151e-06, + "loss": 0.8432, + "step": 2932 + }, + { + "epoch": 0.16142880730915296, + "grad_norm": 0.7499691843986511, + "learning_rate": 9.849955426256084e-06, + "loss": 0.8171, + "step": 2933 + }, + { + "epoch": 0.16148384611150862, + "grad_norm": 0.8123053312301636, + "learning_rate": 9.84985001474805e-06, + "loss": 0.7839, + "step": 2934 + }, + { + "epoch": 0.16153888491386428, + "grad_norm": 0.819618821144104, + "learning_rate": 9.849744566789842e-06, + "loss": 0.9123, + "step": 2935 + }, + { + "epoch": 0.16159392371621994, + "grad_norm": 0.791088342666626, + "learning_rate": 9.849639082382251e-06, + "loss": 0.8347, + "step": 2936 + }, + { + "epoch": 0.1616489625185756, + "grad_norm": 0.8166706562042236, + "learning_rate": 9.849533561526072e-06, + "loss": 0.8309, + "step": 2937 + }, + { + "epoch": 0.16170400132093127, + "grad_norm": 0.7944774031639099, + "learning_rate": 9.849428004222098e-06, + "loss": 0.8387, + "step": 2938 + }, + { + "epoch": 0.16175904012328693, + "grad_norm": 0.7414719462394714, + "learning_rate": 9.849322410471119e-06, + "loss": 0.71, + "step": 2939 + }, + { + "epoch": 0.1618140789256426, + "grad_norm": 0.8983511924743652, + "learning_rate": 9.849216780273931e-06, + "loss": 0.8902, + "step": 2940 + }, + { + "epoch": 0.16186911772799825, + "grad_norm": 0.9058687686920166, + "learning_rate": 9.849111113631329e-06, + "loss": 0.8804, + "step": 2941 + }, + { + "epoch": 0.1619241565303539, + "grad_norm": 0.948871374130249, + "learning_rate": 9.849005410544105e-06, + "loss": 0.9871, + "step": 2942 + }, + { + "epoch": 0.16197919533270955, + "grad_norm": 0.8240115642547607, + "learning_rate": 9.848899671013055e-06, + "loss": 0.8708, + "step": 2943 + }, + { + "epoch": 0.1620342341350652, + "grad_norm": 0.879953145980835, + "learning_rate": 9.848793895038972e-06, + "loss": 0.9279, + "step": 2944 + }, + { + "epoch": 0.16208927293742087, + "grad_norm": 0.8464690446853638, + "learning_rate": 9.848688082622653e-06, + "loss": 0.8418, + "step": 2945 + }, + { + "epoch": 0.16214431173977653, + "grad_norm": 0.8990732431411743, + "learning_rate": 9.848582233764891e-06, + "loss": 0.8622, + "step": 2946 + }, + { + "epoch": 0.1621993505421322, + "grad_norm": 0.8054911494255066, + "learning_rate": 9.848476348466483e-06, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.16225438934448785, + "grad_norm": 0.7904845476150513, + "learning_rate": 9.848370426728226e-06, + "loss": 0.7777, + "step": 2948 + }, + { + "epoch": 0.16230942814684352, + "grad_norm": 1.0143954753875732, + "learning_rate": 9.848264468550915e-06, + "loss": 0.8556, + "step": 2949 + }, + { + "epoch": 0.16236446694919918, + "grad_norm": 0.7201125621795654, + "learning_rate": 9.848158473935344e-06, + "loss": 0.7981, + "step": 2950 + }, + { + "epoch": 0.16241950575155484, + "grad_norm": 0.8322157263755798, + "learning_rate": 9.848052442882312e-06, + "loss": 0.8323, + "step": 2951 + }, + { + "epoch": 0.1624745445539105, + "grad_norm": 0.7740346193313599, + "learning_rate": 9.847946375392617e-06, + "loss": 0.8355, + "step": 2952 + }, + { + "epoch": 0.16252958335626616, + "grad_norm": 0.8955645561218262, + "learning_rate": 9.847840271467053e-06, + "loss": 0.7161, + "step": 2953 + }, + { + "epoch": 0.16258462215862182, + "grad_norm": 0.800364077091217, + "learning_rate": 9.847734131106421e-06, + "loss": 0.8165, + "step": 2954 + }, + { + "epoch": 0.16263966096097748, + "grad_norm": 0.8305484056472778, + "learning_rate": 9.847627954311516e-06, + "loss": 0.7846, + "step": 2955 + }, + { + "epoch": 0.16269469976333314, + "grad_norm": 0.7354590892791748, + "learning_rate": 9.847521741083136e-06, + "loss": 0.7743, + "step": 2956 + }, + { + "epoch": 0.1627497385656888, + "grad_norm": 0.8173812627792358, + "learning_rate": 9.847415491422083e-06, + "loss": 0.8626, + "step": 2957 + }, + { + "epoch": 0.16280477736804447, + "grad_norm": 0.7959356307983398, + "learning_rate": 9.84730920532915e-06, + "loss": 0.8016, + "step": 2958 + }, + { + "epoch": 0.16285981617040013, + "grad_norm": 0.8256500363349915, + "learning_rate": 9.84720288280514e-06, + "loss": 0.7407, + "step": 2959 + }, + { + "epoch": 0.1629148549727558, + "grad_norm": 0.8522148728370667, + "learning_rate": 9.84709652385085e-06, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.16296989377511145, + "grad_norm": 0.7791039943695068, + "learning_rate": 9.84699012846708e-06, + "loss": 0.7239, + "step": 2961 + }, + { + "epoch": 0.1630249325774671, + "grad_norm": 0.84294193983078, + "learning_rate": 9.84688369665463e-06, + "loss": 0.7498, + "step": 2962 + }, + { + "epoch": 0.16307997137982277, + "grad_norm": 0.7948899865150452, + "learning_rate": 9.846777228414299e-06, + "loss": 0.7713, + "step": 2963 + }, + { + "epoch": 0.16313501018217844, + "grad_norm": 0.6673180460929871, + "learning_rate": 9.846670723746888e-06, + "loss": 0.6759, + "step": 2964 + }, + { + "epoch": 0.1631900489845341, + "grad_norm": 0.8141015768051147, + "learning_rate": 9.846564182653199e-06, + "loss": 0.7928, + "step": 2965 + }, + { + "epoch": 0.16324508778688976, + "grad_norm": 0.967830240726471, + "learning_rate": 9.846457605134028e-06, + "loss": 0.823, + "step": 2966 + }, + { + "epoch": 0.16330012658924542, + "grad_norm": 0.8099361658096313, + "learning_rate": 9.84635099119018e-06, + "loss": 0.8724, + "step": 2967 + }, + { + "epoch": 0.16335516539160108, + "grad_norm": 0.7913978099822998, + "learning_rate": 9.846244340822456e-06, + "loss": 0.7106, + "step": 2968 + }, + { + "epoch": 0.16341020419395674, + "grad_norm": 0.8308563828468323, + "learning_rate": 9.846137654031655e-06, + "loss": 0.7631, + "step": 2969 + }, + { + "epoch": 0.1634652429963124, + "grad_norm": 0.8634191751480103, + "learning_rate": 9.846030930818582e-06, + "loss": 0.7363, + "step": 2970 + }, + { + "epoch": 0.16352028179866807, + "grad_norm": 0.8936432600021362, + "learning_rate": 9.845924171184038e-06, + "loss": 0.8714, + "step": 2971 + }, + { + "epoch": 0.16357532060102373, + "grad_norm": 0.8776300549507141, + "learning_rate": 9.845817375128825e-06, + "loss": 0.914, + "step": 2972 + }, + { + "epoch": 0.1636303594033794, + "grad_norm": 0.8793039321899414, + "learning_rate": 9.845710542653745e-06, + "loss": 0.7999, + "step": 2973 + }, + { + "epoch": 0.16368539820573505, + "grad_norm": 0.8391743302345276, + "learning_rate": 9.845603673759603e-06, + "loss": 0.8124, + "step": 2974 + }, + { + "epoch": 0.1637404370080907, + "grad_norm": 0.8487186431884766, + "learning_rate": 9.845496768447199e-06, + "loss": 0.8551, + "step": 2975 + }, + { + "epoch": 0.16379547581044637, + "grad_norm": 0.7780638933181763, + "learning_rate": 9.845389826717339e-06, + "loss": 0.7281, + "step": 2976 + }, + { + "epoch": 0.16385051461280203, + "grad_norm": 0.7209637761116028, + "learning_rate": 9.845282848570825e-06, + "loss": 0.6737, + "step": 2977 + }, + { + "epoch": 0.1639055534151577, + "grad_norm": 0.8414756059646606, + "learning_rate": 9.845175834008464e-06, + "loss": 0.8003, + "step": 2978 + }, + { + "epoch": 0.16396059221751336, + "grad_norm": 1.2730679512023926, + "learning_rate": 9.845068783031057e-06, + "loss": 0.8243, + "step": 2979 + }, + { + "epoch": 0.16401563101986902, + "grad_norm": 0.8573475480079651, + "learning_rate": 9.844961695639413e-06, + "loss": 0.7844, + "step": 2980 + }, + { + "epoch": 0.16407066982222468, + "grad_norm": 0.8029958605766296, + "learning_rate": 9.84485457183433e-06, + "loss": 0.7722, + "step": 2981 + }, + { + "epoch": 0.16412570862458034, + "grad_norm": 0.7839805483818054, + "learning_rate": 9.844747411616619e-06, + "loss": 0.8146, + "step": 2982 + }, + { + "epoch": 0.164180747426936, + "grad_norm": 0.7563499212265015, + "learning_rate": 9.844640214987082e-06, + "loss": 0.6909, + "step": 2983 + }, + { + "epoch": 0.16423578622929166, + "grad_norm": 0.7199193239212036, + "learning_rate": 9.844532981946527e-06, + "loss": 0.702, + "step": 2984 + }, + { + "epoch": 0.16429082503164733, + "grad_norm": 0.7519383430480957, + "learning_rate": 9.844425712495758e-06, + "loss": 0.6493, + "step": 2985 + }, + { + "epoch": 0.16434586383400296, + "grad_norm": 0.7493193745613098, + "learning_rate": 9.844318406635584e-06, + "loss": 0.8318, + "step": 2986 + }, + { + "epoch": 0.16440090263635862, + "grad_norm": 0.7951106429100037, + "learning_rate": 9.84421106436681e-06, + "loss": 0.923, + "step": 2987 + }, + { + "epoch": 0.16445594143871428, + "grad_norm": 0.8350820541381836, + "learning_rate": 9.844103685690238e-06, + "loss": 0.8091, + "step": 2988 + }, + { + "epoch": 0.16451098024106994, + "grad_norm": 0.773932695388794, + "learning_rate": 9.843996270606683e-06, + "loss": 0.8016, + "step": 2989 + }, + { + "epoch": 0.1645660190434256, + "grad_norm": 0.8208432793617249, + "learning_rate": 9.843888819116947e-06, + "loss": 0.7704, + "step": 2990 + }, + { + "epoch": 0.16462105784578127, + "grad_norm": 0.8552223443984985, + "learning_rate": 9.84378133122184e-06, + "loss": 0.8519, + "step": 2991 + }, + { + "epoch": 0.16467609664813693, + "grad_norm": 0.8015661835670471, + "learning_rate": 9.84367380692217e-06, + "loss": 0.8389, + "step": 2992 + }, + { + "epoch": 0.1647311354504926, + "grad_norm": 0.7828749418258667, + "learning_rate": 9.843566246218743e-06, + "loss": 0.7385, + "step": 2993 + }, + { + "epoch": 0.16478617425284825, + "grad_norm": 0.7761647701263428, + "learning_rate": 9.84345864911237e-06, + "loss": 0.8419, + "step": 2994 + }, + { + "epoch": 0.1648412130552039, + "grad_norm": 0.8839839100837708, + "learning_rate": 9.843351015603857e-06, + "loss": 0.8069, + "step": 2995 + }, + { + "epoch": 0.16489625185755957, + "grad_norm": 0.8611735105514526, + "learning_rate": 9.843243345694014e-06, + "loss": 0.9406, + "step": 2996 + }, + { + "epoch": 0.16495129065991523, + "grad_norm": 0.9042683839797974, + "learning_rate": 9.84313563938365e-06, + "loss": 0.821, + "step": 2997 + }, + { + "epoch": 0.1650063294622709, + "grad_norm": 0.8333690762519836, + "learning_rate": 9.843027896673577e-06, + "loss": 0.781, + "step": 2998 + }, + { + "epoch": 0.16506136826462656, + "grad_norm": 0.819922924041748, + "learning_rate": 9.8429201175646e-06, + "loss": 0.869, + "step": 2999 + }, + { + "epoch": 0.16511640706698222, + "grad_norm": 0.8349948525428772, + "learning_rate": 9.842812302057534e-06, + "loss": 0.9271, + "step": 3000 + }, + { + "epoch": 0.16517144586933788, + "grad_norm": 0.8981684446334839, + "learning_rate": 9.842704450153187e-06, + "loss": 0.7384, + "step": 3001 + }, + { + "epoch": 0.16522648467169354, + "grad_norm": 0.839133083820343, + "learning_rate": 9.842596561852369e-06, + "loss": 0.9016, + "step": 3002 + }, + { + "epoch": 0.1652815234740492, + "grad_norm": 0.8303349614143372, + "learning_rate": 9.842488637155891e-06, + "loss": 0.7488, + "step": 3003 + }, + { + "epoch": 0.16533656227640486, + "grad_norm": 0.8748323917388916, + "learning_rate": 9.842380676064566e-06, + "loss": 0.8163, + "step": 3004 + }, + { + "epoch": 0.16539160107876053, + "grad_norm": 0.782844603061676, + "learning_rate": 9.842272678579203e-06, + "loss": 0.8465, + "step": 3005 + }, + { + "epoch": 0.1654466398811162, + "grad_norm": 0.8068844676017761, + "learning_rate": 9.842164644700615e-06, + "loss": 0.8856, + "step": 3006 + }, + { + "epoch": 0.16550167868347185, + "grad_norm": 0.9174006581306458, + "learning_rate": 9.842056574429615e-06, + "loss": 0.7748, + "step": 3007 + }, + { + "epoch": 0.1655567174858275, + "grad_norm": 0.7453809380531311, + "learning_rate": 9.841948467767012e-06, + "loss": 0.7565, + "step": 3008 + }, + { + "epoch": 0.16561175628818317, + "grad_norm": 0.8408182859420776, + "learning_rate": 9.841840324713622e-06, + "loss": 0.7345, + "step": 3009 + }, + { + "epoch": 0.16566679509053883, + "grad_norm": 0.8599638938903809, + "learning_rate": 9.841732145270254e-06, + "loss": 0.8163, + "step": 3010 + }, + { + "epoch": 0.1657218338928945, + "grad_norm": 0.877616822719574, + "learning_rate": 9.841623929437725e-06, + "loss": 0.8685, + "step": 3011 + }, + { + "epoch": 0.16577687269525015, + "grad_norm": 0.7765643000602722, + "learning_rate": 9.841515677216846e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.16583191149760582, + "grad_norm": 0.7891712784767151, + "learning_rate": 9.841407388608431e-06, + "loss": 0.8618, + "step": 3013 + }, + { + "epoch": 0.16588695029996148, + "grad_norm": 0.9215571284294128, + "learning_rate": 9.841299063613295e-06, + "loss": 0.8709, + "step": 3014 + }, + { + "epoch": 0.16594198910231714, + "grad_norm": 0.8428288698196411, + "learning_rate": 9.841190702232249e-06, + "loss": 0.8227, + "step": 3015 + }, + { + "epoch": 0.1659970279046728, + "grad_norm": 0.9294042587280273, + "learning_rate": 9.841082304466112e-06, + "loss": 0.8203, + "step": 3016 + }, + { + "epoch": 0.16605206670702846, + "grad_norm": 0.7530880570411682, + "learning_rate": 9.840973870315695e-06, + "loss": 0.7681, + "step": 3017 + }, + { + "epoch": 0.16610710550938412, + "grad_norm": 1.0149626731872559, + "learning_rate": 9.840865399781814e-06, + "loss": 0.9212, + "step": 3018 + }, + { + "epoch": 0.16616214431173978, + "grad_norm": 0.7595353722572327, + "learning_rate": 9.840756892865285e-06, + "loss": 0.795, + "step": 3019 + }, + { + "epoch": 0.16621718311409545, + "grad_norm": 0.7893253564834595, + "learning_rate": 9.840648349566924e-06, + "loss": 0.8147, + "step": 3020 + }, + { + "epoch": 0.1662722219164511, + "grad_norm": 0.8190789222717285, + "learning_rate": 9.840539769887543e-06, + "loss": 0.7233, + "step": 3021 + }, + { + "epoch": 0.16632726071880677, + "grad_norm": 0.7771229147911072, + "learning_rate": 9.840431153827963e-06, + "loss": 0.7172, + "step": 3022 + }, + { + "epoch": 0.16638229952116243, + "grad_norm": 0.7379328012466431, + "learning_rate": 9.840322501388998e-06, + "loss": 0.7603, + "step": 3023 + }, + { + "epoch": 0.1664373383235181, + "grad_norm": 0.9488499760627747, + "learning_rate": 9.840213812571464e-06, + "loss": 0.8025, + "step": 3024 + }, + { + "epoch": 0.16649237712587375, + "grad_norm": 0.7135865092277527, + "learning_rate": 9.84010508737618e-06, + "loss": 0.7412, + "step": 3025 + }, + { + "epoch": 0.16654741592822941, + "grad_norm": 1.6780112981796265, + "learning_rate": 9.83999632580396e-06, + "loss": 0.9231, + "step": 3026 + }, + { + "epoch": 0.16660245473058508, + "grad_norm": 0.8815935850143433, + "learning_rate": 9.839887527855623e-06, + "loss": 0.7903, + "step": 3027 + }, + { + "epoch": 0.16665749353294074, + "grad_norm": 0.8942261338233948, + "learning_rate": 9.83977869353199e-06, + "loss": 0.8328, + "step": 3028 + }, + { + "epoch": 0.16671253233529637, + "grad_norm": 0.7866815328598022, + "learning_rate": 9.839669822833873e-06, + "loss": 0.8483, + "step": 3029 + }, + { + "epoch": 0.16676757113765203, + "grad_norm": 0.8133070468902588, + "learning_rate": 9.839560915762094e-06, + "loss": 0.8665, + "step": 3030 + }, + { + "epoch": 0.1668226099400077, + "grad_norm": 0.7442927360534668, + "learning_rate": 9.839451972317469e-06, + "loss": 0.6296, + "step": 3031 + }, + { + "epoch": 0.16687764874236335, + "grad_norm": 0.7505021691322327, + "learning_rate": 9.83934299250082e-06, + "loss": 0.7976, + "step": 3032 + }, + { + "epoch": 0.16693268754471902, + "grad_norm": 0.8310422897338867, + "learning_rate": 9.839233976312964e-06, + "loss": 0.9022, + "step": 3033 + }, + { + "epoch": 0.16698772634707468, + "grad_norm": 0.8175413012504578, + "learning_rate": 9.839124923754721e-06, + "loss": 0.8653, + "step": 3034 + }, + { + "epoch": 0.16704276514943034, + "grad_norm": 0.7963089346885681, + "learning_rate": 9.839015834826912e-06, + "loss": 0.7888, + "step": 3035 + }, + { + "epoch": 0.167097803951786, + "grad_norm": 0.8923391699790955, + "learning_rate": 9.838906709530353e-06, + "loss": 0.9396, + "step": 3036 + }, + { + "epoch": 0.16715284275414166, + "grad_norm": 0.7851678133010864, + "learning_rate": 9.838797547865869e-06, + "loss": 0.8163, + "step": 3037 + }, + { + "epoch": 0.16720788155649732, + "grad_norm": 0.817877471446991, + "learning_rate": 9.838688349834275e-06, + "loss": 0.8928, + "step": 3038 + }, + { + "epoch": 0.16726292035885298, + "grad_norm": 0.7603926062583923, + "learning_rate": 9.838579115436395e-06, + "loss": 0.7418, + "step": 3039 + }, + { + "epoch": 0.16731795916120865, + "grad_norm": 0.8086446523666382, + "learning_rate": 9.83846984467305e-06, + "loss": 0.8017, + "step": 3040 + }, + { + "epoch": 0.1673729979635643, + "grad_norm": 1.4745439291000366, + "learning_rate": 9.838360537545061e-06, + "loss": 0.7964, + "step": 3041 + }, + { + "epoch": 0.16742803676591997, + "grad_norm": 0.778404176235199, + "learning_rate": 9.83825119405325e-06, + "loss": 0.7395, + "step": 3042 + }, + { + "epoch": 0.16748307556827563, + "grad_norm": 0.8245886564254761, + "learning_rate": 9.838141814198439e-06, + "loss": 0.8697, + "step": 3043 + }, + { + "epoch": 0.1675381143706313, + "grad_norm": 0.8395472764968872, + "learning_rate": 9.838032397981448e-06, + "loss": 0.8545, + "step": 3044 + }, + { + "epoch": 0.16759315317298695, + "grad_norm": 0.8973744511604309, + "learning_rate": 9.8379229454031e-06, + "loss": 0.8999, + "step": 3045 + }, + { + "epoch": 0.16764819197534261, + "grad_norm": 1.2034368515014648, + "learning_rate": 9.837813456464219e-06, + "loss": 0.9039, + "step": 3046 + }, + { + "epoch": 0.16770323077769828, + "grad_norm": 0.862167477607727, + "learning_rate": 9.837703931165625e-06, + "loss": 0.889, + "step": 3047 + }, + { + "epoch": 0.16775826958005394, + "grad_norm": 0.7624714970588684, + "learning_rate": 9.837594369508146e-06, + "loss": 0.7072, + "step": 3048 + }, + { + "epoch": 0.1678133083824096, + "grad_norm": 0.7771621346473694, + "learning_rate": 9.8374847714926e-06, + "loss": 0.8769, + "step": 3049 + }, + { + "epoch": 0.16786834718476526, + "grad_norm": 0.7834492921829224, + "learning_rate": 9.837375137119816e-06, + "loss": 0.841, + "step": 3050 + }, + { + "epoch": 0.16792338598712092, + "grad_norm": 0.8175067901611328, + "learning_rate": 9.837265466390612e-06, + "loss": 0.8149, + "step": 3051 + }, + { + "epoch": 0.16797842478947658, + "grad_norm": 0.7474493384361267, + "learning_rate": 9.83715575930582e-06, + "loss": 0.7716, + "step": 3052 + }, + { + "epoch": 0.16803346359183224, + "grad_norm": 1.1263303756713867, + "learning_rate": 9.837046015866257e-06, + "loss": 0.8026, + "step": 3053 + }, + { + "epoch": 0.1680885023941879, + "grad_norm": 0.8741740584373474, + "learning_rate": 9.836936236072752e-06, + "loss": 0.8795, + "step": 3054 + }, + { + "epoch": 0.16814354119654357, + "grad_norm": 0.8108506798744202, + "learning_rate": 9.83682641992613e-06, + "loss": 0.7682, + "step": 3055 + }, + { + "epoch": 0.16819857999889923, + "grad_norm": 0.9380543231964111, + "learning_rate": 9.836716567427213e-06, + "loss": 0.8739, + "step": 3056 + }, + { + "epoch": 0.1682536188012549, + "grad_norm": 0.7755940556526184, + "learning_rate": 9.83660667857683e-06, + "loss": 0.7287, + "step": 3057 + }, + { + "epoch": 0.16830865760361055, + "grad_norm": 0.808907151222229, + "learning_rate": 9.836496753375807e-06, + "loss": 0.7988, + "step": 3058 + }, + { + "epoch": 0.1683636964059662, + "grad_norm": 1.1496524810791016, + "learning_rate": 9.836386791824967e-06, + "loss": 0.8621, + "step": 3059 + }, + { + "epoch": 0.16841873520832187, + "grad_norm": 0.8550384640693665, + "learning_rate": 9.83627679392514e-06, + "loss": 0.913, + "step": 3060 + }, + { + "epoch": 0.16847377401067753, + "grad_norm": 0.761142909526825, + "learning_rate": 9.83616675967715e-06, + "loss": 0.7271, + "step": 3061 + }, + { + "epoch": 0.1685288128130332, + "grad_norm": 0.8496200442314148, + "learning_rate": 9.836056689081828e-06, + "loss": 0.7885, + "step": 3062 + }, + { + "epoch": 0.16858385161538886, + "grad_norm": 0.8310382962226868, + "learning_rate": 9.835946582139996e-06, + "loss": 0.858, + "step": 3063 + }, + { + "epoch": 0.16863889041774452, + "grad_norm": 0.7870821952819824, + "learning_rate": 9.835836438852485e-06, + "loss": 0.7791, + "step": 3064 + }, + { + "epoch": 0.16869392922010018, + "grad_norm": 0.7170534729957581, + "learning_rate": 9.83572625922012e-06, + "loss": 0.6666, + "step": 3065 + }, + { + "epoch": 0.16874896802245584, + "grad_norm": 0.9764187932014465, + "learning_rate": 9.835616043243732e-06, + "loss": 0.8341, + "step": 3066 + }, + { + "epoch": 0.1688040068248115, + "grad_norm": 0.7453315258026123, + "learning_rate": 9.83550579092415e-06, + "loss": 0.7032, + "step": 3067 + }, + { + "epoch": 0.16885904562716716, + "grad_norm": 0.9205759763717651, + "learning_rate": 9.835395502262196e-06, + "loss": 0.813, + "step": 3068 + }, + { + "epoch": 0.16891408442952283, + "grad_norm": 0.8152205944061279, + "learning_rate": 9.835285177258708e-06, + "loss": 0.8275, + "step": 3069 + }, + { + "epoch": 0.1689691232318785, + "grad_norm": 0.8065707087516785, + "learning_rate": 9.83517481591451e-06, + "loss": 0.8648, + "step": 3070 + }, + { + "epoch": 0.16902416203423415, + "grad_norm": 0.7774410247802734, + "learning_rate": 9.835064418230432e-06, + "loss": 0.7818, + "step": 3071 + }, + { + "epoch": 0.16907920083658978, + "grad_norm": 0.8591069579124451, + "learning_rate": 9.834953984207305e-06, + "loss": 0.8055, + "step": 3072 + }, + { + "epoch": 0.16913423963894544, + "grad_norm": 0.7421612739562988, + "learning_rate": 9.834843513845958e-06, + "loss": 0.7543, + "step": 3073 + }, + { + "epoch": 0.1691892784413011, + "grad_norm": 0.7855183482170105, + "learning_rate": 9.83473300714722e-06, + "loss": 0.7011, + "step": 3074 + }, + { + "epoch": 0.16924431724365677, + "grad_norm": 0.8061636686325073, + "learning_rate": 9.834622464111924e-06, + "loss": 0.8096, + "step": 3075 + }, + { + "epoch": 0.16929935604601243, + "grad_norm": 0.8048406839370728, + "learning_rate": 9.834511884740898e-06, + "loss": 0.8166, + "step": 3076 + }, + { + "epoch": 0.1693543948483681, + "grad_norm": 0.8776549696922302, + "learning_rate": 9.834401269034977e-06, + "loss": 0.8169, + "step": 3077 + }, + { + "epoch": 0.16940943365072375, + "grad_norm": 1.0208356380462646, + "learning_rate": 9.83429061699499e-06, + "loss": 0.6976, + "step": 3078 + }, + { + "epoch": 0.1694644724530794, + "grad_norm": 0.7641016840934753, + "learning_rate": 9.834179928621767e-06, + "loss": 0.7109, + "step": 3079 + }, + { + "epoch": 0.16951951125543507, + "grad_norm": 0.7648905515670776, + "learning_rate": 9.834069203916143e-06, + "loss": 0.7927, + "step": 3080 + }, + { + "epoch": 0.16957455005779073, + "grad_norm": 0.7898744344711304, + "learning_rate": 9.833958442878948e-06, + "loss": 0.7911, + "step": 3081 + }, + { + "epoch": 0.1696295888601464, + "grad_norm": 0.8812462687492371, + "learning_rate": 9.833847645511016e-06, + "loss": 0.8381, + "step": 3082 + }, + { + "epoch": 0.16968462766250206, + "grad_norm": 0.8141197562217712, + "learning_rate": 9.833736811813179e-06, + "loss": 0.7422, + "step": 3083 + }, + { + "epoch": 0.16973966646485772, + "grad_norm": 0.7860949635505676, + "learning_rate": 9.83362594178627e-06, + "loss": 0.7568, + "step": 3084 + }, + { + "epoch": 0.16979470526721338, + "grad_norm": 0.6688396334648132, + "learning_rate": 9.833515035431123e-06, + "loss": 0.7143, + "step": 3085 + }, + { + "epoch": 0.16984974406956904, + "grad_norm": 0.7525103092193604, + "learning_rate": 9.833404092748569e-06, + "loss": 0.8026, + "step": 3086 + }, + { + "epoch": 0.1699047828719247, + "grad_norm": 0.8505181670188904, + "learning_rate": 9.833293113739444e-06, + "loss": 0.8894, + "step": 3087 + }, + { + "epoch": 0.16995982167428036, + "grad_norm": 0.8432300090789795, + "learning_rate": 9.833182098404583e-06, + "loss": 0.7801, + "step": 3088 + }, + { + "epoch": 0.17001486047663603, + "grad_norm": 0.7655903100967407, + "learning_rate": 9.833071046744819e-06, + "loss": 0.7838, + "step": 3089 + }, + { + "epoch": 0.1700698992789917, + "grad_norm": 0.8436369895935059, + "learning_rate": 9.832959958760986e-06, + "loss": 0.8636, + "step": 3090 + }, + { + "epoch": 0.17012493808134735, + "grad_norm": 0.7880234122276306, + "learning_rate": 9.83284883445392e-06, + "loss": 0.7701, + "step": 3091 + }, + { + "epoch": 0.170179976883703, + "grad_norm": 0.7713757753372192, + "learning_rate": 9.832737673824455e-06, + "loss": 0.8652, + "step": 3092 + }, + { + "epoch": 0.17023501568605867, + "grad_norm": 0.7905295491218567, + "learning_rate": 9.832626476873428e-06, + "loss": 0.8666, + "step": 3093 + }, + { + "epoch": 0.17029005448841433, + "grad_norm": 0.7589883804321289, + "learning_rate": 9.832515243601675e-06, + "loss": 0.8051, + "step": 3094 + }, + { + "epoch": 0.17034509329077, + "grad_norm": 0.9068838953971863, + "learning_rate": 9.83240397401003e-06, + "loss": 0.9037, + "step": 3095 + }, + { + "epoch": 0.17040013209312566, + "grad_norm": 0.7465278506278992, + "learning_rate": 9.83229266809933e-06, + "loss": 0.7425, + "step": 3096 + }, + { + "epoch": 0.17045517089548132, + "grad_norm": 0.8111177086830139, + "learning_rate": 9.83218132587041e-06, + "loss": 0.8034, + "step": 3097 + }, + { + "epoch": 0.17051020969783698, + "grad_norm": 1.1007672548294067, + "learning_rate": 9.832069947324112e-06, + "loss": 0.9139, + "step": 3098 + }, + { + "epoch": 0.17056524850019264, + "grad_norm": 0.881179690361023, + "learning_rate": 9.831958532461269e-06, + "loss": 0.9062, + "step": 3099 + }, + { + "epoch": 0.1706202873025483, + "grad_norm": 0.8012413382530212, + "learning_rate": 9.831847081282718e-06, + "loss": 0.7956, + "step": 3100 + }, + { + "epoch": 0.17067532610490396, + "grad_norm": 0.741731584072113, + "learning_rate": 9.831735593789298e-06, + "loss": 0.8754, + "step": 3101 + }, + { + "epoch": 0.17073036490725962, + "grad_norm": 0.8945604562759399, + "learning_rate": 9.831624069981848e-06, + "loss": 0.8293, + "step": 3102 + }, + { + "epoch": 0.17078540370961529, + "grad_norm": 0.7865545749664307, + "learning_rate": 9.831512509861203e-06, + "loss": 0.7812, + "step": 3103 + }, + { + "epoch": 0.17084044251197095, + "grad_norm": 0.832847535610199, + "learning_rate": 9.831400913428205e-06, + "loss": 0.8925, + "step": 3104 + }, + { + "epoch": 0.1708954813143266, + "grad_norm": 0.7374216914176941, + "learning_rate": 9.83128928068369e-06, + "loss": 0.8275, + "step": 3105 + }, + { + "epoch": 0.17095052011668227, + "grad_norm": 0.748725414276123, + "learning_rate": 9.831177611628497e-06, + "loss": 0.8364, + "step": 3106 + }, + { + "epoch": 0.17100555891903793, + "grad_norm": 0.810276448726654, + "learning_rate": 9.831065906263468e-06, + "loss": 0.861, + "step": 3107 + }, + { + "epoch": 0.1710605977213936, + "grad_norm": 0.7607758641242981, + "learning_rate": 9.83095416458944e-06, + "loss": 0.7989, + "step": 3108 + }, + { + "epoch": 0.17111563652374925, + "grad_norm": 0.7206127047538757, + "learning_rate": 9.830842386607253e-06, + "loss": 0.7187, + "step": 3109 + }, + { + "epoch": 0.17117067532610492, + "grad_norm": 0.7775895595550537, + "learning_rate": 9.83073057231775e-06, + "loss": 0.8008, + "step": 3110 + }, + { + "epoch": 0.17122571412846058, + "grad_norm": 0.8351094722747803, + "learning_rate": 9.830618721721768e-06, + "loss": 0.8025, + "step": 3111 + }, + { + "epoch": 0.17128075293081624, + "grad_norm": 0.8090646266937256, + "learning_rate": 9.830506834820148e-06, + "loss": 0.8012, + "step": 3112 + }, + { + "epoch": 0.1713357917331719, + "grad_norm": 0.7762801051139832, + "learning_rate": 9.830394911613733e-06, + "loss": 0.8428, + "step": 3113 + }, + { + "epoch": 0.17139083053552756, + "grad_norm": 0.8117541074752808, + "learning_rate": 9.83028295210336e-06, + "loss": 0.8566, + "step": 3114 + }, + { + "epoch": 0.1714458693378832, + "grad_norm": 0.8786184787750244, + "learning_rate": 9.830170956289876e-06, + "loss": 0.8386, + "step": 3115 + }, + { + "epoch": 0.17150090814023886, + "grad_norm": 1.0181046724319458, + "learning_rate": 9.83005892417412e-06, + "loss": 0.8555, + "step": 3116 + }, + { + "epoch": 0.17155594694259452, + "grad_norm": 0.8236173391342163, + "learning_rate": 9.829946855756934e-06, + "loss": 0.7933, + "step": 3117 + }, + { + "epoch": 0.17161098574495018, + "grad_norm": 0.8058149814605713, + "learning_rate": 9.829834751039157e-06, + "loss": 0.842, + "step": 3118 + }, + { + "epoch": 0.17166602454730584, + "grad_norm": 0.7419908046722412, + "learning_rate": 9.82972261002164e-06, + "loss": 0.8397, + "step": 3119 + }, + { + "epoch": 0.1717210633496615, + "grad_norm": 0.7528164982795715, + "learning_rate": 9.829610432705216e-06, + "loss": 0.7931, + "step": 3120 + }, + { + "epoch": 0.17177610215201716, + "grad_norm": 0.7357296943664551, + "learning_rate": 9.829498219090736e-06, + "loss": 0.8089, + "step": 3121 + }, + { + "epoch": 0.17183114095437282, + "grad_norm": 0.7635773420333862, + "learning_rate": 9.829385969179039e-06, + "loss": 0.7442, + "step": 3122 + }, + { + "epoch": 0.17188617975672849, + "grad_norm": 0.8200171589851379, + "learning_rate": 9.82927368297097e-06, + "loss": 0.757, + "step": 3123 + }, + { + "epoch": 0.17194121855908415, + "grad_norm": 0.8367171287536621, + "learning_rate": 9.829161360467374e-06, + "loss": 0.915, + "step": 3124 + }, + { + "epoch": 0.1719962573614398, + "grad_norm": 0.8460778594017029, + "learning_rate": 9.829049001669091e-06, + "loss": 0.8568, + "step": 3125 + }, + { + "epoch": 0.17205129616379547, + "grad_norm": 0.7301799058914185, + "learning_rate": 9.82893660657697e-06, + "loss": 0.8041, + "step": 3126 + }, + { + "epoch": 0.17210633496615113, + "grad_norm": 0.7858132123947144, + "learning_rate": 9.828824175191854e-06, + "loss": 0.8367, + "step": 3127 + }, + { + "epoch": 0.1721613737685068, + "grad_norm": 0.8118360042572021, + "learning_rate": 9.82871170751459e-06, + "loss": 0.85, + "step": 3128 + }, + { + "epoch": 0.17221641257086245, + "grad_norm": 0.9020261764526367, + "learning_rate": 9.828599203546019e-06, + "loss": 0.789, + "step": 3129 + }, + { + "epoch": 0.17227145137321812, + "grad_norm": 0.8194546699523926, + "learning_rate": 9.828486663286989e-06, + "loss": 0.8644, + "step": 3130 + }, + { + "epoch": 0.17232649017557378, + "grad_norm": 0.7764905095100403, + "learning_rate": 9.828374086738345e-06, + "loss": 0.7961, + "step": 3131 + }, + { + "epoch": 0.17238152897792944, + "grad_norm": 0.7712632417678833, + "learning_rate": 9.828261473900935e-06, + "loss": 0.8082, + "step": 3132 + }, + { + "epoch": 0.1724365677802851, + "grad_norm": 0.7100280523300171, + "learning_rate": 9.828148824775604e-06, + "loss": 0.7514, + "step": 3133 + }, + { + "epoch": 0.17249160658264076, + "grad_norm": 0.7812890410423279, + "learning_rate": 9.8280361393632e-06, + "loss": 0.7125, + "step": 3134 + }, + { + "epoch": 0.17254664538499642, + "grad_norm": 0.8772642612457275, + "learning_rate": 9.827923417664568e-06, + "loss": 0.8355, + "step": 3135 + }, + { + "epoch": 0.17260168418735208, + "grad_norm": 0.9161205291748047, + "learning_rate": 9.827810659680555e-06, + "loss": 0.7511, + "step": 3136 + }, + { + "epoch": 0.17265672298970774, + "grad_norm": 0.7628560662269592, + "learning_rate": 9.82769786541201e-06, + "loss": 0.882, + "step": 3137 + }, + { + "epoch": 0.1727117617920634, + "grad_norm": 0.8203405737876892, + "learning_rate": 9.827585034859781e-06, + "loss": 0.8172, + "step": 3138 + }, + { + "epoch": 0.17276680059441907, + "grad_norm": 0.8318095207214355, + "learning_rate": 9.827472168024715e-06, + "loss": 0.7784, + "step": 3139 + }, + { + "epoch": 0.17282183939677473, + "grad_norm": 0.9137747287750244, + "learning_rate": 9.827359264907658e-06, + "loss": 0.8643, + "step": 3140 + }, + { + "epoch": 0.1728768781991304, + "grad_norm": 0.9441068768501282, + "learning_rate": 9.827246325509463e-06, + "loss": 0.7936, + "step": 3141 + }, + { + "epoch": 0.17293191700148605, + "grad_norm": 0.7402390837669373, + "learning_rate": 9.827133349830977e-06, + "loss": 0.7813, + "step": 3142 + }, + { + "epoch": 0.1729869558038417, + "grad_norm": 0.8328836560249329, + "learning_rate": 9.827020337873048e-06, + "loss": 0.7676, + "step": 3143 + }, + { + "epoch": 0.17304199460619737, + "grad_norm": 0.8106881380081177, + "learning_rate": 9.826907289636526e-06, + "loss": 0.9037, + "step": 3144 + }, + { + "epoch": 0.17309703340855304, + "grad_norm": 0.8457425236701965, + "learning_rate": 9.826794205122263e-06, + "loss": 0.78, + "step": 3145 + }, + { + "epoch": 0.1731520722109087, + "grad_norm": 0.9335517883300781, + "learning_rate": 9.826681084331105e-06, + "loss": 0.9197, + "step": 3146 + }, + { + "epoch": 0.17320711101326436, + "grad_norm": 0.9098715782165527, + "learning_rate": 9.826567927263904e-06, + "loss": 0.932, + "step": 3147 + }, + { + "epoch": 0.17326214981562002, + "grad_norm": 0.767234206199646, + "learning_rate": 9.826454733921512e-06, + "loss": 0.8717, + "step": 3148 + }, + { + "epoch": 0.17331718861797568, + "grad_norm": 0.8114444017410278, + "learning_rate": 9.826341504304775e-06, + "loss": 0.8744, + "step": 3149 + }, + { + "epoch": 0.17337222742033134, + "grad_norm": 0.7948976755142212, + "learning_rate": 9.82622823841455e-06, + "loss": 0.7947, + "step": 3150 + }, + { + "epoch": 0.173427266222687, + "grad_norm": 0.7808204889297485, + "learning_rate": 9.826114936251684e-06, + "loss": 0.8151, + "step": 3151 + }, + { + "epoch": 0.17348230502504267, + "grad_norm": 0.733860969543457, + "learning_rate": 9.82600159781703e-06, + "loss": 0.8018, + "step": 3152 + }, + { + "epoch": 0.17353734382739833, + "grad_norm": 0.7630699276924133, + "learning_rate": 9.825888223111442e-06, + "loss": 0.7937, + "step": 3153 + }, + { + "epoch": 0.173592382629754, + "grad_norm": 0.7892931699752808, + "learning_rate": 9.825774812135766e-06, + "loss": 0.782, + "step": 3154 + }, + { + "epoch": 0.17364742143210965, + "grad_norm": 0.6642436385154724, + "learning_rate": 9.825661364890862e-06, + "loss": 0.6611, + "step": 3155 + }, + { + "epoch": 0.1737024602344653, + "grad_norm": 0.7755968570709229, + "learning_rate": 9.825547881377577e-06, + "loss": 0.7835, + "step": 3156 + }, + { + "epoch": 0.17375749903682097, + "grad_norm": 0.8406579494476318, + "learning_rate": 9.825434361596766e-06, + "loss": 0.9178, + "step": 3157 + }, + { + "epoch": 0.1738125378391766, + "grad_norm": 0.8887308835983276, + "learning_rate": 9.825320805549284e-06, + "loss": 0.7951, + "step": 3158 + }, + { + "epoch": 0.17386757664153227, + "grad_norm": 0.85418701171875, + "learning_rate": 9.825207213235978e-06, + "loss": 0.8671, + "step": 3159 + }, + { + "epoch": 0.17392261544388793, + "grad_norm": 0.8831202983856201, + "learning_rate": 9.82509358465771e-06, + "loss": 0.8708, + "step": 3160 + }, + { + "epoch": 0.1739776542462436, + "grad_norm": 0.9041616320610046, + "learning_rate": 9.82497991981533e-06, + "loss": 0.8981, + "step": 3161 + }, + { + "epoch": 0.17403269304859925, + "grad_norm": 0.8169258832931519, + "learning_rate": 9.824866218709692e-06, + "loss": 0.8857, + "step": 3162 + }, + { + "epoch": 0.1740877318509549, + "grad_norm": 0.8714475631713867, + "learning_rate": 9.824752481341651e-06, + "loss": 0.8552, + "step": 3163 + }, + { + "epoch": 0.17414277065331057, + "grad_norm": 0.8261111378669739, + "learning_rate": 9.824638707712061e-06, + "loss": 0.808, + "step": 3164 + }, + { + "epoch": 0.17419780945566624, + "grad_norm": 0.7542527914047241, + "learning_rate": 9.82452489782178e-06, + "loss": 0.8078, + "step": 3165 + }, + { + "epoch": 0.1742528482580219, + "grad_norm": 1.309218168258667, + "learning_rate": 9.824411051671658e-06, + "loss": 0.9325, + "step": 3166 + }, + { + "epoch": 0.17430788706037756, + "grad_norm": 0.8528563380241394, + "learning_rate": 9.824297169262555e-06, + "loss": 0.8493, + "step": 3167 + }, + { + "epoch": 0.17436292586273322, + "grad_norm": 0.7777062058448792, + "learning_rate": 9.824183250595328e-06, + "loss": 0.7002, + "step": 3168 + }, + { + "epoch": 0.17441796466508888, + "grad_norm": 0.7385506629943848, + "learning_rate": 9.824069295670828e-06, + "loss": 0.8396, + "step": 3169 + }, + { + "epoch": 0.17447300346744454, + "grad_norm": 0.8316949605941772, + "learning_rate": 9.823955304489918e-06, + "loss": 0.8769, + "step": 3170 + }, + { + "epoch": 0.1745280422698002, + "grad_norm": 0.8149139285087585, + "learning_rate": 9.823841277053448e-06, + "loss": 0.8009, + "step": 3171 + }, + { + "epoch": 0.17458308107215587, + "grad_norm": 0.8761584162712097, + "learning_rate": 9.82372721336228e-06, + "loss": 0.7366, + "step": 3172 + }, + { + "epoch": 0.17463811987451153, + "grad_norm": 0.7104084491729736, + "learning_rate": 9.82361311341727e-06, + "loss": 0.6704, + "step": 3173 + }, + { + "epoch": 0.1746931586768672, + "grad_norm": 0.791806697845459, + "learning_rate": 9.823498977219273e-06, + "loss": 0.9054, + "step": 3174 + }, + { + "epoch": 0.17474819747922285, + "grad_norm": 0.7675086855888367, + "learning_rate": 9.82338480476915e-06, + "loss": 0.751, + "step": 3175 + }, + { + "epoch": 0.1748032362815785, + "grad_norm": 0.7380725145339966, + "learning_rate": 9.823270596067759e-06, + "loss": 0.7618, + "step": 3176 + }, + { + "epoch": 0.17485827508393417, + "grad_norm": 0.7311519384384155, + "learning_rate": 9.823156351115954e-06, + "loss": 0.7424, + "step": 3177 + }, + { + "epoch": 0.17491331388628983, + "grad_norm": 0.7888365387916565, + "learning_rate": 9.8230420699146e-06, + "loss": 0.7717, + "step": 3178 + }, + { + "epoch": 0.1749683526886455, + "grad_norm": 0.9329265356063843, + "learning_rate": 9.822927752464552e-06, + "loss": 0.8256, + "step": 3179 + }, + { + "epoch": 0.17502339149100116, + "grad_norm": 0.711794912815094, + "learning_rate": 9.822813398766671e-06, + "loss": 0.7373, + "step": 3180 + }, + { + "epoch": 0.17507843029335682, + "grad_norm": 0.8713497519493103, + "learning_rate": 9.822699008821813e-06, + "loss": 0.8135, + "step": 3181 + }, + { + "epoch": 0.17513346909571248, + "grad_norm": 0.6923471689224243, + "learning_rate": 9.822584582630841e-06, + "loss": 0.7589, + "step": 3182 + }, + { + "epoch": 0.17518850789806814, + "grad_norm": 0.8648017048835754, + "learning_rate": 9.822470120194616e-06, + "loss": 0.7828, + "step": 3183 + }, + { + "epoch": 0.1752435467004238, + "grad_norm": 0.8407077789306641, + "learning_rate": 9.822355621513994e-06, + "loss": 0.8537, + "step": 3184 + }, + { + "epoch": 0.17529858550277946, + "grad_norm": 0.8076738119125366, + "learning_rate": 9.822241086589841e-06, + "loss": 0.7827, + "step": 3185 + }, + { + "epoch": 0.17535362430513513, + "grad_norm": 0.8402661085128784, + "learning_rate": 9.822126515423011e-06, + "loss": 0.8247, + "step": 3186 + }, + { + "epoch": 0.1754086631074908, + "grad_norm": 0.8911813497543335, + "learning_rate": 9.822011908014373e-06, + "loss": 0.8996, + "step": 3187 + }, + { + "epoch": 0.17546370190984645, + "grad_norm": 0.8060111999511719, + "learning_rate": 9.821897264364782e-06, + "loss": 0.796, + "step": 3188 + }, + { + "epoch": 0.1755187407122021, + "grad_norm": 0.8476423621177673, + "learning_rate": 9.8217825844751e-06, + "loss": 0.8657, + "step": 3189 + }, + { + "epoch": 0.17557377951455777, + "grad_norm": 0.7614054083824158, + "learning_rate": 9.821667868346194e-06, + "loss": 0.8583, + "step": 3190 + }, + { + "epoch": 0.17562881831691343, + "grad_norm": 0.8312287330627441, + "learning_rate": 9.821553115978923e-06, + "loss": 0.7718, + "step": 3191 + }, + { + "epoch": 0.1756838571192691, + "grad_norm": 0.8199487328529358, + "learning_rate": 9.82143832737415e-06, + "loss": 0.7617, + "step": 3192 + }, + { + "epoch": 0.17573889592162475, + "grad_norm": 0.7529115080833435, + "learning_rate": 9.821323502532733e-06, + "loss": 0.7587, + "step": 3193 + }, + { + "epoch": 0.17579393472398042, + "grad_norm": 0.9205463528633118, + "learning_rate": 9.821208641455542e-06, + "loss": 0.7871, + "step": 3194 + }, + { + "epoch": 0.17584897352633608, + "grad_norm": 0.8055161833763123, + "learning_rate": 9.821093744143437e-06, + "loss": 0.8133, + "step": 3195 + }, + { + "epoch": 0.17590401232869174, + "grad_norm": 0.7322981953620911, + "learning_rate": 9.82097881059728e-06, + "loss": 0.7442, + "step": 3196 + }, + { + "epoch": 0.1759590511310474, + "grad_norm": 1.0465941429138184, + "learning_rate": 9.82086384081794e-06, + "loss": 1.0073, + "step": 3197 + }, + { + "epoch": 0.17601408993340306, + "grad_norm": 0.7607331275939941, + "learning_rate": 9.820748834806278e-06, + "loss": 0.8128, + "step": 3198 + }, + { + "epoch": 0.17606912873575872, + "grad_norm": 0.7901879549026489, + "learning_rate": 9.820633792563156e-06, + "loss": 0.7928, + "step": 3199 + }, + { + "epoch": 0.17612416753811436, + "grad_norm": 0.8010839223861694, + "learning_rate": 9.820518714089442e-06, + "loss": 0.7025, + "step": 3200 + }, + { + "epoch": 0.17617920634047002, + "grad_norm": 0.8511317372322083, + "learning_rate": 9.820403599385999e-06, + "loss": 0.7947, + "step": 3201 + }, + { + "epoch": 0.17623424514282568, + "grad_norm": 0.7978847026824951, + "learning_rate": 9.820288448453693e-06, + "loss": 0.7395, + "step": 3202 + }, + { + "epoch": 0.17628928394518134, + "grad_norm": 0.6991232633590698, + "learning_rate": 9.820173261293388e-06, + "loss": 0.7113, + "step": 3203 + }, + { + "epoch": 0.176344322747537, + "grad_norm": 0.8966444730758667, + "learning_rate": 9.820058037905954e-06, + "loss": 0.7399, + "step": 3204 + }, + { + "epoch": 0.17639936154989266, + "grad_norm": 0.8042632341384888, + "learning_rate": 9.819942778292253e-06, + "loss": 0.8183, + "step": 3205 + }, + { + "epoch": 0.17645440035224833, + "grad_norm": 0.8047537803649902, + "learning_rate": 9.81982748245315e-06, + "loss": 0.852, + "step": 3206 + }, + { + "epoch": 0.176509439154604, + "grad_norm": 0.8277122378349304, + "learning_rate": 9.819712150389517e-06, + "loss": 0.8828, + "step": 3207 + }, + { + "epoch": 0.17656447795695965, + "grad_norm": 0.8677185773849487, + "learning_rate": 9.819596782102216e-06, + "loss": 0.8416, + "step": 3208 + }, + { + "epoch": 0.1766195167593153, + "grad_norm": 0.8750975728034973, + "learning_rate": 9.819481377592115e-06, + "loss": 0.9289, + "step": 3209 + }, + { + "epoch": 0.17667455556167097, + "grad_norm": 0.7665122151374817, + "learning_rate": 9.819365936860084e-06, + "loss": 0.8653, + "step": 3210 + }, + { + "epoch": 0.17672959436402663, + "grad_norm": 0.9341353178024292, + "learning_rate": 9.819250459906989e-06, + "loss": 0.7225, + "step": 3211 + }, + { + "epoch": 0.1767846331663823, + "grad_norm": 0.7007241249084473, + "learning_rate": 9.819134946733696e-06, + "loss": 0.7429, + "step": 3212 + }, + { + "epoch": 0.17683967196873795, + "grad_norm": 0.8001461029052734, + "learning_rate": 9.819019397341074e-06, + "loss": 0.759, + "step": 3213 + }, + { + "epoch": 0.17689471077109362, + "grad_norm": 0.8936446905136108, + "learning_rate": 9.818903811729993e-06, + "loss": 0.8248, + "step": 3214 + }, + { + "epoch": 0.17694974957344928, + "grad_norm": 0.805570125579834, + "learning_rate": 9.818788189901321e-06, + "loss": 0.9214, + "step": 3215 + }, + { + "epoch": 0.17700478837580494, + "grad_norm": 0.7762455940246582, + "learning_rate": 9.818672531855926e-06, + "loss": 0.7848, + "step": 3216 + }, + { + "epoch": 0.1770598271781606, + "grad_norm": 0.8391497731208801, + "learning_rate": 9.81855683759468e-06, + "loss": 0.7543, + "step": 3217 + }, + { + "epoch": 0.17711486598051626, + "grad_norm": 0.8489046692848206, + "learning_rate": 9.818441107118449e-06, + "loss": 0.7908, + "step": 3218 + }, + { + "epoch": 0.17716990478287192, + "grad_norm": 1.0949461460113525, + "learning_rate": 9.818325340428105e-06, + "loss": 0.8255, + "step": 3219 + }, + { + "epoch": 0.17722494358522758, + "grad_norm": 0.8710842132568359, + "learning_rate": 9.81820953752452e-06, + "loss": 0.859, + "step": 3220 + }, + { + "epoch": 0.17727998238758325, + "grad_norm": 0.7936064600944519, + "learning_rate": 9.818093698408558e-06, + "loss": 0.8475, + "step": 3221 + }, + { + "epoch": 0.1773350211899389, + "grad_norm": 0.790341854095459, + "learning_rate": 9.817977823081095e-06, + "loss": 0.8137, + "step": 3222 + }, + { + "epoch": 0.17739005999229457, + "grad_norm": 0.8154531717300415, + "learning_rate": 9.817861911543002e-06, + "loss": 0.8687, + "step": 3223 + }, + { + "epoch": 0.17744509879465023, + "grad_norm": 0.8346067070960999, + "learning_rate": 9.817745963795144e-06, + "loss": 0.8905, + "step": 3224 + }, + { + "epoch": 0.1775001375970059, + "grad_norm": 0.7137764096260071, + "learning_rate": 9.817629979838401e-06, + "loss": 0.7715, + "step": 3225 + }, + { + "epoch": 0.17755517639936155, + "grad_norm": 0.7237628102302551, + "learning_rate": 9.81751395967364e-06, + "loss": 0.7824, + "step": 3226 + }, + { + "epoch": 0.17761021520171721, + "grad_norm": 0.9481163024902344, + "learning_rate": 9.817397903301733e-06, + "loss": 0.7451, + "step": 3227 + }, + { + "epoch": 0.17766525400407288, + "grad_norm": 0.9472424387931824, + "learning_rate": 9.817281810723552e-06, + "loss": 0.8774, + "step": 3228 + }, + { + "epoch": 0.17772029280642854, + "grad_norm": 0.9295538663864136, + "learning_rate": 9.81716568193997e-06, + "loss": 0.8507, + "step": 3229 + }, + { + "epoch": 0.1777753316087842, + "grad_norm": 0.7668172717094421, + "learning_rate": 9.817049516951863e-06, + "loss": 0.8547, + "step": 3230 + }, + { + "epoch": 0.17783037041113986, + "grad_norm": 0.8640413880348206, + "learning_rate": 9.8169333157601e-06, + "loss": 0.8485, + "step": 3231 + }, + { + "epoch": 0.17788540921349552, + "grad_norm": 0.9901431798934937, + "learning_rate": 9.816817078365554e-06, + "loss": 0.9236, + "step": 3232 + }, + { + "epoch": 0.17794044801585118, + "grad_norm": 1.0242371559143066, + "learning_rate": 9.816700804769104e-06, + "loss": 0.8096, + "step": 3233 + }, + { + "epoch": 0.17799548681820684, + "grad_norm": 0.910498857498169, + "learning_rate": 9.816584494971617e-06, + "loss": 0.829, + "step": 3234 + }, + { + "epoch": 0.1780505256205625, + "grad_norm": 0.8254473805427551, + "learning_rate": 9.816468148973972e-06, + "loss": 0.7828, + "step": 3235 + }, + { + "epoch": 0.17810556442291817, + "grad_norm": 0.7971221804618835, + "learning_rate": 9.816351766777039e-06, + "loss": 0.8057, + "step": 3236 + }, + { + "epoch": 0.17816060322527383, + "grad_norm": 0.8151674270629883, + "learning_rate": 9.816235348381697e-06, + "loss": 0.7801, + "step": 3237 + }, + { + "epoch": 0.1782156420276295, + "grad_norm": 0.7587556838989258, + "learning_rate": 9.81611889378882e-06, + "loss": 0.7814, + "step": 3238 + }, + { + "epoch": 0.17827068082998515, + "grad_norm": 0.8843516111373901, + "learning_rate": 9.816002402999283e-06, + "loss": 0.8873, + "step": 3239 + }, + { + "epoch": 0.1783257196323408, + "grad_norm": 0.917859673500061, + "learning_rate": 9.81588587601396e-06, + "loss": 0.8963, + "step": 3240 + }, + { + "epoch": 0.17838075843469647, + "grad_norm": 0.8256439566612244, + "learning_rate": 9.815769312833727e-06, + "loss": 0.9157, + "step": 3241 + }, + { + "epoch": 0.17843579723705214, + "grad_norm": 0.8364603519439697, + "learning_rate": 9.815652713459462e-06, + "loss": 0.8253, + "step": 3242 + }, + { + "epoch": 0.17849083603940777, + "grad_norm": 0.7717131972312927, + "learning_rate": 9.81553607789204e-06, + "loss": 0.7211, + "step": 3243 + }, + { + "epoch": 0.17854587484176343, + "grad_norm": 0.8069111704826355, + "learning_rate": 9.815419406132338e-06, + "loss": 0.8986, + "step": 3244 + }, + { + "epoch": 0.1786009136441191, + "grad_norm": 0.9176943302154541, + "learning_rate": 9.815302698181233e-06, + "loss": 0.8084, + "step": 3245 + }, + { + "epoch": 0.17865595244647475, + "grad_norm": 0.769183874130249, + "learning_rate": 9.815185954039601e-06, + "loss": 0.8084, + "step": 3246 + }, + { + "epoch": 0.17871099124883041, + "grad_norm": 0.8070697784423828, + "learning_rate": 9.815069173708321e-06, + "loss": 0.8371, + "step": 3247 + }, + { + "epoch": 0.17876603005118608, + "grad_norm": 0.7837347388267517, + "learning_rate": 9.81495235718827e-06, + "loss": 0.8015, + "step": 3248 + }, + { + "epoch": 0.17882106885354174, + "grad_norm": 0.9248430728912354, + "learning_rate": 9.814835504480327e-06, + "loss": 0.8396, + "step": 3249 + }, + { + "epoch": 0.1788761076558974, + "grad_norm": 0.7914367914199829, + "learning_rate": 9.814718615585367e-06, + "loss": 0.8068, + "step": 3250 + }, + { + "epoch": 0.17893114645825306, + "grad_norm": 0.8612570762634277, + "learning_rate": 9.814601690504273e-06, + "loss": 0.8227, + "step": 3251 + }, + { + "epoch": 0.17898618526060872, + "grad_norm": 0.7476248741149902, + "learning_rate": 9.81448472923792e-06, + "loss": 0.8609, + "step": 3252 + }, + { + "epoch": 0.17904122406296438, + "grad_norm": 0.7455218434333801, + "learning_rate": 9.81436773178719e-06, + "loss": 0.7992, + "step": 3253 + }, + { + "epoch": 0.17909626286532004, + "grad_norm": 0.7917896509170532, + "learning_rate": 9.814250698152958e-06, + "loss": 0.8383, + "step": 3254 + }, + { + "epoch": 0.1791513016676757, + "grad_norm": 0.6926130652427673, + "learning_rate": 9.81413362833611e-06, + "loss": 0.709, + "step": 3255 + }, + { + "epoch": 0.17920634047003137, + "grad_norm": 0.8219630718231201, + "learning_rate": 9.814016522337519e-06, + "loss": 0.9387, + "step": 3256 + }, + { + "epoch": 0.17926137927238703, + "grad_norm": 0.8588619828224182, + "learning_rate": 9.81389938015807e-06, + "loss": 0.8354, + "step": 3257 + }, + { + "epoch": 0.1793164180747427, + "grad_norm": 0.7868718504905701, + "learning_rate": 9.81378220179864e-06, + "loss": 0.8464, + "step": 3258 + }, + { + "epoch": 0.17937145687709835, + "grad_norm": 0.789479672908783, + "learning_rate": 9.813664987260114e-06, + "loss": 0.8577, + "step": 3259 + }, + { + "epoch": 0.179426495679454, + "grad_norm": 0.8280717730522156, + "learning_rate": 9.81354773654337e-06, + "loss": 0.765, + "step": 3260 + }, + { + "epoch": 0.17948153448180967, + "grad_norm": 0.7660181522369385, + "learning_rate": 9.813430449649289e-06, + "loss": 0.7116, + "step": 3261 + }, + { + "epoch": 0.17953657328416534, + "grad_norm": 0.8043892979621887, + "learning_rate": 9.813313126578754e-06, + "loss": 0.8398, + "step": 3262 + }, + { + "epoch": 0.179591612086521, + "grad_norm": 0.8708420991897583, + "learning_rate": 9.813195767332647e-06, + "loss": 0.8246, + "step": 3263 + }, + { + "epoch": 0.17964665088887666, + "grad_norm": 1.1456964015960693, + "learning_rate": 9.813078371911846e-06, + "loss": 0.8798, + "step": 3264 + }, + { + "epoch": 0.17970168969123232, + "grad_norm": 0.9668154716491699, + "learning_rate": 9.812960940317238e-06, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.17975672849358798, + "grad_norm": 0.862050473690033, + "learning_rate": 9.812843472549705e-06, + "loss": 0.8675, + "step": 3266 + }, + { + "epoch": 0.17981176729594364, + "grad_norm": 0.7776491641998291, + "learning_rate": 9.812725968610126e-06, + "loss": 0.7727, + "step": 3267 + }, + { + "epoch": 0.1798668060982993, + "grad_norm": 0.7197048664093018, + "learning_rate": 9.812608428499389e-06, + "loss": 0.6877, + "step": 3268 + }, + { + "epoch": 0.17992184490065496, + "grad_norm": 0.7995713353157043, + "learning_rate": 9.812490852218375e-06, + "loss": 0.8576, + "step": 3269 + }, + { + "epoch": 0.17997688370301063, + "grad_norm": 0.8300820589065552, + "learning_rate": 9.812373239767967e-06, + "loss": 0.8119, + "step": 3270 + }, + { + "epoch": 0.1800319225053663, + "grad_norm": 0.8625856041908264, + "learning_rate": 9.812255591149052e-06, + "loss": 0.7547, + "step": 3271 + }, + { + "epoch": 0.18008696130772195, + "grad_norm": 1.016419768333435, + "learning_rate": 9.812137906362511e-06, + "loss": 0.8457, + "step": 3272 + }, + { + "epoch": 0.1801420001100776, + "grad_norm": 0.7303110361099243, + "learning_rate": 9.812020185409229e-06, + "loss": 0.7954, + "step": 3273 + }, + { + "epoch": 0.18019703891243327, + "grad_norm": 0.8632498383522034, + "learning_rate": 9.811902428290093e-06, + "loss": 0.8952, + "step": 3274 + }, + { + "epoch": 0.18025207771478893, + "grad_norm": 0.7666932940483093, + "learning_rate": 9.811784635005984e-06, + "loss": 0.746, + "step": 3275 + }, + { + "epoch": 0.1803071165171446, + "grad_norm": 0.8962032198905945, + "learning_rate": 9.811666805557791e-06, + "loss": 0.8654, + "step": 3276 + }, + { + "epoch": 0.18036215531950026, + "grad_norm": 0.9399656057357788, + "learning_rate": 9.811548939946397e-06, + "loss": 0.8062, + "step": 3277 + }, + { + "epoch": 0.18041719412185592, + "grad_norm": 0.7469807863235474, + "learning_rate": 9.811431038172692e-06, + "loss": 0.79, + "step": 3278 + }, + { + "epoch": 0.18047223292421158, + "grad_norm": 0.7661105394363403, + "learning_rate": 9.811313100237556e-06, + "loss": 0.7768, + "step": 3279 + }, + { + "epoch": 0.18052727172656724, + "grad_norm": 0.7567458748817444, + "learning_rate": 9.811195126141881e-06, + "loss": 0.7329, + "step": 3280 + }, + { + "epoch": 0.1805823105289229, + "grad_norm": 0.7187278866767883, + "learning_rate": 9.811077115886552e-06, + "loss": 0.6511, + "step": 3281 + }, + { + "epoch": 0.18063734933127856, + "grad_norm": 0.7641230821609497, + "learning_rate": 9.810959069472452e-06, + "loss": 0.7704, + "step": 3282 + }, + { + "epoch": 0.18069238813363422, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.810840986900474e-06, + "loss": 0.8142, + "step": 3283 + }, + { + "epoch": 0.18074742693598989, + "grad_norm": 0.8102816343307495, + "learning_rate": 9.810722868171502e-06, + "loss": 0.765, + "step": 3284 + }, + { + "epoch": 0.18080246573834555, + "grad_norm": 0.7251957058906555, + "learning_rate": 9.810604713286424e-06, + "loss": 0.7836, + "step": 3285 + }, + { + "epoch": 0.18085750454070118, + "grad_norm": 0.845348060131073, + "learning_rate": 9.81048652224613e-06, + "loss": 0.8386, + "step": 3286 + }, + { + "epoch": 0.18091254334305684, + "grad_norm": 0.8397864103317261, + "learning_rate": 9.810368295051507e-06, + "loss": 0.805, + "step": 3287 + }, + { + "epoch": 0.1809675821454125, + "grad_norm": 1.0739909410476685, + "learning_rate": 9.810250031703444e-06, + "loss": 0.8735, + "step": 3288 + }, + { + "epoch": 0.18102262094776816, + "grad_norm": 0.752091646194458, + "learning_rate": 9.810131732202826e-06, + "loss": 0.7814, + "step": 3289 + }, + { + "epoch": 0.18107765975012383, + "grad_norm": 0.7826841473579407, + "learning_rate": 9.810013396550548e-06, + "loss": 0.7761, + "step": 3290 + }, + { + "epoch": 0.1811326985524795, + "grad_norm": 0.6979131102561951, + "learning_rate": 9.809895024747498e-06, + "loss": 0.672, + "step": 3291 + }, + { + "epoch": 0.18118773735483515, + "grad_norm": 0.8571394085884094, + "learning_rate": 9.809776616794562e-06, + "loss": 0.8795, + "step": 3292 + }, + { + "epoch": 0.1812427761571908, + "grad_norm": 0.8287902474403381, + "learning_rate": 9.809658172692634e-06, + "loss": 0.9032, + "step": 3293 + }, + { + "epoch": 0.18129781495954647, + "grad_norm": 0.7884420156478882, + "learning_rate": 9.809539692442602e-06, + "loss": 0.87, + "step": 3294 + }, + { + "epoch": 0.18135285376190213, + "grad_norm": 0.8955305218696594, + "learning_rate": 9.809421176045358e-06, + "loss": 0.7982, + "step": 3295 + }, + { + "epoch": 0.1814078925642578, + "grad_norm": 0.7893335819244385, + "learning_rate": 9.809302623501791e-06, + "loss": 0.7792, + "step": 3296 + }, + { + "epoch": 0.18146293136661346, + "grad_norm": 0.8077870011329651, + "learning_rate": 9.809184034812794e-06, + "loss": 0.829, + "step": 3297 + }, + { + "epoch": 0.18151797016896912, + "grad_norm": 0.8282631635665894, + "learning_rate": 9.809065409979256e-06, + "loss": 0.8502, + "step": 3298 + }, + { + "epoch": 0.18157300897132478, + "grad_norm": 0.7988418936729431, + "learning_rate": 9.808946749002068e-06, + "loss": 0.7853, + "step": 3299 + }, + { + "epoch": 0.18162804777368044, + "grad_norm": 0.7776056528091431, + "learning_rate": 9.808828051882127e-06, + "loss": 0.7843, + "step": 3300 + }, + { + "epoch": 0.1816830865760361, + "grad_norm": 0.8772258758544922, + "learning_rate": 9.80870931862032e-06, + "loss": 0.896, + "step": 3301 + }, + { + "epoch": 0.18173812537839176, + "grad_norm": 0.8080328702926636, + "learning_rate": 9.80859054921754e-06, + "loss": 0.8464, + "step": 3302 + }, + { + "epoch": 0.18179316418074742, + "grad_norm": 0.862707257270813, + "learning_rate": 9.808471743674682e-06, + "loss": 0.8732, + "step": 3303 + }, + { + "epoch": 0.18184820298310309, + "grad_norm": 1.1964820623397827, + "learning_rate": 9.808352901992637e-06, + "loss": 0.9911, + "step": 3304 + }, + { + "epoch": 0.18190324178545875, + "grad_norm": 0.8597685694694519, + "learning_rate": 9.808234024172298e-06, + "loss": 0.8724, + "step": 3305 + }, + { + "epoch": 0.1819582805878144, + "grad_norm": 0.8068556189537048, + "learning_rate": 9.80811511021456e-06, + "loss": 0.8116, + "step": 3306 + }, + { + "epoch": 0.18201331939017007, + "grad_norm": 1.0014268159866333, + "learning_rate": 9.807996160120317e-06, + "loss": 0.8585, + "step": 3307 + }, + { + "epoch": 0.18206835819252573, + "grad_norm": 0.8541132807731628, + "learning_rate": 9.80787717389046e-06, + "loss": 0.8505, + "step": 3308 + }, + { + "epoch": 0.1821233969948814, + "grad_norm": 0.7973629832267761, + "learning_rate": 9.807758151525886e-06, + "loss": 0.8312, + "step": 3309 + }, + { + "epoch": 0.18217843579723705, + "grad_norm": 0.82973712682724, + "learning_rate": 9.807639093027488e-06, + "loss": 0.894, + "step": 3310 + }, + { + "epoch": 0.18223347459959272, + "grad_norm": 0.7729674577713013, + "learning_rate": 9.807519998396162e-06, + "loss": 0.7459, + "step": 3311 + }, + { + "epoch": 0.18228851340194838, + "grad_norm": 0.8106189370155334, + "learning_rate": 9.807400867632804e-06, + "loss": 0.914, + "step": 3312 + }, + { + "epoch": 0.18234355220430404, + "grad_norm": 0.7672377228736877, + "learning_rate": 9.807281700738305e-06, + "loss": 0.8475, + "step": 3313 + }, + { + "epoch": 0.1823985910066597, + "grad_norm": 0.8776688575744629, + "learning_rate": 9.807162497713566e-06, + "loss": 0.7641, + "step": 3314 + }, + { + "epoch": 0.18245362980901536, + "grad_norm": 0.8781917691230774, + "learning_rate": 9.807043258559479e-06, + "loss": 0.86, + "step": 3315 + }, + { + "epoch": 0.18250866861137102, + "grad_norm": 0.819362998008728, + "learning_rate": 9.806923983276942e-06, + "loss": 0.8829, + "step": 3316 + }, + { + "epoch": 0.18256370741372668, + "grad_norm": 0.8065270185470581, + "learning_rate": 9.80680467186685e-06, + "loss": 0.7335, + "step": 3317 + }, + { + "epoch": 0.18261874621608234, + "grad_norm": 0.8692485690116882, + "learning_rate": 9.806685324330102e-06, + "loss": 0.8582, + "step": 3318 + }, + { + "epoch": 0.182673785018438, + "grad_norm": 0.7910160422325134, + "learning_rate": 9.806565940667594e-06, + "loss": 0.8569, + "step": 3319 + }, + { + "epoch": 0.18272882382079367, + "grad_norm": 0.8282253742218018, + "learning_rate": 9.806446520880225e-06, + "loss": 0.7791, + "step": 3320 + }, + { + "epoch": 0.18278386262314933, + "grad_norm": 0.7513861060142517, + "learning_rate": 9.806327064968887e-06, + "loss": 0.7287, + "step": 3321 + }, + { + "epoch": 0.182838901425505, + "grad_norm": 0.8141188621520996, + "learning_rate": 9.806207572934483e-06, + "loss": 0.7772, + "step": 3322 + }, + { + "epoch": 0.18289394022786065, + "grad_norm": 0.7963125705718994, + "learning_rate": 9.806088044777909e-06, + "loss": 0.7993, + "step": 3323 + }, + { + "epoch": 0.1829489790302163, + "grad_norm": 0.8527218103408813, + "learning_rate": 9.805968480500063e-06, + "loss": 0.822, + "step": 3324 + }, + { + "epoch": 0.18300401783257197, + "grad_norm": 0.822467565536499, + "learning_rate": 9.805848880101845e-06, + "loss": 0.8606, + "step": 3325 + }, + { + "epoch": 0.18305905663492764, + "grad_norm": 0.8197154402732849, + "learning_rate": 9.805729243584154e-06, + "loss": 0.9004, + "step": 3326 + }, + { + "epoch": 0.1831140954372833, + "grad_norm": 0.8379594683647156, + "learning_rate": 9.805609570947887e-06, + "loss": 0.8467, + "step": 3327 + }, + { + "epoch": 0.18316913423963896, + "grad_norm": 0.7787355184555054, + "learning_rate": 9.805489862193947e-06, + "loss": 0.8221, + "step": 3328 + }, + { + "epoch": 0.1832241730419946, + "grad_norm": 0.8464100956916809, + "learning_rate": 9.80537011732323e-06, + "loss": 0.7722, + "step": 3329 + }, + { + "epoch": 0.18327921184435025, + "grad_norm": 0.8351306319236755, + "learning_rate": 9.805250336336637e-06, + "loss": 0.7638, + "step": 3330 + }, + { + "epoch": 0.18333425064670592, + "grad_norm": 0.8098864555358887, + "learning_rate": 9.805130519235068e-06, + "loss": 0.8448, + "step": 3331 + }, + { + "epoch": 0.18338928944906158, + "grad_norm": 0.8290563821792603, + "learning_rate": 9.805010666019427e-06, + "loss": 0.6574, + "step": 3332 + }, + { + "epoch": 0.18344432825141724, + "grad_norm": 0.7748262882232666, + "learning_rate": 9.804890776690611e-06, + "loss": 0.8002, + "step": 3333 + }, + { + "epoch": 0.1834993670537729, + "grad_norm": 0.8422787189483643, + "learning_rate": 9.80477085124952e-06, + "loss": 0.8452, + "step": 3334 + }, + { + "epoch": 0.18355440585612856, + "grad_norm": 0.7776510119438171, + "learning_rate": 9.804650889697061e-06, + "loss": 0.8774, + "step": 3335 + }, + { + "epoch": 0.18360944465848422, + "grad_norm": 0.8449370861053467, + "learning_rate": 9.80453089203413e-06, + "loss": 0.8233, + "step": 3336 + }, + { + "epoch": 0.18366448346083988, + "grad_norm": 0.8254217505455017, + "learning_rate": 9.804410858261632e-06, + "loss": 0.8778, + "step": 3337 + }, + { + "epoch": 0.18371952226319554, + "grad_norm": 0.8673515915870667, + "learning_rate": 9.804290788380466e-06, + "loss": 0.8005, + "step": 3338 + }, + { + "epoch": 0.1837745610655512, + "grad_norm": 0.8106067776679993, + "learning_rate": 9.804170682391538e-06, + "loss": 0.86, + "step": 3339 + }, + { + "epoch": 0.18382959986790687, + "grad_norm": 0.8211669325828552, + "learning_rate": 9.804050540295749e-06, + "loss": 0.8013, + "step": 3340 + }, + { + "epoch": 0.18388463867026253, + "grad_norm": 0.7866180539131165, + "learning_rate": 9.803930362094003e-06, + "loss": 0.8108, + "step": 3341 + }, + { + "epoch": 0.1839396774726182, + "grad_norm": 0.8192055225372314, + "learning_rate": 9.8038101477872e-06, + "loss": 0.7586, + "step": 3342 + }, + { + "epoch": 0.18399471627497385, + "grad_norm": 0.940910279750824, + "learning_rate": 9.803689897376248e-06, + "loss": 0.8174, + "step": 3343 + }, + { + "epoch": 0.1840497550773295, + "grad_norm": 0.7979292869567871, + "learning_rate": 9.803569610862048e-06, + "loss": 0.8341, + "step": 3344 + }, + { + "epoch": 0.18410479387968517, + "grad_norm": 0.7577546238899231, + "learning_rate": 9.803449288245504e-06, + "loss": 0.7775, + "step": 3345 + }, + { + "epoch": 0.18415983268204084, + "grad_norm": 0.7255160212516785, + "learning_rate": 9.80332892952752e-06, + "loss": 0.7648, + "step": 3346 + }, + { + "epoch": 0.1842148714843965, + "grad_norm": 0.8269388675689697, + "learning_rate": 9.803208534709004e-06, + "loss": 0.8902, + "step": 3347 + }, + { + "epoch": 0.18426991028675216, + "grad_norm": 0.783867359161377, + "learning_rate": 9.803088103790857e-06, + "loss": 0.8191, + "step": 3348 + }, + { + "epoch": 0.18432494908910782, + "grad_norm": 0.7658863663673401, + "learning_rate": 9.802967636773986e-06, + "loss": 0.7505, + "step": 3349 + }, + { + "epoch": 0.18437998789146348, + "grad_norm": 0.701225757598877, + "learning_rate": 9.802847133659294e-06, + "loss": 0.7159, + "step": 3350 + }, + { + "epoch": 0.18443502669381914, + "grad_norm": 0.9224311709403992, + "learning_rate": 9.802726594447692e-06, + "loss": 0.7766, + "step": 3351 + }, + { + "epoch": 0.1844900654961748, + "grad_norm": 0.8835979700088501, + "learning_rate": 9.80260601914008e-06, + "loss": 0.9304, + "step": 3352 + }, + { + "epoch": 0.18454510429853047, + "grad_norm": 0.7918481826782227, + "learning_rate": 9.802485407737368e-06, + "loss": 0.7691, + "step": 3353 + }, + { + "epoch": 0.18460014310088613, + "grad_norm": 0.8855286240577698, + "learning_rate": 9.80236476024046e-06, + "loss": 0.9213, + "step": 3354 + }, + { + "epoch": 0.1846551819032418, + "grad_norm": 0.7863314747810364, + "learning_rate": 9.802244076650264e-06, + "loss": 0.7675, + "step": 3355 + }, + { + "epoch": 0.18471022070559745, + "grad_norm": 0.8230198621749878, + "learning_rate": 9.802123356967687e-06, + "loss": 0.7243, + "step": 3356 + }, + { + "epoch": 0.1847652595079531, + "grad_norm": 0.8038737773895264, + "learning_rate": 9.80200260119364e-06, + "loss": 0.8094, + "step": 3357 + }, + { + "epoch": 0.18482029831030877, + "grad_norm": 0.7656993269920349, + "learning_rate": 9.801881809329022e-06, + "loss": 0.7736, + "step": 3358 + }, + { + "epoch": 0.18487533711266443, + "grad_norm": 0.8222082853317261, + "learning_rate": 9.801760981374747e-06, + "loss": 0.844, + "step": 3359 + }, + { + "epoch": 0.1849303759150201, + "grad_norm": 0.7632889747619629, + "learning_rate": 9.801640117331723e-06, + "loss": 0.8354, + "step": 3360 + }, + { + "epoch": 0.18498541471737576, + "grad_norm": 0.8308513760566711, + "learning_rate": 9.801519217200857e-06, + "loss": 0.8277, + "step": 3361 + }, + { + "epoch": 0.18504045351973142, + "grad_norm": 0.7865434885025024, + "learning_rate": 9.801398280983057e-06, + "loss": 0.8614, + "step": 3362 + }, + { + "epoch": 0.18509549232208708, + "grad_norm": 0.7249410152435303, + "learning_rate": 9.801277308679232e-06, + "loss": 0.7259, + "step": 3363 + }, + { + "epoch": 0.18515053112444274, + "grad_norm": 0.7604461908340454, + "learning_rate": 9.801156300290293e-06, + "loss": 0.8507, + "step": 3364 + }, + { + "epoch": 0.1852055699267984, + "grad_norm": 0.8725959062576294, + "learning_rate": 9.801035255817149e-06, + "loss": 0.7688, + "step": 3365 + }, + { + "epoch": 0.18526060872915406, + "grad_norm": 0.7798827290534973, + "learning_rate": 9.800914175260708e-06, + "loss": 0.8788, + "step": 3366 + }, + { + "epoch": 0.18531564753150973, + "grad_norm": 0.7060996890068054, + "learning_rate": 9.800793058621882e-06, + "loss": 0.8183, + "step": 3367 + }, + { + "epoch": 0.1853706863338654, + "grad_norm": 0.7558063268661499, + "learning_rate": 9.80067190590158e-06, + "loss": 0.7834, + "step": 3368 + }, + { + "epoch": 0.18542572513622105, + "grad_norm": 0.7411057353019714, + "learning_rate": 9.800550717100714e-06, + "loss": 0.8298, + "step": 3369 + }, + { + "epoch": 0.1854807639385767, + "grad_norm": 0.8466144800186157, + "learning_rate": 9.800429492220193e-06, + "loss": 0.8297, + "step": 3370 + }, + { + "epoch": 0.18553580274093237, + "grad_norm": 0.7302330136299133, + "learning_rate": 9.800308231260928e-06, + "loss": 0.72, + "step": 3371 + }, + { + "epoch": 0.185590841543288, + "grad_norm": 0.8140530586242676, + "learning_rate": 9.800186934223832e-06, + "loss": 0.9287, + "step": 3372 + }, + { + "epoch": 0.18564588034564367, + "grad_norm": 0.8246129751205444, + "learning_rate": 9.800065601109817e-06, + "loss": 0.7891, + "step": 3373 + }, + { + "epoch": 0.18570091914799933, + "grad_norm": 0.8746623396873474, + "learning_rate": 9.799944231919794e-06, + "loss": 0.8549, + "step": 3374 + }, + { + "epoch": 0.185755957950355, + "grad_norm": 0.9977195858955383, + "learning_rate": 9.799822826654672e-06, + "loss": 0.821, + "step": 3375 + }, + { + "epoch": 0.18581099675271065, + "grad_norm": 0.8937395811080933, + "learning_rate": 9.79970138531537e-06, + "loss": 0.8639, + "step": 3376 + }, + { + "epoch": 0.1858660355550663, + "grad_norm": 1.039695143699646, + "learning_rate": 9.799579907902794e-06, + "loss": 1.0425, + "step": 3377 + }, + { + "epoch": 0.18592107435742197, + "grad_norm": 0.7847749590873718, + "learning_rate": 9.799458394417863e-06, + "loss": 0.8505, + "step": 3378 + }, + { + "epoch": 0.18597611315977763, + "grad_norm": 0.760334312915802, + "learning_rate": 9.799336844861486e-06, + "loss": 0.7418, + "step": 3379 + }, + { + "epoch": 0.1860311519621333, + "grad_norm": 0.7599604725837708, + "learning_rate": 9.799215259234578e-06, + "loss": 0.8305, + "step": 3380 + }, + { + "epoch": 0.18608619076448896, + "grad_norm": 0.846767246723175, + "learning_rate": 9.799093637538054e-06, + "loss": 0.7526, + "step": 3381 + }, + { + "epoch": 0.18614122956684462, + "grad_norm": 0.7840956449508667, + "learning_rate": 9.798971979772825e-06, + "loss": 0.8009, + "step": 3382 + }, + { + "epoch": 0.18619626836920028, + "grad_norm": 0.7826499342918396, + "learning_rate": 9.798850285939809e-06, + "loss": 0.821, + "step": 3383 + }, + { + "epoch": 0.18625130717155594, + "grad_norm": 0.7829813361167908, + "learning_rate": 9.798728556039918e-06, + "loss": 0.8053, + "step": 3384 + }, + { + "epoch": 0.1863063459739116, + "grad_norm": 0.7267470359802246, + "learning_rate": 9.798606790074067e-06, + "loss": 0.6797, + "step": 3385 + }, + { + "epoch": 0.18636138477626726, + "grad_norm": 0.8560196757316589, + "learning_rate": 9.798484988043173e-06, + "loss": 0.8476, + "step": 3386 + }, + { + "epoch": 0.18641642357862293, + "grad_norm": 0.7920921444892883, + "learning_rate": 9.798363149948148e-06, + "loss": 0.8832, + "step": 3387 + }, + { + "epoch": 0.1864714623809786, + "grad_norm": 0.8414384126663208, + "learning_rate": 9.798241275789912e-06, + "loss": 0.8607, + "step": 3388 + }, + { + "epoch": 0.18652650118333425, + "grad_norm": 0.7255431413650513, + "learning_rate": 9.798119365569378e-06, + "loss": 0.6426, + "step": 3389 + }, + { + "epoch": 0.1865815399856899, + "grad_norm": 0.8842852711677551, + "learning_rate": 9.797997419287465e-06, + "loss": 0.9058, + "step": 3390 + }, + { + "epoch": 0.18663657878804557, + "grad_norm": 0.7178265452384949, + "learning_rate": 9.797875436945086e-06, + "loss": 0.8134, + "step": 3391 + }, + { + "epoch": 0.18669161759040123, + "grad_norm": 0.7275096774101257, + "learning_rate": 9.797753418543161e-06, + "loss": 0.6858, + "step": 3392 + }, + { + "epoch": 0.1867466563927569, + "grad_norm": 0.7587800025939941, + "learning_rate": 9.797631364082605e-06, + "loss": 0.7437, + "step": 3393 + }, + { + "epoch": 0.18680169519511255, + "grad_norm": 0.9769744873046875, + "learning_rate": 9.797509273564336e-06, + "loss": 0.8024, + "step": 3394 + }, + { + "epoch": 0.18685673399746822, + "grad_norm": 0.7662433385848999, + "learning_rate": 9.79738714698927e-06, + "loss": 0.8122, + "step": 3395 + }, + { + "epoch": 0.18691177279982388, + "grad_norm": 0.8620306849479675, + "learning_rate": 9.797264984358328e-06, + "loss": 0.7952, + "step": 3396 + }, + { + "epoch": 0.18696681160217954, + "grad_norm": 0.7542591094970703, + "learning_rate": 9.797142785672427e-06, + "loss": 0.8315, + "step": 3397 + }, + { + "epoch": 0.1870218504045352, + "grad_norm": 0.7273713946342468, + "learning_rate": 9.797020550932483e-06, + "loss": 0.7316, + "step": 3398 + }, + { + "epoch": 0.18707688920689086, + "grad_norm": 1.031592845916748, + "learning_rate": 9.796898280139417e-06, + "loss": 0.7478, + "step": 3399 + }, + { + "epoch": 0.18713192800924652, + "grad_norm": 0.791407585144043, + "learning_rate": 9.796775973294147e-06, + "loss": 0.7742, + "step": 3400 + }, + { + "epoch": 0.18718696681160218, + "grad_norm": 0.8311418294906616, + "learning_rate": 9.796653630397595e-06, + "loss": 0.8182, + "step": 3401 + }, + { + "epoch": 0.18724200561395785, + "grad_norm": 0.7960993051528931, + "learning_rate": 9.796531251450678e-06, + "loss": 0.7606, + "step": 3402 + }, + { + "epoch": 0.1872970444163135, + "grad_norm": 0.8671618103981018, + "learning_rate": 9.796408836454316e-06, + "loss": 0.7136, + "step": 3403 + }, + { + "epoch": 0.18735208321866917, + "grad_norm": 1.1071348190307617, + "learning_rate": 9.796286385409428e-06, + "loss": 0.7729, + "step": 3404 + }, + { + "epoch": 0.18740712202102483, + "grad_norm": 0.738217294216156, + "learning_rate": 9.796163898316935e-06, + "loss": 0.7425, + "step": 3405 + }, + { + "epoch": 0.1874621608233805, + "grad_norm": 0.7567199468612671, + "learning_rate": 9.796041375177758e-06, + "loss": 0.8442, + "step": 3406 + }, + { + "epoch": 0.18751719962573615, + "grad_norm": 0.7942413091659546, + "learning_rate": 9.79591881599282e-06, + "loss": 0.852, + "step": 3407 + }, + { + "epoch": 0.18757223842809181, + "grad_norm": 0.7529355883598328, + "learning_rate": 9.795796220763038e-06, + "loss": 0.8086, + "step": 3408 + }, + { + "epoch": 0.18762727723044748, + "grad_norm": 0.7645192742347717, + "learning_rate": 9.795673589489337e-06, + "loss": 0.831, + "step": 3409 + }, + { + "epoch": 0.18768231603280314, + "grad_norm": 0.694791853427887, + "learning_rate": 9.795550922172635e-06, + "loss": 0.6919, + "step": 3410 + }, + { + "epoch": 0.1877373548351588, + "grad_norm": 0.7041944265365601, + "learning_rate": 9.795428218813858e-06, + "loss": 0.7284, + "step": 3411 + }, + { + "epoch": 0.18779239363751446, + "grad_norm": 0.8972276449203491, + "learning_rate": 9.795305479413924e-06, + "loss": 0.7156, + "step": 3412 + }, + { + "epoch": 0.18784743243987012, + "grad_norm": 0.9730873107910156, + "learning_rate": 9.795182703973758e-06, + "loss": 0.8739, + "step": 3413 + }, + { + "epoch": 0.18790247124222578, + "grad_norm": 0.8137956261634827, + "learning_rate": 9.795059892494283e-06, + "loss": 0.8189, + "step": 3414 + }, + { + "epoch": 0.18795751004458142, + "grad_norm": 0.8171416521072388, + "learning_rate": 9.794937044976422e-06, + "loss": 0.9449, + "step": 3415 + }, + { + "epoch": 0.18801254884693708, + "grad_norm": 0.7929911017417908, + "learning_rate": 9.794814161421098e-06, + "loss": 0.8034, + "step": 3416 + }, + { + "epoch": 0.18806758764929274, + "grad_norm": 1.1045749187469482, + "learning_rate": 9.794691241829233e-06, + "loss": 0.875, + "step": 3417 + }, + { + "epoch": 0.1881226264516484, + "grad_norm": 0.8141040205955505, + "learning_rate": 9.794568286201752e-06, + "loss": 0.787, + "step": 3418 + }, + { + "epoch": 0.18817766525400406, + "grad_norm": 0.7615541815757751, + "learning_rate": 9.79444529453958e-06, + "loss": 0.8491, + "step": 3419 + }, + { + "epoch": 0.18823270405635972, + "grad_norm": 0.848419189453125, + "learning_rate": 9.79432226684364e-06, + "loss": 0.7445, + "step": 3420 + }, + { + "epoch": 0.18828774285871538, + "grad_norm": 0.8075067400932312, + "learning_rate": 9.794199203114858e-06, + "loss": 0.6581, + "step": 3421 + }, + { + "epoch": 0.18834278166107105, + "grad_norm": 0.8473401069641113, + "learning_rate": 9.794076103354158e-06, + "loss": 0.839, + "step": 3422 + }, + { + "epoch": 0.1883978204634267, + "grad_norm": 0.8211609721183777, + "learning_rate": 9.793952967562463e-06, + "loss": 0.7709, + "step": 3423 + }, + { + "epoch": 0.18845285926578237, + "grad_norm": 0.7527804374694824, + "learning_rate": 9.793829795740703e-06, + "loss": 0.7315, + "step": 3424 + }, + { + "epoch": 0.18850789806813803, + "grad_norm": 0.7971188426017761, + "learning_rate": 9.793706587889802e-06, + "loss": 0.7507, + "step": 3425 + }, + { + "epoch": 0.1885629368704937, + "grad_norm": 1.024066686630249, + "learning_rate": 9.793583344010684e-06, + "loss": 0.9043, + "step": 3426 + }, + { + "epoch": 0.18861797567284935, + "grad_norm": 0.7428625226020813, + "learning_rate": 9.793460064104276e-06, + "loss": 0.7435, + "step": 3427 + }, + { + "epoch": 0.18867301447520501, + "grad_norm": 0.8438264727592468, + "learning_rate": 9.793336748171507e-06, + "loss": 0.8618, + "step": 3428 + }, + { + "epoch": 0.18872805327756068, + "grad_norm": 0.7846877574920654, + "learning_rate": 9.793213396213302e-06, + "loss": 0.8064, + "step": 3429 + }, + { + "epoch": 0.18878309207991634, + "grad_norm": 0.7527204751968384, + "learning_rate": 9.793090008230587e-06, + "loss": 0.7596, + "step": 3430 + }, + { + "epoch": 0.188838130882272, + "grad_norm": 1.1236757040023804, + "learning_rate": 9.792966584224292e-06, + "loss": 0.8292, + "step": 3431 + }, + { + "epoch": 0.18889316968462766, + "grad_norm": 0.8128102421760559, + "learning_rate": 9.792843124195343e-06, + "loss": 0.8073, + "step": 3432 + }, + { + "epoch": 0.18894820848698332, + "grad_norm": 0.7668742537498474, + "learning_rate": 9.792719628144667e-06, + "loss": 0.7848, + "step": 3433 + }, + { + "epoch": 0.18900324728933898, + "grad_norm": 1.8663485050201416, + "learning_rate": 9.792596096073193e-06, + "loss": 0.9388, + "step": 3434 + }, + { + "epoch": 0.18905828609169464, + "grad_norm": 0.8066239356994629, + "learning_rate": 9.792472527981852e-06, + "loss": 0.6647, + "step": 3435 + }, + { + "epoch": 0.1891133248940503, + "grad_norm": 0.8268817067146301, + "learning_rate": 9.792348923871567e-06, + "loss": 0.9676, + "step": 3436 + }, + { + "epoch": 0.18916836369640597, + "grad_norm": 0.7165037393569946, + "learning_rate": 9.792225283743272e-06, + "loss": 0.6937, + "step": 3437 + }, + { + "epoch": 0.18922340249876163, + "grad_norm": 0.7850403785705566, + "learning_rate": 9.792101607597895e-06, + "loss": 0.7782, + "step": 3438 + }, + { + "epoch": 0.1892784413011173, + "grad_norm": 0.8839808702468872, + "learning_rate": 9.791977895436365e-06, + "loss": 0.7639, + "step": 3439 + }, + { + "epoch": 0.18933348010347295, + "grad_norm": 0.8260362148284912, + "learning_rate": 9.791854147259611e-06, + "loss": 0.8201, + "step": 3440 + }, + { + "epoch": 0.1893885189058286, + "grad_norm": 0.8792916536331177, + "learning_rate": 9.791730363068564e-06, + "loss": 0.8251, + "step": 3441 + }, + { + "epoch": 0.18944355770818427, + "grad_norm": 0.8192774653434753, + "learning_rate": 9.791606542864154e-06, + "loss": 0.7944, + "step": 3442 + }, + { + "epoch": 0.18949859651053994, + "grad_norm": 0.751470685005188, + "learning_rate": 9.791482686647313e-06, + "loss": 0.7563, + "step": 3443 + }, + { + "epoch": 0.1895536353128956, + "grad_norm": 0.8902072906494141, + "learning_rate": 9.79135879441897e-06, + "loss": 0.7719, + "step": 3444 + }, + { + "epoch": 0.18960867411525126, + "grad_norm": 0.7166435122489929, + "learning_rate": 9.791234866180058e-06, + "loss": 0.7871, + "step": 3445 + }, + { + "epoch": 0.18966371291760692, + "grad_norm": 0.763416588306427, + "learning_rate": 9.791110901931505e-06, + "loss": 0.8226, + "step": 3446 + }, + { + "epoch": 0.18971875171996258, + "grad_norm": 0.806633472442627, + "learning_rate": 9.790986901674246e-06, + "loss": 0.7828, + "step": 3447 + }, + { + "epoch": 0.18977379052231824, + "grad_norm": 0.8139312863349915, + "learning_rate": 9.790862865409213e-06, + "loss": 0.8441, + "step": 3448 + }, + { + "epoch": 0.1898288293246739, + "grad_norm": 0.8362452387809753, + "learning_rate": 9.790738793137335e-06, + "loss": 0.8765, + "step": 3449 + }, + { + "epoch": 0.18988386812702956, + "grad_norm": 0.7736263871192932, + "learning_rate": 9.790614684859549e-06, + "loss": 0.8373, + "step": 3450 + }, + { + "epoch": 0.18993890692938523, + "grad_norm": 0.8742800354957581, + "learning_rate": 9.790490540576784e-06, + "loss": 0.8976, + "step": 3451 + }, + { + "epoch": 0.1899939457317409, + "grad_norm": 0.701505720615387, + "learning_rate": 9.790366360289974e-06, + "loss": 0.7799, + "step": 3452 + }, + { + "epoch": 0.19004898453409655, + "grad_norm": 0.7771356701850891, + "learning_rate": 9.790242144000055e-06, + "loss": 0.7617, + "step": 3453 + }, + { + "epoch": 0.1901040233364522, + "grad_norm": 0.897576093673706, + "learning_rate": 9.790117891707955e-06, + "loss": 0.7817, + "step": 3454 + }, + { + "epoch": 0.19015906213880787, + "grad_norm": 0.7296561002731323, + "learning_rate": 9.789993603414613e-06, + "loss": 0.8344, + "step": 3455 + }, + { + "epoch": 0.19021410094116353, + "grad_norm": 0.8099396228790283, + "learning_rate": 9.789869279120962e-06, + "loss": 0.7369, + "step": 3456 + }, + { + "epoch": 0.1902691397435192, + "grad_norm": 0.7802554368972778, + "learning_rate": 9.789744918827935e-06, + "loss": 0.8383, + "step": 3457 + }, + { + "epoch": 0.19032417854587483, + "grad_norm": 0.7508029341697693, + "learning_rate": 9.789620522536467e-06, + "loss": 0.825, + "step": 3458 + }, + { + "epoch": 0.1903792173482305, + "grad_norm": 0.7782164216041565, + "learning_rate": 9.789496090247494e-06, + "loss": 0.7737, + "step": 3459 + }, + { + "epoch": 0.19043425615058615, + "grad_norm": 0.7711489796638489, + "learning_rate": 9.78937162196195e-06, + "loss": 0.7694, + "step": 3460 + }, + { + "epoch": 0.1904892949529418, + "grad_norm": 0.821579098701477, + "learning_rate": 9.789247117680769e-06, + "loss": 0.7493, + "step": 3461 + }, + { + "epoch": 0.19054433375529747, + "grad_norm": 0.6700833439826965, + "learning_rate": 9.789122577404892e-06, + "loss": 0.7696, + "step": 3462 + }, + { + "epoch": 0.19059937255765314, + "grad_norm": 0.854340136051178, + "learning_rate": 9.78899800113525e-06, + "loss": 0.9503, + "step": 3463 + }, + { + "epoch": 0.1906544113600088, + "grad_norm": 0.8095537424087524, + "learning_rate": 9.78887338887278e-06, + "loss": 0.8435, + "step": 3464 + }, + { + "epoch": 0.19070945016236446, + "grad_norm": 0.8156480193138123, + "learning_rate": 9.78874874061842e-06, + "loss": 0.8561, + "step": 3465 + }, + { + "epoch": 0.19076448896472012, + "grad_norm": 0.8065482378005981, + "learning_rate": 9.788624056373108e-06, + "loss": 0.7793, + "step": 3466 + }, + { + "epoch": 0.19081952776707578, + "grad_norm": 0.789601743221283, + "learning_rate": 9.788499336137778e-06, + "loss": 0.7523, + "step": 3467 + }, + { + "epoch": 0.19087456656943144, + "grad_norm": 0.8322301506996155, + "learning_rate": 9.788374579913369e-06, + "loss": 0.9034, + "step": 3468 + }, + { + "epoch": 0.1909296053717871, + "grad_norm": 0.8194506764411926, + "learning_rate": 9.788249787700818e-06, + "loss": 0.8601, + "step": 3469 + }, + { + "epoch": 0.19098464417414276, + "grad_norm": 0.8419962525367737, + "learning_rate": 9.788124959501065e-06, + "loss": 0.869, + "step": 3470 + }, + { + "epoch": 0.19103968297649843, + "grad_norm": 0.760637104511261, + "learning_rate": 9.788000095315044e-06, + "loss": 0.7293, + "step": 3471 + }, + { + "epoch": 0.1910947217788541, + "grad_norm": 1.3964574337005615, + "learning_rate": 9.787875195143697e-06, + "loss": 0.8032, + "step": 3472 + }, + { + "epoch": 0.19114976058120975, + "grad_norm": 0.8205012679100037, + "learning_rate": 9.787750258987962e-06, + "loss": 0.8868, + "step": 3473 + }, + { + "epoch": 0.1912047993835654, + "grad_norm": 0.8183104991912842, + "learning_rate": 9.78762528684878e-06, + "loss": 0.7531, + "step": 3474 + }, + { + "epoch": 0.19125983818592107, + "grad_norm": 0.7659775018692017, + "learning_rate": 9.787500278727083e-06, + "loss": 0.8081, + "step": 3475 + }, + { + "epoch": 0.19131487698827673, + "grad_norm": 0.8262091279029846, + "learning_rate": 9.787375234623819e-06, + "loss": 0.82, + "step": 3476 + }, + { + "epoch": 0.1913699157906324, + "grad_norm": 0.857761025428772, + "learning_rate": 9.787250154539923e-06, + "loss": 0.9133, + "step": 3477 + }, + { + "epoch": 0.19142495459298806, + "grad_norm": 0.7551915645599365, + "learning_rate": 9.787125038476334e-06, + "loss": 0.7822, + "step": 3478 + }, + { + "epoch": 0.19147999339534372, + "grad_norm": 0.7777357697486877, + "learning_rate": 9.786999886433998e-06, + "loss": 0.7676, + "step": 3479 + }, + { + "epoch": 0.19153503219769938, + "grad_norm": 0.8389080166816711, + "learning_rate": 9.786874698413852e-06, + "loss": 0.7901, + "step": 3480 + }, + { + "epoch": 0.19159007100005504, + "grad_norm": 0.7894837856292725, + "learning_rate": 9.786749474416836e-06, + "loss": 0.8393, + "step": 3481 + }, + { + "epoch": 0.1916451098024107, + "grad_norm": 1.9752860069274902, + "learning_rate": 9.786624214443893e-06, + "loss": 0.7611, + "step": 3482 + }, + { + "epoch": 0.19170014860476636, + "grad_norm": 0.8023802042007446, + "learning_rate": 9.786498918495963e-06, + "loss": 0.8426, + "step": 3483 + }, + { + "epoch": 0.19175518740712202, + "grad_norm": 0.7232086658477783, + "learning_rate": 9.78637358657399e-06, + "loss": 0.6611, + "step": 3484 + }, + { + "epoch": 0.19181022620947769, + "grad_norm": 0.8198665380477905, + "learning_rate": 9.786248218678912e-06, + "loss": 0.8795, + "step": 3485 + }, + { + "epoch": 0.19186526501183335, + "grad_norm": 0.942404568195343, + "learning_rate": 9.786122814811675e-06, + "loss": 0.9146, + "step": 3486 + }, + { + "epoch": 0.191920303814189, + "grad_norm": 0.7602691054344177, + "learning_rate": 9.78599737497322e-06, + "loss": 0.7514, + "step": 3487 + }, + { + "epoch": 0.19197534261654467, + "grad_norm": 0.7981933951377869, + "learning_rate": 9.785871899164489e-06, + "loss": 0.7722, + "step": 3488 + }, + { + "epoch": 0.19203038141890033, + "grad_norm": 0.8617631793022156, + "learning_rate": 9.785746387386427e-06, + "loss": 0.8989, + "step": 3489 + }, + { + "epoch": 0.192085420221256, + "grad_norm": 0.7691803574562073, + "learning_rate": 9.785620839639976e-06, + "loss": 0.7929, + "step": 3490 + }, + { + "epoch": 0.19214045902361165, + "grad_norm": 1.3053189516067505, + "learning_rate": 9.785495255926078e-06, + "loss": 0.8478, + "step": 3491 + }, + { + "epoch": 0.19219549782596732, + "grad_norm": 0.807064950466156, + "learning_rate": 9.785369636245681e-06, + "loss": 0.7452, + "step": 3492 + }, + { + "epoch": 0.19225053662832298, + "grad_norm": 0.8182778358459473, + "learning_rate": 9.785243980599726e-06, + "loss": 0.8371, + "step": 3493 + }, + { + "epoch": 0.19230557543067864, + "grad_norm": 0.7654449343681335, + "learning_rate": 9.785118288989157e-06, + "loss": 0.8321, + "step": 3494 + }, + { + "epoch": 0.1923606142330343, + "grad_norm": 0.7192448973655701, + "learning_rate": 9.784992561414922e-06, + "loss": 0.7451, + "step": 3495 + }, + { + "epoch": 0.19241565303538996, + "grad_norm": 0.8639407753944397, + "learning_rate": 9.784866797877964e-06, + "loss": 0.9272, + "step": 3496 + }, + { + "epoch": 0.19247069183774562, + "grad_norm": 0.8329927921295166, + "learning_rate": 9.784740998379225e-06, + "loss": 0.8034, + "step": 3497 + }, + { + "epoch": 0.19252573064010128, + "grad_norm": 0.7975476980209351, + "learning_rate": 9.784615162919656e-06, + "loss": 0.6885, + "step": 3498 + }, + { + "epoch": 0.19258076944245694, + "grad_norm": 0.8077559471130371, + "learning_rate": 9.7844892915002e-06, + "loss": 0.8745, + "step": 3499 + }, + { + "epoch": 0.1926358082448126, + "grad_norm": 0.7957825660705566, + "learning_rate": 9.7843633841218e-06, + "loss": 0.7612, + "step": 3500 + }, + { + "epoch": 0.19269084704716824, + "grad_norm": 0.8478250503540039, + "learning_rate": 9.784237440785408e-06, + "loss": 0.8675, + "step": 3501 + }, + { + "epoch": 0.1927458858495239, + "grad_norm": 0.7289726138114929, + "learning_rate": 9.78411146149197e-06, + "loss": 0.7126, + "step": 3502 + }, + { + "epoch": 0.19280092465187956, + "grad_norm": 0.7608509063720703, + "learning_rate": 9.783985446242427e-06, + "loss": 0.7049, + "step": 3503 + }, + { + "epoch": 0.19285596345423522, + "grad_norm": 0.8985201120376587, + "learning_rate": 9.783859395037733e-06, + "loss": 0.8067, + "step": 3504 + }, + { + "epoch": 0.19291100225659089, + "grad_norm": 0.7563273906707764, + "learning_rate": 9.78373330787883e-06, + "loss": 0.7018, + "step": 3505 + }, + { + "epoch": 0.19296604105894655, + "grad_norm": 0.8022900223731995, + "learning_rate": 9.78360718476667e-06, + "loss": 0.8346, + "step": 3506 + }, + { + "epoch": 0.1930210798613022, + "grad_norm": 0.897566020488739, + "learning_rate": 9.783481025702197e-06, + "loss": 0.9465, + "step": 3507 + }, + { + "epoch": 0.19307611866365787, + "grad_norm": 0.9550303220748901, + "learning_rate": 9.783354830686363e-06, + "loss": 0.8904, + "step": 3508 + }, + { + "epoch": 0.19313115746601353, + "grad_norm": 0.8152582049369812, + "learning_rate": 9.783228599720114e-06, + "loss": 0.7776, + "step": 3509 + }, + { + "epoch": 0.1931861962683692, + "grad_norm": 0.7421940565109253, + "learning_rate": 9.783102332804398e-06, + "loss": 0.6847, + "step": 3510 + }, + { + "epoch": 0.19324123507072485, + "grad_norm": 0.7414368391036987, + "learning_rate": 9.782976029940167e-06, + "loss": 0.8435, + "step": 3511 + }, + { + "epoch": 0.19329627387308052, + "grad_norm": 0.7845529317855835, + "learning_rate": 9.782849691128366e-06, + "loss": 0.8255, + "step": 3512 + }, + { + "epoch": 0.19335131267543618, + "grad_norm": 0.7779788970947266, + "learning_rate": 9.78272331636995e-06, + "loss": 0.7801, + "step": 3513 + }, + { + "epoch": 0.19340635147779184, + "grad_norm": 0.7537885904312134, + "learning_rate": 9.782596905665865e-06, + "loss": 0.7501, + "step": 3514 + }, + { + "epoch": 0.1934613902801475, + "grad_norm": 0.7585812211036682, + "learning_rate": 9.782470459017059e-06, + "loss": 0.8425, + "step": 3515 + }, + { + "epoch": 0.19351642908250316, + "grad_norm": 0.7923589944839478, + "learning_rate": 9.78234397642449e-06, + "loss": 0.8412, + "step": 3516 + }, + { + "epoch": 0.19357146788485882, + "grad_norm": 0.8710628151893616, + "learning_rate": 9.7822174578891e-06, + "loss": 0.8014, + "step": 3517 + }, + { + "epoch": 0.19362650668721448, + "grad_norm": 0.7646920084953308, + "learning_rate": 9.782090903411845e-06, + "loss": 0.8256, + "step": 3518 + }, + { + "epoch": 0.19368154548957014, + "grad_norm": 0.7560480833053589, + "learning_rate": 9.781964312993675e-06, + "loss": 0.7816, + "step": 3519 + }, + { + "epoch": 0.1937365842919258, + "grad_norm": 0.7438123226165771, + "learning_rate": 9.78183768663554e-06, + "loss": 0.8319, + "step": 3520 + }, + { + "epoch": 0.19379162309428147, + "grad_norm": 0.7239874601364136, + "learning_rate": 9.781711024338394e-06, + "loss": 0.6968, + "step": 3521 + }, + { + "epoch": 0.19384666189663713, + "grad_norm": 0.881197988986969, + "learning_rate": 9.781584326103188e-06, + "loss": 0.9493, + "step": 3522 + }, + { + "epoch": 0.1939017006989928, + "grad_norm": 0.7903854846954346, + "learning_rate": 9.781457591930874e-06, + "loss": 0.8312, + "step": 3523 + }, + { + "epoch": 0.19395673950134845, + "grad_norm": 0.7375456094741821, + "learning_rate": 9.781330821822405e-06, + "loss": 0.7434, + "step": 3524 + }, + { + "epoch": 0.1940117783037041, + "grad_norm": 0.7101724743843079, + "learning_rate": 9.781204015778733e-06, + "loss": 0.75, + "step": 3525 + }, + { + "epoch": 0.19406681710605977, + "grad_norm": 0.8267471194267273, + "learning_rate": 9.781077173800812e-06, + "loss": 0.8807, + "step": 3526 + }, + { + "epoch": 0.19412185590841544, + "grad_norm": 0.9014178514480591, + "learning_rate": 9.780950295889594e-06, + "loss": 0.7836, + "step": 3527 + }, + { + "epoch": 0.1941768947107711, + "grad_norm": 0.7579739689826965, + "learning_rate": 9.780823382046034e-06, + "loss": 0.8331, + "step": 3528 + }, + { + "epoch": 0.19423193351312676, + "grad_norm": 0.8308925032615662, + "learning_rate": 9.780696432271084e-06, + "loss": 0.794, + "step": 3529 + }, + { + "epoch": 0.19428697231548242, + "grad_norm": 0.7461574673652649, + "learning_rate": 9.780569446565701e-06, + "loss": 0.8155, + "step": 3530 + }, + { + "epoch": 0.19434201111783808, + "grad_norm": 0.8658885359764099, + "learning_rate": 9.780442424930836e-06, + "loss": 0.7907, + "step": 3531 + }, + { + "epoch": 0.19439704992019374, + "grad_norm": 0.7243279218673706, + "learning_rate": 9.780315367367449e-06, + "loss": 0.7985, + "step": 3532 + }, + { + "epoch": 0.1944520887225494, + "grad_norm": 0.8482224345207214, + "learning_rate": 9.780188273876486e-06, + "loss": 0.9095, + "step": 3533 + }, + { + "epoch": 0.19450712752490507, + "grad_norm": 0.8675364255905151, + "learning_rate": 9.78006114445891e-06, + "loss": 0.759, + "step": 3534 + }, + { + "epoch": 0.19456216632726073, + "grad_norm": 0.8388474583625793, + "learning_rate": 9.779933979115675e-06, + "loss": 0.8331, + "step": 3535 + }, + { + "epoch": 0.1946172051296164, + "grad_norm": 0.8050872683525085, + "learning_rate": 9.779806777847735e-06, + "loss": 0.861, + "step": 3536 + }, + { + "epoch": 0.19467224393197205, + "grad_norm": 0.8401390910148621, + "learning_rate": 9.779679540656046e-06, + "loss": 0.755, + "step": 3537 + }, + { + "epoch": 0.1947272827343277, + "grad_norm": 0.865160346031189, + "learning_rate": 9.779552267541566e-06, + "loss": 0.7515, + "step": 3538 + }, + { + "epoch": 0.19478232153668337, + "grad_norm": 0.923086941242218, + "learning_rate": 9.77942495850525e-06, + "loss": 0.8032, + "step": 3539 + }, + { + "epoch": 0.19483736033903903, + "grad_norm": 0.8402467966079712, + "learning_rate": 9.779297613548056e-06, + "loss": 0.9198, + "step": 3540 + }, + { + "epoch": 0.1948923991413947, + "grad_norm": 0.7875306606292725, + "learning_rate": 9.779170232670939e-06, + "loss": 0.712, + "step": 3541 + }, + { + "epoch": 0.19494743794375036, + "grad_norm": 0.7996379137039185, + "learning_rate": 9.779042815874858e-06, + "loss": 0.8126, + "step": 3542 + }, + { + "epoch": 0.19500247674610602, + "grad_norm": 0.7644525766372681, + "learning_rate": 9.778915363160773e-06, + "loss": 0.8602, + "step": 3543 + }, + { + "epoch": 0.19505751554846165, + "grad_norm": 0.8068630695343018, + "learning_rate": 9.778787874529635e-06, + "loss": 0.736, + "step": 3544 + }, + { + "epoch": 0.1951125543508173, + "grad_norm": 0.7889519929885864, + "learning_rate": 9.77866034998241e-06, + "loss": 0.755, + "step": 3545 + }, + { + "epoch": 0.19516759315317297, + "grad_norm": 0.7895978689193726, + "learning_rate": 9.778532789520053e-06, + "loss": 0.8213, + "step": 3546 + }, + { + "epoch": 0.19522263195552864, + "grad_norm": 0.8571796417236328, + "learning_rate": 9.77840519314352e-06, + "loss": 0.8193, + "step": 3547 + }, + { + "epoch": 0.1952776707578843, + "grad_norm": 0.6880007982254028, + "learning_rate": 9.778277560853775e-06, + "loss": 0.6354, + "step": 3548 + }, + { + "epoch": 0.19533270956023996, + "grad_norm": 0.8155353665351868, + "learning_rate": 9.778149892651775e-06, + "loss": 0.8518, + "step": 3549 + }, + { + "epoch": 0.19538774836259562, + "grad_norm": 0.851021945476532, + "learning_rate": 9.778022188538479e-06, + "loss": 0.8506, + "step": 3550 + }, + { + "epoch": 0.19544278716495128, + "grad_norm": 0.8910510540008545, + "learning_rate": 9.777894448514847e-06, + "loss": 0.8825, + "step": 3551 + }, + { + "epoch": 0.19549782596730694, + "grad_norm": 0.8156018853187561, + "learning_rate": 9.777766672581838e-06, + "loss": 0.8262, + "step": 3552 + }, + { + "epoch": 0.1955528647696626, + "grad_norm": 0.756340503692627, + "learning_rate": 9.777638860740415e-06, + "loss": 0.7094, + "step": 3553 + }, + { + "epoch": 0.19560790357201827, + "grad_norm": 0.7604243159294128, + "learning_rate": 9.777511012991538e-06, + "loss": 0.8089, + "step": 3554 + }, + { + "epoch": 0.19566294237437393, + "grad_norm": 0.7609277963638306, + "learning_rate": 9.777383129336167e-06, + "loss": 0.7853, + "step": 3555 + }, + { + "epoch": 0.1957179811767296, + "grad_norm": 1.3562177419662476, + "learning_rate": 9.77725520977526e-06, + "loss": 0.7051, + "step": 3556 + }, + { + "epoch": 0.19577301997908525, + "grad_norm": 0.7428582310676575, + "learning_rate": 9.777127254309784e-06, + "loss": 0.734, + "step": 3557 + }, + { + "epoch": 0.1958280587814409, + "grad_norm": 0.6941032409667969, + "learning_rate": 9.776999262940698e-06, + "loss": 0.7862, + "step": 3558 + }, + { + "epoch": 0.19588309758379657, + "grad_norm": 0.8249906301498413, + "learning_rate": 9.776871235668966e-06, + "loss": 0.8324, + "step": 3559 + }, + { + "epoch": 0.19593813638615223, + "grad_norm": 0.6778795719146729, + "learning_rate": 9.776743172495546e-06, + "loss": 0.743, + "step": 3560 + }, + { + "epoch": 0.1959931751885079, + "grad_norm": 0.8454411625862122, + "learning_rate": 9.776615073421405e-06, + "loss": 0.8625, + "step": 3561 + }, + { + "epoch": 0.19604821399086356, + "grad_norm": 0.8303809762001038, + "learning_rate": 9.776486938447503e-06, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.19610325279321922, + "grad_norm": 0.8814080357551575, + "learning_rate": 9.776358767574803e-06, + "loss": 0.9096, + "step": 3563 + }, + { + "epoch": 0.19615829159557488, + "grad_norm": 0.7860022187232971, + "learning_rate": 9.77623056080427e-06, + "loss": 0.8101, + "step": 3564 + }, + { + "epoch": 0.19621333039793054, + "grad_norm": 0.7604898810386658, + "learning_rate": 9.776102318136866e-06, + "loss": 0.8121, + "step": 3565 + }, + { + "epoch": 0.1962683692002862, + "grad_norm": 0.810708224773407, + "learning_rate": 9.775974039573555e-06, + "loss": 0.8334, + "step": 3566 + }, + { + "epoch": 0.19632340800264186, + "grad_norm": 1.0174707174301147, + "learning_rate": 9.775845725115301e-06, + "loss": 0.8147, + "step": 3567 + }, + { + "epoch": 0.19637844680499753, + "grad_norm": 0.825137734413147, + "learning_rate": 9.77571737476307e-06, + "loss": 0.816, + "step": 3568 + }, + { + "epoch": 0.1964334856073532, + "grad_norm": 0.9023691415786743, + "learning_rate": 9.775588988517826e-06, + "loss": 0.9157, + "step": 3569 + }, + { + "epoch": 0.19648852440970885, + "grad_norm": 0.7287655472755432, + "learning_rate": 9.775460566380534e-06, + "loss": 0.7414, + "step": 3570 + }, + { + "epoch": 0.1965435632120645, + "grad_norm": 0.8675361275672913, + "learning_rate": 9.775332108352158e-06, + "loss": 0.7212, + "step": 3571 + }, + { + "epoch": 0.19659860201442017, + "grad_norm": 0.8633139729499817, + "learning_rate": 9.775203614433664e-06, + "loss": 0.7254, + "step": 3572 + }, + { + "epoch": 0.19665364081677583, + "grad_norm": 0.8628275394439697, + "learning_rate": 9.775075084626017e-06, + "loss": 0.7403, + "step": 3573 + }, + { + "epoch": 0.1967086796191315, + "grad_norm": 0.86918044090271, + "learning_rate": 9.774946518930184e-06, + "loss": 0.8208, + "step": 3574 + }, + { + "epoch": 0.19676371842148715, + "grad_norm": 1.3616218566894531, + "learning_rate": 9.774817917347132e-06, + "loss": 0.7432, + "step": 3575 + }, + { + "epoch": 0.19681875722384282, + "grad_norm": 0.929084062576294, + "learning_rate": 9.774689279877827e-06, + "loss": 0.9567, + "step": 3576 + }, + { + "epoch": 0.19687379602619848, + "grad_norm": 0.7732542753219604, + "learning_rate": 9.774560606523234e-06, + "loss": 0.8682, + "step": 3577 + }, + { + "epoch": 0.19692883482855414, + "grad_norm": 0.7933471202850342, + "learning_rate": 9.774431897284323e-06, + "loss": 0.7112, + "step": 3578 + }, + { + "epoch": 0.1969838736309098, + "grad_norm": 0.8229583501815796, + "learning_rate": 9.77430315216206e-06, + "loss": 0.762, + "step": 3579 + }, + { + "epoch": 0.19703891243326546, + "grad_norm": 0.7571341395378113, + "learning_rate": 9.774174371157412e-06, + "loss": 0.7627, + "step": 3580 + }, + { + "epoch": 0.19709395123562112, + "grad_norm": 1.1551839113235474, + "learning_rate": 9.774045554271347e-06, + "loss": 0.8621, + "step": 3581 + }, + { + "epoch": 0.19714899003797678, + "grad_norm": 0.8546237349510193, + "learning_rate": 9.773916701504833e-06, + "loss": 0.8183, + "step": 3582 + }, + { + "epoch": 0.19720402884033245, + "grad_norm": 0.7297555804252625, + "learning_rate": 9.773787812858841e-06, + "loss": 0.8098, + "step": 3583 + }, + { + "epoch": 0.1972590676426881, + "grad_norm": 0.7846053838729858, + "learning_rate": 9.773658888334336e-06, + "loss": 0.7874, + "step": 3584 + }, + { + "epoch": 0.19731410644504377, + "grad_norm": 0.8949562907218933, + "learning_rate": 9.773529927932288e-06, + "loss": 0.8651, + "step": 3585 + }, + { + "epoch": 0.19736914524739943, + "grad_norm": 0.8041829466819763, + "learning_rate": 9.773400931653668e-06, + "loss": 0.7519, + "step": 3586 + }, + { + "epoch": 0.19742418404975506, + "grad_norm": 0.8090983033180237, + "learning_rate": 9.773271899499444e-06, + "loss": 0.8606, + "step": 3587 + }, + { + "epoch": 0.19747922285211073, + "grad_norm": 0.7954100966453552, + "learning_rate": 9.773142831470587e-06, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.1975342616544664, + "grad_norm": 0.6865562796592712, + "learning_rate": 9.773013727568066e-06, + "loss": 0.7323, + "step": 3589 + }, + { + "epoch": 0.19758930045682205, + "grad_norm": 0.9144858717918396, + "learning_rate": 9.772884587792851e-06, + "loss": 0.8178, + "step": 3590 + }, + { + "epoch": 0.1976443392591777, + "grad_norm": 0.8096563220024109, + "learning_rate": 9.772755412145913e-06, + "loss": 0.7749, + "step": 3591 + }, + { + "epoch": 0.19769937806153337, + "grad_norm": 1.4496957063674927, + "learning_rate": 9.772626200628222e-06, + "loss": 0.7981, + "step": 3592 + }, + { + "epoch": 0.19775441686388903, + "grad_norm": 0.7699438333511353, + "learning_rate": 9.77249695324075e-06, + "loss": 0.7683, + "step": 3593 + }, + { + "epoch": 0.1978094556662447, + "grad_norm": 0.7883017063140869, + "learning_rate": 9.77236766998447e-06, + "loss": 0.7668, + "step": 3594 + }, + { + "epoch": 0.19786449446860035, + "grad_norm": 0.7552568912506104, + "learning_rate": 9.772238350860352e-06, + "loss": 0.7914, + "step": 3595 + }, + { + "epoch": 0.19791953327095602, + "grad_norm": 0.8585009574890137, + "learning_rate": 9.772108995869366e-06, + "loss": 0.9888, + "step": 3596 + }, + { + "epoch": 0.19797457207331168, + "grad_norm": 0.9459839463233948, + "learning_rate": 9.77197960501249e-06, + "loss": 0.9923, + "step": 3597 + }, + { + "epoch": 0.19802961087566734, + "grad_norm": 0.844771683216095, + "learning_rate": 9.77185017829069e-06, + "loss": 0.8427, + "step": 3598 + }, + { + "epoch": 0.198084649678023, + "grad_norm": 0.749700665473938, + "learning_rate": 9.77172071570494e-06, + "loss": 0.8111, + "step": 3599 + }, + { + "epoch": 0.19813968848037866, + "grad_norm": 0.7297450304031372, + "learning_rate": 9.771591217256216e-06, + "loss": 0.7783, + "step": 3600 + }, + { + "epoch": 0.19819472728273432, + "grad_norm": 0.7928450703620911, + "learning_rate": 9.77146168294549e-06, + "loss": 0.8755, + "step": 3601 + }, + { + "epoch": 0.19824976608508998, + "grad_norm": 0.7236143946647644, + "learning_rate": 9.771332112773734e-06, + "loss": 0.7159, + "step": 3602 + }, + { + "epoch": 0.19830480488744565, + "grad_norm": 0.8170965313911438, + "learning_rate": 9.771202506741926e-06, + "loss": 0.9093, + "step": 3603 + }, + { + "epoch": 0.1983598436898013, + "grad_norm": 0.8834578990936279, + "learning_rate": 9.771072864851035e-06, + "loss": 0.8961, + "step": 3604 + }, + { + "epoch": 0.19841488249215697, + "grad_norm": 1.3750289678573608, + "learning_rate": 9.770943187102037e-06, + "loss": 0.8175, + "step": 3605 + }, + { + "epoch": 0.19846992129451263, + "grad_norm": 0.7016286253929138, + "learning_rate": 9.770813473495909e-06, + "loss": 0.7171, + "step": 3606 + }, + { + "epoch": 0.1985249600968683, + "grad_norm": 0.7792307734489441, + "learning_rate": 9.770683724033622e-06, + "loss": 0.6892, + "step": 3607 + }, + { + "epoch": 0.19857999889922395, + "grad_norm": 0.789820671081543, + "learning_rate": 9.770553938716153e-06, + "loss": 0.8531, + "step": 3608 + }, + { + "epoch": 0.19863503770157961, + "grad_norm": 0.7585997581481934, + "learning_rate": 9.77042411754448e-06, + "loss": 0.8195, + "step": 3609 + }, + { + "epoch": 0.19869007650393528, + "grad_norm": 0.8989273905754089, + "learning_rate": 9.770294260519573e-06, + "loss": 0.891, + "step": 3610 + }, + { + "epoch": 0.19874511530629094, + "grad_norm": 0.8044012188911438, + "learning_rate": 9.770164367642414e-06, + "loss": 0.8428, + "step": 3611 + }, + { + "epoch": 0.1988001541086466, + "grad_norm": 0.7847021222114563, + "learning_rate": 9.770034438913975e-06, + "loss": 0.8302, + "step": 3612 + }, + { + "epoch": 0.19885519291100226, + "grad_norm": 0.9260531663894653, + "learning_rate": 9.769904474335234e-06, + "loss": 0.8187, + "step": 3613 + }, + { + "epoch": 0.19891023171335792, + "grad_norm": 0.7491805553436279, + "learning_rate": 9.769774473907168e-06, + "loss": 0.8374, + "step": 3614 + }, + { + "epoch": 0.19896527051571358, + "grad_norm": 1.1665992736816406, + "learning_rate": 9.769644437630754e-06, + "loss": 0.8154, + "step": 3615 + }, + { + "epoch": 0.19902030931806924, + "grad_norm": 0.9162279963493347, + "learning_rate": 9.769514365506968e-06, + "loss": 0.8883, + "step": 3616 + }, + { + "epoch": 0.1990753481204249, + "grad_norm": 0.8980437517166138, + "learning_rate": 9.769384257536791e-06, + "loss": 0.8948, + "step": 3617 + }, + { + "epoch": 0.19913038692278057, + "grad_norm": 0.7544137835502625, + "learning_rate": 9.769254113721197e-06, + "loss": 0.7763, + "step": 3618 + }, + { + "epoch": 0.19918542572513623, + "grad_norm": 0.8393334746360779, + "learning_rate": 9.769123934061168e-06, + "loss": 0.8361, + "step": 3619 + }, + { + "epoch": 0.1992404645274919, + "grad_norm": 0.8184031248092651, + "learning_rate": 9.768993718557678e-06, + "loss": 0.8104, + "step": 3620 + }, + { + "epoch": 0.19929550332984755, + "grad_norm": 0.8023706674575806, + "learning_rate": 9.76886346721171e-06, + "loss": 0.7824, + "step": 3621 + }, + { + "epoch": 0.1993505421322032, + "grad_norm": 0.9354264736175537, + "learning_rate": 9.768733180024238e-06, + "loss": 0.7782, + "step": 3622 + }, + { + "epoch": 0.19940558093455887, + "grad_norm": 0.7037177681922913, + "learning_rate": 9.768602856996244e-06, + "loss": 0.8054, + "step": 3623 + }, + { + "epoch": 0.19946061973691454, + "grad_norm": 0.7926928997039795, + "learning_rate": 9.768472498128709e-06, + "loss": 0.8864, + "step": 3624 + }, + { + "epoch": 0.1995156585392702, + "grad_norm": 0.7963769435882568, + "learning_rate": 9.76834210342261e-06, + "loss": 0.8505, + "step": 3625 + }, + { + "epoch": 0.19957069734162586, + "grad_norm": 0.8553926944732666, + "learning_rate": 9.768211672878929e-06, + "loss": 0.8519, + "step": 3626 + }, + { + "epoch": 0.19962573614398152, + "grad_norm": 0.8147156834602356, + "learning_rate": 9.768081206498644e-06, + "loss": 0.8091, + "step": 3627 + }, + { + "epoch": 0.19968077494633718, + "grad_norm": 0.8226443529129028, + "learning_rate": 9.767950704282739e-06, + "loss": 0.8561, + "step": 3628 + }, + { + "epoch": 0.19973581374869284, + "grad_norm": 0.7246909141540527, + "learning_rate": 9.76782016623219e-06, + "loss": 0.7318, + "step": 3629 + }, + { + "epoch": 0.19979085255104848, + "grad_norm": 1.0527293682098389, + "learning_rate": 9.767689592347983e-06, + "loss": 0.7699, + "step": 3630 + }, + { + "epoch": 0.19984589135340414, + "grad_norm": 0.7433847188949585, + "learning_rate": 9.767558982631097e-06, + "loss": 0.8619, + "step": 3631 + }, + { + "epoch": 0.1999009301557598, + "grad_norm": 0.7901468873023987, + "learning_rate": 9.767428337082513e-06, + "loss": 0.8365, + "step": 3632 + }, + { + "epoch": 0.19995596895811546, + "grad_norm": 0.7766845226287842, + "learning_rate": 9.767297655703215e-06, + "loss": 0.7767, + "step": 3633 + }, + { + "epoch": 0.20001100776047112, + "grad_norm": 0.7785109281539917, + "learning_rate": 9.767166938494183e-06, + "loss": 0.7114, + "step": 3634 + }, + { + "epoch": 0.20006604656282678, + "grad_norm": 0.8068187832832336, + "learning_rate": 9.767036185456402e-06, + "loss": 0.8142, + "step": 3635 + }, + { + "epoch": 0.20012108536518244, + "grad_norm": 0.7893292307853699, + "learning_rate": 9.766905396590851e-06, + "loss": 0.8658, + "step": 3636 + }, + { + "epoch": 0.2001761241675381, + "grad_norm": 0.8647506237030029, + "learning_rate": 9.766774571898516e-06, + "loss": 0.84, + "step": 3637 + }, + { + "epoch": 0.20023116296989377, + "grad_norm": 0.8545078635215759, + "learning_rate": 9.766643711380378e-06, + "loss": 0.8455, + "step": 3638 + }, + { + "epoch": 0.20028620177224943, + "grad_norm": 0.924404501914978, + "learning_rate": 9.766512815037424e-06, + "loss": 0.6954, + "step": 3639 + }, + { + "epoch": 0.2003412405746051, + "grad_norm": 0.8077614903450012, + "learning_rate": 9.766381882870635e-06, + "loss": 0.7724, + "step": 3640 + }, + { + "epoch": 0.20039627937696075, + "grad_norm": 0.8886739015579224, + "learning_rate": 9.766250914880994e-06, + "loss": 0.8318, + "step": 3641 + }, + { + "epoch": 0.2004513181793164, + "grad_norm": 0.8086267113685608, + "learning_rate": 9.76611991106949e-06, + "loss": 0.8494, + "step": 3642 + }, + { + "epoch": 0.20050635698167207, + "grad_norm": 0.8606873750686646, + "learning_rate": 9.765988871437101e-06, + "loss": 0.8488, + "step": 3643 + }, + { + "epoch": 0.20056139578402774, + "grad_norm": 0.6966355443000793, + "learning_rate": 9.76585779598482e-06, + "loss": 0.7361, + "step": 3644 + }, + { + "epoch": 0.2006164345863834, + "grad_norm": 0.8474385738372803, + "learning_rate": 9.765726684713623e-06, + "loss": 0.8354, + "step": 3645 + }, + { + "epoch": 0.20067147338873906, + "grad_norm": 0.7609736919403076, + "learning_rate": 9.765595537624502e-06, + "loss": 0.7297, + "step": 3646 + }, + { + "epoch": 0.20072651219109472, + "grad_norm": 1.08648681640625, + "learning_rate": 9.76546435471844e-06, + "loss": 0.7534, + "step": 3647 + }, + { + "epoch": 0.20078155099345038, + "grad_norm": 0.7437332272529602, + "learning_rate": 9.765333135996425e-06, + "loss": 0.8532, + "step": 3648 + }, + { + "epoch": 0.20083658979580604, + "grad_norm": 0.9016552567481995, + "learning_rate": 9.76520188145944e-06, + "loss": 0.7968, + "step": 3649 + }, + { + "epoch": 0.2008916285981617, + "grad_norm": 0.8916428089141846, + "learning_rate": 9.765070591108473e-06, + "loss": 0.9601, + "step": 3650 + }, + { + "epoch": 0.20094666740051736, + "grad_norm": 0.7679058313369751, + "learning_rate": 9.764939264944512e-06, + "loss": 0.816, + "step": 3651 + }, + { + "epoch": 0.20100170620287303, + "grad_norm": 0.7716549634933472, + "learning_rate": 9.764807902968543e-06, + "loss": 0.876, + "step": 3652 + }, + { + "epoch": 0.2010567450052287, + "grad_norm": 0.8288074731826782, + "learning_rate": 9.764676505181554e-06, + "loss": 0.8054, + "step": 3653 + }, + { + "epoch": 0.20111178380758435, + "grad_norm": 0.7906842827796936, + "learning_rate": 9.76454507158453e-06, + "loss": 0.8026, + "step": 3654 + }, + { + "epoch": 0.20116682260994, + "grad_norm": 0.8093311190605164, + "learning_rate": 9.764413602178461e-06, + "loss": 0.8093, + "step": 3655 + }, + { + "epoch": 0.20122186141229567, + "grad_norm": 0.7234730124473572, + "learning_rate": 9.764282096964335e-06, + "loss": 0.7194, + "step": 3656 + }, + { + "epoch": 0.20127690021465133, + "grad_norm": 0.9048555493354797, + "learning_rate": 9.76415055594314e-06, + "loss": 0.8996, + "step": 3657 + }, + { + "epoch": 0.201331939017007, + "grad_norm": 0.7630691528320312, + "learning_rate": 9.764018979115864e-06, + "loss": 0.7876, + "step": 3658 + }, + { + "epoch": 0.20138697781936266, + "grad_norm": 0.9551032781600952, + "learning_rate": 9.763887366483498e-06, + "loss": 0.8249, + "step": 3659 + }, + { + "epoch": 0.20144201662171832, + "grad_norm": 0.6988314986228943, + "learning_rate": 9.76375571804703e-06, + "loss": 0.8011, + "step": 3660 + }, + { + "epoch": 0.20149705542407398, + "grad_norm": 0.7790704369544983, + "learning_rate": 9.763624033807448e-06, + "loss": 0.8287, + "step": 3661 + }, + { + "epoch": 0.20155209422642964, + "grad_norm": 0.7201293706893921, + "learning_rate": 9.763492313765743e-06, + "loss": 0.7854, + "step": 3662 + }, + { + "epoch": 0.2016071330287853, + "grad_norm": 0.8691730499267578, + "learning_rate": 9.763360557922905e-06, + "loss": 0.8348, + "step": 3663 + }, + { + "epoch": 0.20166217183114096, + "grad_norm": 0.7660881876945496, + "learning_rate": 9.763228766279924e-06, + "loss": 0.7686, + "step": 3664 + }, + { + "epoch": 0.20171721063349662, + "grad_norm": 1.083796501159668, + "learning_rate": 9.76309693883779e-06, + "loss": 0.8848, + "step": 3665 + }, + { + "epoch": 0.20177224943585229, + "grad_norm": 0.7892678380012512, + "learning_rate": 9.762965075597496e-06, + "loss": 0.7804, + "step": 3666 + }, + { + "epoch": 0.20182728823820795, + "grad_norm": 0.7166122198104858, + "learning_rate": 9.762833176560031e-06, + "loss": 0.761, + "step": 3667 + }, + { + "epoch": 0.2018823270405636, + "grad_norm": 0.8187084794044495, + "learning_rate": 9.762701241726386e-06, + "loss": 0.8251, + "step": 3668 + }, + { + "epoch": 0.20193736584291927, + "grad_norm": 0.6930577158927917, + "learning_rate": 9.762569271097556e-06, + "loss": 0.6795, + "step": 3669 + }, + { + "epoch": 0.20199240464527493, + "grad_norm": 0.8085465431213379, + "learning_rate": 9.762437264674527e-06, + "loss": 0.8415, + "step": 3670 + }, + { + "epoch": 0.2020474434476306, + "grad_norm": 0.8111084699630737, + "learning_rate": 9.762305222458294e-06, + "loss": 0.792, + "step": 3671 + }, + { + "epoch": 0.20210248224998625, + "grad_norm": 0.8200401067733765, + "learning_rate": 9.762173144449852e-06, + "loss": 0.8224, + "step": 3672 + }, + { + "epoch": 0.2021575210523419, + "grad_norm": 0.8460109233856201, + "learning_rate": 9.762041030650192e-06, + "loss": 0.9025, + "step": 3673 + }, + { + "epoch": 0.20221255985469755, + "grad_norm": 0.8152671456336975, + "learning_rate": 9.761908881060303e-06, + "loss": 0.9002, + "step": 3674 + }, + { + "epoch": 0.2022675986570532, + "grad_norm": 0.8204773664474487, + "learning_rate": 9.761776695681185e-06, + "loss": 0.8324, + "step": 3675 + }, + { + "epoch": 0.20232263745940887, + "grad_norm": 0.8121044039726257, + "learning_rate": 9.761644474513825e-06, + "loss": 0.855, + "step": 3676 + }, + { + "epoch": 0.20237767626176453, + "grad_norm": 0.79920494556427, + "learning_rate": 9.76151221755922e-06, + "loss": 0.7837, + "step": 3677 + }, + { + "epoch": 0.2024327150641202, + "grad_norm": 0.862808346748352, + "learning_rate": 9.761379924818367e-06, + "loss": 0.8714, + "step": 3678 + }, + { + "epoch": 0.20248775386647586, + "grad_norm": 0.7135004997253418, + "learning_rate": 9.761247596292254e-06, + "loss": 0.774, + "step": 3679 + }, + { + "epoch": 0.20254279266883152, + "grad_norm": 0.7967603802680969, + "learning_rate": 9.761115231981878e-06, + "loss": 0.919, + "step": 3680 + }, + { + "epoch": 0.20259783147118718, + "grad_norm": 0.7425099611282349, + "learning_rate": 9.760982831888236e-06, + "loss": 0.819, + "step": 3681 + }, + { + "epoch": 0.20265287027354284, + "grad_norm": 0.7631763815879822, + "learning_rate": 9.760850396012323e-06, + "loss": 0.816, + "step": 3682 + }, + { + "epoch": 0.2027079090758985, + "grad_norm": 0.7931755185127258, + "learning_rate": 9.76071792435513e-06, + "loss": 0.8299, + "step": 3683 + }, + { + "epoch": 0.20276294787825416, + "grad_norm": 0.8409438729286194, + "learning_rate": 9.760585416917657e-06, + "loss": 0.8503, + "step": 3684 + }, + { + "epoch": 0.20281798668060982, + "grad_norm": 0.7632728815078735, + "learning_rate": 9.760452873700898e-06, + "loss": 0.8394, + "step": 3685 + }, + { + "epoch": 0.20287302548296549, + "grad_norm": 0.7765083312988281, + "learning_rate": 9.76032029470585e-06, + "loss": 0.8879, + "step": 3686 + }, + { + "epoch": 0.20292806428532115, + "grad_norm": 0.7736936807632446, + "learning_rate": 9.760187679933507e-06, + "loss": 0.7987, + "step": 3687 + }, + { + "epoch": 0.2029831030876768, + "grad_norm": 0.8270270824432373, + "learning_rate": 9.760055029384869e-06, + "loss": 0.8267, + "step": 3688 + }, + { + "epoch": 0.20303814189003247, + "grad_norm": 0.7742369174957275, + "learning_rate": 9.759922343060932e-06, + "loss": 0.8447, + "step": 3689 + }, + { + "epoch": 0.20309318069238813, + "grad_norm": 0.7543869018554688, + "learning_rate": 9.759789620962692e-06, + "loss": 0.7325, + "step": 3690 + }, + { + "epoch": 0.2031482194947438, + "grad_norm": 0.7913174033164978, + "learning_rate": 9.759656863091147e-06, + "loss": 0.8622, + "step": 3691 + }, + { + "epoch": 0.20320325829709945, + "grad_norm": 0.7445376515388489, + "learning_rate": 9.759524069447296e-06, + "loss": 0.7115, + "step": 3692 + }, + { + "epoch": 0.20325829709945512, + "grad_norm": 0.7744696140289307, + "learning_rate": 9.759391240032136e-06, + "loss": 0.8437, + "step": 3693 + }, + { + "epoch": 0.20331333590181078, + "grad_norm": 0.6984724998474121, + "learning_rate": 9.759258374846665e-06, + "loss": 0.7415, + "step": 3694 + }, + { + "epoch": 0.20336837470416644, + "grad_norm": 0.7453249096870422, + "learning_rate": 9.759125473891882e-06, + "loss": 0.7708, + "step": 3695 + }, + { + "epoch": 0.2034234135065221, + "grad_norm": 0.7459438443183899, + "learning_rate": 9.758992537168787e-06, + "loss": 0.7961, + "step": 3696 + }, + { + "epoch": 0.20347845230887776, + "grad_norm": 0.808944582939148, + "learning_rate": 9.758859564678377e-06, + "loss": 0.8875, + "step": 3697 + }, + { + "epoch": 0.20353349111123342, + "grad_norm": 0.7202889323234558, + "learning_rate": 9.758726556421652e-06, + "loss": 0.8064, + "step": 3698 + }, + { + "epoch": 0.20358852991358908, + "grad_norm": 0.7874952554702759, + "learning_rate": 9.758593512399613e-06, + "loss": 0.7881, + "step": 3699 + }, + { + "epoch": 0.20364356871594474, + "grad_norm": 0.771300733089447, + "learning_rate": 9.758460432613259e-06, + "loss": 0.8938, + "step": 3700 + }, + { + "epoch": 0.2036986075183004, + "grad_norm": 0.7332000136375427, + "learning_rate": 9.758327317063589e-06, + "loss": 0.7369, + "step": 3701 + }, + { + "epoch": 0.20375364632065607, + "grad_norm": 0.8206236958503723, + "learning_rate": 9.758194165751604e-06, + "loss": 0.8727, + "step": 3702 + }, + { + "epoch": 0.20380868512301173, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.758060978678308e-06, + "loss": 0.8013, + "step": 3703 + }, + { + "epoch": 0.2038637239253674, + "grad_norm": 0.7213704586029053, + "learning_rate": 9.757927755844698e-06, + "loss": 0.7413, + "step": 3704 + }, + { + "epoch": 0.20391876272772305, + "grad_norm": 0.8982640504837036, + "learning_rate": 9.757794497251776e-06, + "loss": 0.9077, + "step": 3705 + }, + { + "epoch": 0.2039738015300787, + "grad_norm": 0.8439363241195679, + "learning_rate": 9.757661202900544e-06, + "loss": 0.7887, + "step": 3706 + }, + { + "epoch": 0.20402884033243437, + "grad_norm": 0.7700560688972473, + "learning_rate": 9.757527872792005e-06, + "loss": 0.8677, + "step": 3707 + }, + { + "epoch": 0.20408387913479004, + "grad_norm": 0.7462438941001892, + "learning_rate": 9.75739450692716e-06, + "loss": 0.7937, + "step": 3708 + }, + { + "epoch": 0.2041389179371457, + "grad_norm": 0.9125999808311462, + "learning_rate": 9.75726110530701e-06, + "loss": 0.9374, + "step": 3709 + }, + { + "epoch": 0.20419395673950136, + "grad_norm": 0.8949875831604004, + "learning_rate": 9.75712766793256e-06, + "loss": 0.8586, + "step": 3710 + }, + { + "epoch": 0.20424899554185702, + "grad_norm": 0.9042442440986633, + "learning_rate": 9.756994194804812e-06, + "loss": 0.9411, + "step": 3711 + }, + { + "epoch": 0.20430403434421268, + "grad_norm": 0.7646238207817078, + "learning_rate": 9.756860685924769e-06, + "loss": 0.8353, + "step": 3712 + }, + { + "epoch": 0.20435907314656834, + "grad_norm": 0.7551934123039246, + "learning_rate": 9.756727141293434e-06, + "loss": 0.8109, + "step": 3713 + }, + { + "epoch": 0.204414111948924, + "grad_norm": 0.7526532411575317, + "learning_rate": 9.756593560911811e-06, + "loss": 0.8509, + "step": 3714 + }, + { + "epoch": 0.20446915075127967, + "grad_norm": 0.8423319458961487, + "learning_rate": 9.756459944780903e-06, + "loss": 0.9003, + "step": 3715 + }, + { + "epoch": 0.2045241895536353, + "grad_norm": 0.7966015934944153, + "learning_rate": 9.756326292901716e-06, + "loss": 0.7606, + "step": 3716 + }, + { + "epoch": 0.20457922835599096, + "grad_norm": 0.7642805576324463, + "learning_rate": 9.756192605275256e-06, + "loss": 0.8321, + "step": 3717 + }, + { + "epoch": 0.20463426715834662, + "grad_norm": 0.7285729646682739, + "learning_rate": 9.756058881902524e-06, + "loss": 0.7375, + "step": 3718 + }, + { + "epoch": 0.20468930596070228, + "grad_norm": 0.852020263671875, + "learning_rate": 9.755925122784525e-06, + "loss": 0.8207, + "step": 3719 + }, + { + "epoch": 0.20474434476305794, + "grad_norm": 0.8227072358131409, + "learning_rate": 9.755791327922268e-06, + "loss": 0.872, + "step": 3720 + }, + { + "epoch": 0.2047993835654136, + "grad_norm": 1.0128127336502075, + "learning_rate": 9.755657497316755e-06, + "loss": 0.9186, + "step": 3721 + }, + { + "epoch": 0.20485442236776927, + "grad_norm": 0.8208017349243164, + "learning_rate": 9.755523630968994e-06, + "loss": 0.6968, + "step": 3722 + }, + { + "epoch": 0.20490946117012493, + "grad_norm": 0.7716407179832458, + "learning_rate": 9.75538972887999e-06, + "loss": 0.8068, + "step": 3723 + }, + { + "epoch": 0.2049644999724806, + "grad_norm": 0.779608964920044, + "learning_rate": 9.75525579105075e-06, + "loss": 0.6968, + "step": 3724 + }, + { + "epoch": 0.20501953877483625, + "grad_norm": 0.7463479042053223, + "learning_rate": 9.75512181748228e-06, + "loss": 0.7581, + "step": 3725 + }, + { + "epoch": 0.2050745775771919, + "grad_norm": 0.8104956150054932, + "learning_rate": 9.754987808175587e-06, + "loss": 0.7838, + "step": 3726 + }, + { + "epoch": 0.20512961637954757, + "grad_norm": 0.7911564707756042, + "learning_rate": 9.75485376313168e-06, + "loss": 0.848, + "step": 3727 + }, + { + "epoch": 0.20518465518190324, + "grad_norm": 0.8340871334075928, + "learning_rate": 9.754719682351564e-06, + "loss": 0.7879, + "step": 3728 + }, + { + "epoch": 0.2052396939842589, + "grad_norm": 1.5543067455291748, + "learning_rate": 9.754585565836247e-06, + "loss": 0.8091, + "step": 3729 + }, + { + "epoch": 0.20529473278661456, + "grad_norm": 0.8262580633163452, + "learning_rate": 9.754451413586739e-06, + "loss": 0.9076, + "step": 3730 + }, + { + "epoch": 0.20534977158897022, + "grad_norm": 0.7558280825614929, + "learning_rate": 9.754317225604045e-06, + "loss": 0.7781, + "step": 3731 + }, + { + "epoch": 0.20540481039132588, + "grad_norm": 0.7197710275650024, + "learning_rate": 9.754183001889177e-06, + "loss": 0.765, + "step": 3732 + }, + { + "epoch": 0.20545984919368154, + "grad_norm": 0.8053440451622009, + "learning_rate": 9.754048742443141e-06, + "loss": 0.7986, + "step": 3733 + }, + { + "epoch": 0.2055148879960372, + "grad_norm": 0.9183983206748962, + "learning_rate": 9.753914447266947e-06, + "loss": 0.8522, + "step": 3734 + }, + { + "epoch": 0.20556992679839287, + "grad_norm": 0.8095504641532898, + "learning_rate": 9.753780116361607e-06, + "loss": 0.7243, + "step": 3735 + }, + { + "epoch": 0.20562496560074853, + "grad_norm": 0.816818356513977, + "learning_rate": 9.753645749728127e-06, + "loss": 0.8262, + "step": 3736 + }, + { + "epoch": 0.2056800044031042, + "grad_norm": 0.8425988554954529, + "learning_rate": 9.753511347367516e-06, + "loss": 0.8142, + "step": 3737 + }, + { + "epoch": 0.20573504320545985, + "grad_norm": 0.7719724178314209, + "learning_rate": 9.753376909280789e-06, + "loss": 0.8444, + "step": 3738 + }, + { + "epoch": 0.2057900820078155, + "grad_norm": 0.877646803855896, + "learning_rate": 9.753242435468952e-06, + "loss": 0.8515, + "step": 3739 + }, + { + "epoch": 0.20584512081017117, + "grad_norm": 0.9261211156845093, + "learning_rate": 9.753107925933017e-06, + "loss": 0.7605, + "step": 3740 + }, + { + "epoch": 0.20590015961252683, + "grad_norm": 0.7790889739990234, + "learning_rate": 9.752973380673995e-06, + "loss": 0.7911, + "step": 3741 + }, + { + "epoch": 0.2059551984148825, + "grad_norm": 0.7112367153167725, + "learning_rate": 9.752838799692899e-06, + "loss": 0.8212, + "step": 3742 + }, + { + "epoch": 0.20601023721723816, + "grad_norm": 0.7568365335464478, + "learning_rate": 9.752704182990736e-06, + "loss": 0.8505, + "step": 3743 + }, + { + "epoch": 0.20606527601959382, + "grad_norm": 0.7501981258392334, + "learning_rate": 9.752569530568523e-06, + "loss": 0.8191, + "step": 3744 + }, + { + "epoch": 0.20612031482194948, + "grad_norm": 0.7822220325469971, + "learning_rate": 9.752434842427268e-06, + "loss": 0.8032, + "step": 3745 + }, + { + "epoch": 0.20617535362430514, + "grad_norm": 0.810197114944458, + "learning_rate": 9.752300118567987e-06, + "loss": 0.7789, + "step": 3746 + }, + { + "epoch": 0.2062303924266608, + "grad_norm": 0.7386943101882935, + "learning_rate": 9.752165358991688e-06, + "loss": 0.7733, + "step": 3747 + }, + { + "epoch": 0.20628543122901646, + "grad_norm": 0.7086807489395142, + "learning_rate": 9.75203056369939e-06, + "loss": 0.6328, + "step": 3748 + }, + { + "epoch": 0.20634047003137213, + "grad_norm": 0.9881154894828796, + "learning_rate": 9.751895732692099e-06, + "loss": 0.8515, + "step": 3749 + }, + { + "epoch": 0.2063955088337278, + "grad_norm": 0.813521683216095, + "learning_rate": 9.751760865970831e-06, + "loss": 0.8438, + "step": 3750 + }, + { + "epoch": 0.20645054763608345, + "grad_norm": 0.8357470631599426, + "learning_rate": 9.751625963536602e-06, + "loss": 0.7635, + "step": 3751 + }, + { + "epoch": 0.2065055864384391, + "grad_norm": 0.8629693388938904, + "learning_rate": 9.751491025390423e-06, + "loss": 0.888, + "step": 3752 + }, + { + "epoch": 0.20656062524079477, + "grad_norm": 0.8844664096832275, + "learning_rate": 9.751356051533311e-06, + "loss": 0.7654, + "step": 3753 + }, + { + "epoch": 0.20661566404315043, + "grad_norm": 0.7006319165229797, + "learning_rate": 9.751221041966276e-06, + "loss": 0.7618, + "step": 3754 + }, + { + "epoch": 0.2066707028455061, + "grad_norm": 0.9291046261787415, + "learning_rate": 9.75108599669034e-06, + "loss": 0.8485, + "step": 3755 + }, + { + "epoch": 0.20672574164786175, + "grad_norm": 0.7670828700065613, + "learning_rate": 9.75095091570651e-06, + "loss": 0.7856, + "step": 3756 + }, + { + "epoch": 0.20678078045021742, + "grad_norm": 0.8709883689880371, + "learning_rate": 9.750815799015804e-06, + "loss": 0.7983, + "step": 3757 + }, + { + "epoch": 0.20683581925257308, + "grad_norm": 0.7688055634498596, + "learning_rate": 9.750680646619241e-06, + "loss": 0.8064, + "step": 3758 + }, + { + "epoch": 0.2068908580549287, + "grad_norm": 0.9492738246917725, + "learning_rate": 9.750545458517832e-06, + "loss": 0.8256, + "step": 3759 + }, + { + "epoch": 0.20694589685728437, + "grad_norm": 0.9685352444648743, + "learning_rate": 9.750410234712596e-06, + "loss": 0.839, + "step": 3760 + }, + { + "epoch": 0.20700093565964003, + "grad_norm": 0.788577139377594, + "learning_rate": 9.750274975204547e-06, + "loss": 0.8743, + "step": 3761 + }, + { + "epoch": 0.2070559744619957, + "grad_norm": 0.8496370315551758, + "learning_rate": 9.750139679994703e-06, + "loss": 0.9286, + "step": 3762 + }, + { + "epoch": 0.20711101326435136, + "grad_norm": 0.9539788961410522, + "learning_rate": 9.750004349084083e-06, + "loss": 0.7568, + "step": 3763 + }, + { + "epoch": 0.20716605206670702, + "grad_norm": 0.8825643062591553, + "learning_rate": 9.7498689824737e-06, + "loss": 0.9339, + "step": 3764 + }, + { + "epoch": 0.20722109086906268, + "grad_norm": 0.7771373391151428, + "learning_rate": 9.749733580164573e-06, + "loss": 0.851, + "step": 3765 + }, + { + "epoch": 0.20727612967141834, + "grad_norm": 0.7460281252861023, + "learning_rate": 9.749598142157721e-06, + "loss": 0.8208, + "step": 3766 + }, + { + "epoch": 0.207331168473774, + "grad_norm": 0.8370739817619324, + "learning_rate": 9.74946266845416e-06, + "loss": 0.8634, + "step": 3767 + }, + { + "epoch": 0.20738620727612966, + "grad_norm": 0.7770463228225708, + "learning_rate": 9.749327159054907e-06, + "loss": 0.7955, + "step": 3768 + }, + { + "epoch": 0.20744124607848533, + "grad_norm": 0.8048208355903625, + "learning_rate": 9.749191613960985e-06, + "loss": 0.7736, + "step": 3769 + }, + { + "epoch": 0.207496284880841, + "grad_norm": 0.9187547564506531, + "learning_rate": 9.74905603317341e-06, + "loss": 0.8534, + "step": 3770 + }, + { + "epoch": 0.20755132368319665, + "grad_norm": 0.7304024696350098, + "learning_rate": 9.7489204166932e-06, + "loss": 0.72, + "step": 3771 + }, + { + "epoch": 0.2076063624855523, + "grad_norm": 0.86177659034729, + "learning_rate": 9.748784764521376e-06, + "loss": 0.7838, + "step": 3772 + }, + { + "epoch": 0.20766140128790797, + "grad_norm": 0.7988011837005615, + "learning_rate": 9.748649076658956e-06, + "loss": 0.7776, + "step": 3773 + }, + { + "epoch": 0.20771644009026363, + "grad_norm": 0.706099808216095, + "learning_rate": 9.74851335310696e-06, + "loss": 0.759, + "step": 3774 + }, + { + "epoch": 0.2077714788926193, + "grad_norm": 0.8125914931297302, + "learning_rate": 9.748377593866412e-06, + "loss": 0.8155, + "step": 3775 + }, + { + "epoch": 0.20782651769497495, + "grad_norm": 0.8603429794311523, + "learning_rate": 9.748241798938326e-06, + "loss": 0.8018, + "step": 3776 + }, + { + "epoch": 0.20788155649733062, + "grad_norm": 0.7735254764556885, + "learning_rate": 9.748105968323726e-06, + "loss": 0.7788, + "step": 3777 + }, + { + "epoch": 0.20793659529968628, + "grad_norm": 0.9037501811981201, + "learning_rate": 9.747970102023635e-06, + "loss": 0.8907, + "step": 3778 + }, + { + "epoch": 0.20799163410204194, + "grad_norm": 0.8781846761703491, + "learning_rate": 9.74783420003907e-06, + "loss": 0.867, + "step": 3779 + }, + { + "epoch": 0.2080466729043976, + "grad_norm": 0.8486423492431641, + "learning_rate": 9.747698262371052e-06, + "loss": 0.817, + "step": 3780 + }, + { + "epoch": 0.20810171170675326, + "grad_norm": 0.8242751359939575, + "learning_rate": 9.747562289020607e-06, + "loss": 0.7385, + "step": 3781 + }, + { + "epoch": 0.20815675050910892, + "grad_norm": 0.8776529431343079, + "learning_rate": 9.747426279988754e-06, + "loss": 0.8222, + "step": 3782 + }, + { + "epoch": 0.20821178931146458, + "grad_norm": 0.7428975105285645, + "learning_rate": 9.747290235276517e-06, + "loss": 0.6954, + "step": 3783 + }, + { + "epoch": 0.20826682811382025, + "grad_norm": 0.8631997108459473, + "learning_rate": 9.747154154884917e-06, + "loss": 0.7956, + "step": 3784 + }, + { + "epoch": 0.2083218669161759, + "grad_norm": 0.7819229364395142, + "learning_rate": 9.747018038814976e-06, + "loss": 0.778, + "step": 3785 + }, + { + "epoch": 0.20837690571853157, + "grad_norm": 0.7770963311195374, + "learning_rate": 9.746881887067718e-06, + "loss": 0.8055, + "step": 3786 + }, + { + "epoch": 0.20843194452088723, + "grad_norm": 0.7168729305267334, + "learning_rate": 9.746745699644169e-06, + "loss": 0.7476, + "step": 3787 + }, + { + "epoch": 0.2084869833232429, + "grad_norm": 0.7963632941246033, + "learning_rate": 9.746609476545348e-06, + "loss": 0.8083, + "step": 3788 + }, + { + "epoch": 0.20854202212559855, + "grad_norm": 0.6689679026603699, + "learning_rate": 9.746473217772281e-06, + "loss": 0.6687, + "step": 3789 + }, + { + "epoch": 0.20859706092795421, + "grad_norm": 0.8085560202598572, + "learning_rate": 9.746336923325991e-06, + "loss": 0.8221, + "step": 3790 + }, + { + "epoch": 0.20865209973030988, + "grad_norm": 0.7215744256973267, + "learning_rate": 9.746200593207505e-06, + "loss": 0.7261, + "step": 3791 + }, + { + "epoch": 0.20870713853266554, + "grad_norm": 0.7821729779243469, + "learning_rate": 9.746064227417844e-06, + "loss": 0.7683, + "step": 3792 + }, + { + "epoch": 0.2087621773350212, + "grad_norm": 1.0014925003051758, + "learning_rate": 9.745927825958036e-06, + "loss": 0.7485, + "step": 3793 + }, + { + "epoch": 0.20881721613737686, + "grad_norm": 0.9447367787361145, + "learning_rate": 9.745791388829102e-06, + "loss": 0.835, + "step": 3794 + }, + { + "epoch": 0.20887225493973252, + "grad_norm": 0.7333751916885376, + "learning_rate": 9.745654916032073e-06, + "loss": 0.811, + "step": 3795 + }, + { + "epoch": 0.20892729374208818, + "grad_norm": 0.7516912221908569, + "learning_rate": 9.745518407567973e-06, + "loss": 0.7669, + "step": 3796 + }, + { + "epoch": 0.20898233254444384, + "grad_norm": 0.7826053500175476, + "learning_rate": 9.745381863437824e-06, + "loss": 0.7963, + "step": 3797 + }, + { + "epoch": 0.2090373713467995, + "grad_norm": 0.8258751630783081, + "learning_rate": 9.745245283642658e-06, + "loss": 0.7929, + "step": 3798 + }, + { + "epoch": 0.20909241014915517, + "grad_norm": 0.7990522980690002, + "learning_rate": 9.745108668183497e-06, + "loss": 0.8518, + "step": 3799 + }, + { + "epoch": 0.20914744895151083, + "grad_norm": 1.3855403661727905, + "learning_rate": 9.744972017061369e-06, + "loss": 0.7768, + "step": 3800 + }, + { + "epoch": 0.2092024877538665, + "grad_norm": 0.8456707000732422, + "learning_rate": 9.744835330277302e-06, + "loss": 0.7629, + "step": 3801 + }, + { + "epoch": 0.20925752655622212, + "grad_norm": 0.8992564678192139, + "learning_rate": 9.744698607832323e-06, + "loss": 0.8991, + "step": 3802 + }, + { + "epoch": 0.20931256535857778, + "grad_norm": 0.8533509969711304, + "learning_rate": 9.744561849727459e-06, + "loss": 0.8883, + "step": 3803 + }, + { + "epoch": 0.20936760416093345, + "grad_norm": 0.8363122940063477, + "learning_rate": 9.744425055963739e-06, + "loss": 0.8537, + "step": 3804 + }, + { + "epoch": 0.2094226429632891, + "grad_norm": 0.7462213039398193, + "learning_rate": 9.744288226542189e-06, + "loss": 0.7713, + "step": 3805 + }, + { + "epoch": 0.20947768176564477, + "grad_norm": 0.8148539066314697, + "learning_rate": 9.744151361463841e-06, + "loss": 0.7887, + "step": 3806 + }, + { + "epoch": 0.20953272056800043, + "grad_norm": 0.7504319548606873, + "learning_rate": 9.744014460729718e-06, + "loss": 0.7385, + "step": 3807 + }, + { + "epoch": 0.2095877593703561, + "grad_norm": 0.9291114807128906, + "learning_rate": 9.743877524340854e-06, + "loss": 0.9886, + "step": 3808 + }, + { + "epoch": 0.20964279817271175, + "grad_norm": 0.7747925519943237, + "learning_rate": 9.743740552298276e-06, + "loss": 0.8772, + "step": 3809 + }, + { + "epoch": 0.20969783697506741, + "grad_norm": 0.7283097505569458, + "learning_rate": 9.743603544603016e-06, + "loss": 0.7403, + "step": 3810 + }, + { + "epoch": 0.20975287577742308, + "grad_norm": 0.8403457999229431, + "learning_rate": 9.743466501256098e-06, + "loss": 0.7998, + "step": 3811 + }, + { + "epoch": 0.20980791457977874, + "grad_norm": 0.8218665719032288, + "learning_rate": 9.743329422258557e-06, + "loss": 0.8019, + "step": 3812 + }, + { + "epoch": 0.2098629533821344, + "grad_norm": 0.6991317868232727, + "learning_rate": 9.743192307611423e-06, + "loss": 0.743, + "step": 3813 + }, + { + "epoch": 0.20991799218449006, + "grad_norm": 0.767295241355896, + "learning_rate": 9.743055157315725e-06, + "loss": 0.8003, + "step": 3814 + }, + { + "epoch": 0.20997303098684572, + "grad_norm": 0.9457303285598755, + "learning_rate": 9.742917971372492e-06, + "loss": 0.8448, + "step": 3815 + }, + { + "epoch": 0.21002806978920138, + "grad_norm": 0.7839058637619019, + "learning_rate": 9.742780749782758e-06, + "loss": 0.8828, + "step": 3816 + }, + { + "epoch": 0.21008310859155704, + "grad_norm": 0.7831344604492188, + "learning_rate": 9.742643492547553e-06, + "loss": 0.7714, + "step": 3817 + }, + { + "epoch": 0.2101381473939127, + "grad_norm": 0.7637175917625427, + "learning_rate": 9.74250619966791e-06, + "loss": 0.7508, + "step": 3818 + }, + { + "epoch": 0.21019318619626837, + "grad_norm": 0.8830221891403198, + "learning_rate": 9.74236887114486e-06, + "loss": 0.8508, + "step": 3819 + }, + { + "epoch": 0.21024822499862403, + "grad_norm": 0.7803365588188171, + "learning_rate": 9.742231506979434e-06, + "loss": 0.8094, + "step": 3820 + }, + { + "epoch": 0.2103032638009797, + "grad_norm": 0.7701493501663208, + "learning_rate": 9.742094107172666e-06, + "loss": 0.8851, + "step": 3821 + }, + { + "epoch": 0.21035830260333535, + "grad_norm": 0.6434544324874878, + "learning_rate": 9.741956671725588e-06, + "loss": 0.7015, + "step": 3822 + }, + { + "epoch": 0.210413341405691, + "grad_norm": 0.7294684052467346, + "learning_rate": 9.741819200639233e-06, + "loss": 0.7357, + "step": 3823 + }, + { + "epoch": 0.21046838020804667, + "grad_norm": 0.702367901802063, + "learning_rate": 9.741681693914635e-06, + "loss": 0.7518, + "step": 3824 + }, + { + "epoch": 0.21052341901040234, + "grad_norm": 0.7567246556282043, + "learning_rate": 9.741544151552826e-06, + "loss": 0.8259, + "step": 3825 + }, + { + "epoch": 0.210578457812758, + "grad_norm": 0.7478607892990112, + "learning_rate": 9.741406573554841e-06, + "loss": 0.81, + "step": 3826 + }, + { + "epoch": 0.21063349661511366, + "grad_norm": 0.7270129323005676, + "learning_rate": 9.741268959921712e-06, + "loss": 0.8201, + "step": 3827 + }, + { + "epoch": 0.21068853541746932, + "grad_norm": 0.8108176589012146, + "learning_rate": 9.741131310654475e-06, + "loss": 0.8425, + "step": 3828 + }, + { + "epoch": 0.21074357421982498, + "grad_norm": 0.7773691415786743, + "learning_rate": 9.740993625754165e-06, + "loss": 0.8372, + "step": 3829 + }, + { + "epoch": 0.21079861302218064, + "grad_norm": 0.8988421559333801, + "learning_rate": 9.740855905221816e-06, + "loss": 0.8285, + "step": 3830 + }, + { + "epoch": 0.2108536518245363, + "grad_norm": 0.7339534759521484, + "learning_rate": 9.740718149058462e-06, + "loss": 0.7567, + "step": 3831 + }, + { + "epoch": 0.21090869062689196, + "grad_norm": 0.8465108275413513, + "learning_rate": 9.740580357265141e-06, + "loss": 0.8747, + "step": 3832 + }, + { + "epoch": 0.21096372942924763, + "grad_norm": 0.7956714034080505, + "learning_rate": 9.740442529842885e-06, + "loss": 0.7665, + "step": 3833 + }, + { + "epoch": 0.2110187682316033, + "grad_norm": 0.96270751953125, + "learning_rate": 9.740304666792733e-06, + "loss": 0.8338, + "step": 3834 + }, + { + "epoch": 0.21107380703395895, + "grad_norm": 0.812329113483429, + "learning_rate": 9.74016676811572e-06, + "loss": 0.8407, + "step": 3835 + }, + { + "epoch": 0.2111288458363146, + "grad_norm": 0.7975192070007324, + "learning_rate": 9.740028833812882e-06, + "loss": 0.7836, + "step": 3836 + }, + { + "epoch": 0.21118388463867027, + "grad_norm": 0.826621949672699, + "learning_rate": 9.739890863885258e-06, + "loss": 0.732, + "step": 3837 + }, + { + "epoch": 0.21123892344102593, + "grad_norm": 0.9015662670135498, + "learning_rate": 9.73975285833388e-06, + "loss": 0.8837, + "step": 3838 + }, + { + "epoch": 0.2112939622433816, + "grad_norm": 0.7641518712043762, + "learning_rate": 9.73961481715979e-06, + "loss": 0.7334, + "step": 3839 + }, + { + "epoch": 0.21134900104573726, + "grad_norm": 0.8062206506729126, + "learning_rate": 9.739476740364023e-06, + "loss": 0.8381, + "step": 3840 + }, + { + "epoch": 0.21140403984809292, + "grad_norm": 0.7301875352859497, + "learning_rate": 9.739338627947619e-06, + "loss": 0.7389, + "step": 3841 + }, + { + "epoch": 0.21145907865044858, + "grad_norm": 0.8995181322097778, + "learning_rate": 9.739200479911612e-06, + "loss": 0.8111, + "step": 3842 + }, + { + "epoch": 0.21151411745280424, + "grad_norm": 0.7154433131217957, + "learning_rate": 9.739062296257045e-06, + "loss": 0.7501, + "step": 3843 + }, + { + "epoch": 0.2115691562551599, + "grad_norm": 0.8403087854385376, + "learning_rate": 9.738924076984954e-06, + "loss": 0.8212, + "step": 3844 + }, + { + "epoch": 0.21162419505751554, + "grad_norm": 0.7616639137268066, + "learning_rate": 9.738785822096377e-06, + "loss": 0.82, + "step": 3845 + }, + { + "epoch": 0.2116792338598712, + "grad_norm": 0.7897970080375671, + "learning_rate": 9.738647531592356e-06, + "loss": 0.7972, + "step": 3846 + }, + { + "epoch": 0.21173427266222686, + "grad_norm": 0.7909015417098999, + "learning_rate": 9.738509205473928e-06, + "loss": 0.7939, + "step": 3847 + }, + { + "epoch": 0.21178931146458252, + "grad_norm": 0.9553212523460388, + "learning_rate": 9.73837084374213e-06, + "loss": 0.8672, + "step": 3848 + }, + { + "epoch": 0.21184435026693818, + "grad_norm": 0.9558283686637878, + "learning_rate": 9.73823244639801e-06, + "loss": 0.897, + "step": 3849 + }, + { + "epoch": 0.21189938906929384, + "grad_norm": 0.819530725479126, + "learning_rate": 9.7380940134426e-06, + "loss": 0.86, + "step": 3850 + }, + { + "epoch": 0.2119544278716495, + "grad_norm": 0.7301751971244812, + "learning_rate": 9.737955544876945e-06, + "loss": 0.8265, + "step": 3851 + }, + { + "epoch": 0.21200946667400516, + "grad_norm": 0.8564972281455994, + "learning_rate": 9.737817040702085e-06, + "loss": 0.8253, + "step": 3852 + }, + { + "epoch": 0.21206450547636083, + "grad_norm": 0.7715204358100891, + "learning_rate": 9.737678500919059e-06, + "loss": 0.7779, + "step": 3853 + }, + { + "epoch": 0.2121195442787165, + "grad_norm": 0.7296929955482483, + "learning_rate": 9.73753992552891e-06, + "loss": 0.787, + "step": 3854 + }, + { + "epoch": 0.21217458308107215, + "grad_norm": 0.8574217557907104, + "learning_rate": 9.73740131453268e-06, + "loss": 0.797, + "step": 3855 + }, + { + "epoch": 0.2122296218834278, + "grad_norm": 0.8320643901824951, + "learning_rate": 9.737262667931409e-06, + "loss": 0.876, + "step": 3856 + }, + { + "epoch": 0.21228466068578347, + "grad_norm": 0.7313587069511414, + "learning_rate": 9.73712398572614e-06, + "loss": 0.7151, + "step": 3857 + }, + { + "epoch": 0.21233969948813913, + "grad_norm": 0.7039312720298767, + "learning_rate": 9.736985267917916e-06, + "loss": 0.7353, + "step": 3858 + }, + { + "epoch": 0.2123947382904948, + "grad_norm": 0.7893409132957458, + "learning_rate": 9.736846514507776e-06, + "loss": 0.8383, + "step": 3859 + }, + { + "epoch": 0.21244977709285046, + "grad_norm": 0.8771371245384216, + "learning_rate": 9.736707725496767e-06, + "loss": 0.7543, + "step": 3860 + }, + { + "epoch": 0.21250481589520612, + "grad_norm": 1.0067707300186157, + "learning_rate": 9.736568900885932e-06, + "loss": 0.796, + "step": 3861 + }, + { + "epoch": 0.21255985469756178, + "grad_norm": 0.9171931743621826, + "learning_rate": 9.736430040676312e-06, + "loss": 0.8174, + "step": 3862 + }, + { + "epoch": 0.21261489349991744, + "grad_norm": 0.7616068720817566, + "learning_rate": 9.736291144868952e-06, + "loss": 0.7762, + "step": 3863 + }, + { + "epoch": 0.2126699323022731, + "grad_norm": 0.789010226726532, + "learning_rate": 9.736152213464895e-06, + "loss": 0.7749, + "step": 3864 + }, + { + "epoch": 0.21272497110462876, + "grad_norm": 0.7943348288536072, + "learning_rate": 9.736013246465187e-06, + "loss": 0.6687, + "step": 3865 + }, + { + "epoch": 0.21278000990698442, + "grad_norm": 0.8351758718490601, + "learning_rate": 9.73587424387087e-06, + "loss": 0.9201, + "step": 3866 + }, + { + "epoch": 0.21283504870934009, + "grad_norm": 0.7710975408554077, + "learning_rate": 9.735735205682991e-06, + "loss": 0.8357, + "step": 3867 + }, + { + "epoch": 0.21289008751169575, + "grad_norm": 0.8955768942832947, + "learning_rate": 9.73559613190259e-06, + "loss": 0.8396, + "step": 3868 + }, + { + "epoch": 0.2129451263140514, + "grad_norm": 0.8664666414260864, + "learning_rate": 9.735457022530722e-06, + "loss": 0.8176, + "step": 3869 + }, + { + "epoch": 0.21300016511640707, + "grad_norm": 0.7955949902534485, + "learning_rate": 9.735317877568424e-06, + "loss": 0.8421, + "step": 3870 + }, + { + "epoch": 0.21305520391876273, + "grad_norm": 0.849866509437561, + "learning_rate": 9.735178697016742e-06, + "loss": 0.7677, + "step": 3871 + }, + { + "epoch": 0.2131102427211184, + "grad_norm": 0.7782625555992126, + "learning_rate": 9.735039480876727e-06, + "loss": 0.7838, + "step": 3872 + }, + { + "epoch": 0.21316528152347405, + "grad_norm": 0.7734919190406799, + "learning_rate": 9.734900229149423e-06, + "loss": 0.757, + "step": 3873 + }, + { + "epoch": 0.21322032032582972, + "grad_norm": 0.8462040424346924, + "learning_rate": 9.734760941835876e-06, + "loss": 0.8841, + "step": 3874 + }, + { + "epoch": 0.21327535912818538, + "grad_norm": 0.7219869494438171, + "learning_rate": 9.734621618937133e-06, + "loss": 0.7651, + "step": 3875 + }, + { + "epoch": 0.21333039793054104, + "grad_norm": 0.7550874352455139, + "learning_rate": 9.734482260454241e-06, + "loss": 0.8032, + "step": 3876 + }, + { + "epoch": 0.2133854367328967, + "grad_norm": 0.7504588961601257, + "learning_rate": 9.734342866388247e-06, + "loss": 0.7923, + "step": 3877 + }, + { + "epoch": 0.21344047553525236, + "grad_norm": 0.7407390475273132, + "learning_rate": 9.7342034367402e-06, + "loss": 0.7569, + "step": 3878 + }, + { + "epoch": 0.21349551433760802, + "grad_norm": 0.7911562323570251, + "learning_rate": 9.734063971511147e-06, + "loss": 0.8726, + "step": 3879 + }, + { + "epoch": 0.21355055313996368, + "grad_norm": 0.9132450819015503, + "learning_rate": 9.733924470702139e-06, + "loss": 0.9445, + "step": 3880 + }, + { + "epoch": 0.21360559194231934, + "grad_norm": 0.9639442563056946, + "learning_rate": 9.733784934314218e-06, + "loss": 0.7307, + "step": 3881 + }, + { + "epoch": 0.213660630744675, + "grad_norm": 0.7724352478981018, + "learning_rate": 9.73364536234844e-06, + "loss": 0.8337, + "step": 3882 + }, + { + "epoch": 0.21371566954703067, + "grad_norm": 0.9643296599388123, + "learning_rate": 9.733505754805848e-06, + "loss": 0.8277, + "step": 3883 + }, + { + "epoch": 0.21377070834938633, + "grad_norm": 0.8135218620300293, + "learning_rate": 9.733366111687494e-06, + "loss": 0.7933, + "step": 3884 + }, + { + "epoch": 0.213825747151742, + "grad_norm": 0.7527105212211609, + "learning_rate": 9.733226432994426e-06, + "loss": 0.7302, + "step": 3885 + }, + { + "epoch": 0.21388078595409765, + "grad_norm": 1.090550184249878, + "learning_rate": 9.733086718727698e-06, + "loss": 0.8646, + "step": 3886 + }, + { + "epoch": 0.2139358247564533, + "grad_norm": 0.9227491617202759, + "learning_rate": 9.732946968888358e-06, + "loss": 0.8525, + "step": 3887 + }, + { + "epoch": 0.21399086355880895, + "grad_norm": 0.7781830430030823, + "learning_rate": 9.732807183477454e-06, + "loss": 0.8757, + "step": 3888 + }, + { + "epoch": 0.2140459023611646, + "grad_norm": 0.7740090489387512, + "learning_rate": 9.732667362496036e-06, + "loss": 0.7557, + "step": 3889 + }, + { + "epoch": 0.21410094116352027, + "grad_norm": 0.7341694831848145, + "learning_rate": 9.732527505945159e-06, + "loss": 0.7481, + "step": 3890 + }, + { + "epoch": 0.21415597996587593, + "grad_norm": 0.8691402673721313, + "learning_rate": 9.732387613825872e-06, + "loss": 0.8395, + "step": 3891 + }, + { + "epoch": 0.2142110187682316, + "grad_norm": 0.7845497131347656, + "learning_rate": 9.732247686139227e-06, + "loss": 0.6999, + "step": 3892 + }, + { + "epoch": 0.21426605757058725, + "grad_norm": 0.7944281697273254, + "learning_rate": 9.732107722886275e-06, + "loss": 0.7677, + "step": 3893 + }, + { + "epoch": 0.21432109637294292, + "grad_norm": 0.904195249080658, + "learning_rate": 9.731967724068065e-06, + "loss": 0.8429, + "step": 3894 + }, + { + "epoch": 0.21437613517529858, + "grad_norm": 0.7968988418579102, + "learning_rate": 9.731827689685655e-06, + "loss": 0.8224, + "step": 3895 + }, + { + "epoch": 0.21443117397765424, + "grad_norm": 0.773674726486206, + "learning_rate": 9.731687619740095e-06, + "loss": 0.7743, + "step": 3896 + }, + { + "epoch": 0.2144862127800099, + "grad_norm": 0.7873631715774536, + "learning_rate": 9.731547514232439e-06, + "loss": 0.8581, + "step": 3897 + }, + { + "epoch": 0.21454125158236556, + "grad_norm": 0.7989653944969177, + "learning_rate": 9.731407373163735e-06, + "loss": 0.8447, + "step": 3898 + }, + { + "epoch": 0.21459629038472122, + "grad_norm": 0.74820876121521, + "learning_rate": 9.73126719653504e-06, + "loss": 0.8745, + "step": 3899 + }, + { + "epoch": 0.21465132918707688, + "grad_norm": 0.7191246747970581, + "learning_rate": 9.731126984347408e-06, + "loss": 0.7533, + "step": 3900 + }, + { + "epoch": 0.21470636798943254, + "grad_norm": 0.7718465328216553, + "learning_rate": 9.730986736601893e-06, + "loss": 0.8184, + "step": 3901 + }, + { + "epoch": 0.2147614067917882, + "grad_norm": 0.7055066823959351, + "learning_rate": 9.730846453299547e-06, + "loss": 0.7352, + "step": 3902 + }, + { + "epoch": 0.21481644559414387, + "grad_norm": 0.7500855326652527, + "learning_rate": 9.730706134441425e-06, + "loss": 0.8111, + "step": 3903 + }, + { + "epoch": 0.21487148439649953, + "grad_norm": 0.7568232417106628, + "learning_rate": 9.730565780028583e-06, + "loss": 0.8126, + "step": 3904 + }, + { + "epoch": 0.2149265231988552, + "grad_norm": 0.7418738007545471, + "learning_rate": 9.730425390062075e-06, + "loss": 0.8014, + "step": 3905 + }, + { + "epoch": 0.21498156200121085, + "grad_norm": 0.7967441082000732, + "learning_rate": 9.730284964542955e-06, + "loss": 0.7965, + "step": 3906 + }, + { + "epoch": 0.2150366008035665, + "grad_norm": 0.7444791197776794, + "learning_rate": 9.730144503472281e-06, + "loss": 0.7113, + "step": 3907 + }, + { + "epoch": 0.21509163960592217, + "grad_norm": 0.8372869491577148, + "learning_rate": 9.730004006851107e-06, + "loss": 0.838, + "step": 3908 + }, + { + "epoch": 0.21514667840827784, + "grad_norm": 0.7984300851821899, + "learning_rate": 9.729863474680488e-06, + "loss": 0.856, + "step": 3909 + }, + { + "epoch": 0.2152017172106335, + "grad_norm": 0.7508612871170044, + "learning_rate": 9.72972290696148e-06, + "loss": 0.7947, + "step": 3910 + }, + { + "epoch": 0.21525675601298916, + "grad_norm": 0.7559992074966431, + "learning_rate": 9.729582303695142e-06, + "loss": 0.785, + "step": 3911 + }, + { + "epoch": 0.21531179481534482, + "grad_norm": 0.7764164209365845, + "learning_rate": 9.729441664882531e-06, + "loss": 0.8297, + "step": 3912 + }, + { + "epoch": 0.21536683361770048, + "grad_norm": 0.8112726211547852, + "learning_rate": 9.7293009905247e-06, + "loss": 0.8073, + "step": 3913 + }, + { + "epoch": 0.21542187242005614, + "grad_norm": 0.9748952388763428, + "learning_rate": 9.729160280622709e-06, + "loss": 0.7584, + "step": 3914 + }, + { + "epoch": 0.2154769112224118, + "grad_norm": 0.789191484451294, + "learning_rate": 9.729019535177617e-06, + "loss": 0.7568, + "step": 3915 + }, + { + "epoch": 0.21553195002476747, + "grad_norm": 0.7300963401794434, + "learning_rate": 9.728878754190478e-06, + "loss": 0.8029, + "step": 3916 + }, + { + "epoch": 0.21558698882712313, + "grad_norm": 0.9201067686080933, + "learning_rate": 9.728737937662354e-06, + "loss": 0.8665, + "step": 3917 + }, + { + "epoch": 0.2156420276294788, + "grad_norm": 0.8820425271987915, + "learning_rate": 9.728597085594301e-06, + "loss": 0.8378, + "step": 3918 + }, + { + "epoch": 0.21569706643183445, + "grad_norm": 0.7762684226036072, + "learning_rate": 9.728456197987376e-06, + "loss": 0.8005, + "step": 3919 + }, + { + "epoch": 0.2157521052341901, + "grad_norm": 0.8429732918739319, + "learning_rate": 9.728315274842641e-06, + "loss": 0.8337, + "step": 3920 + }, + { + "epoch": 0.21580714403654577, + "grad_norm": 0.7820748090744019, + "learning_rate": 9.728174316161156e-06, + "loss": 0.8085, + "step": 3921 + }, + { + "epoch": 0.21586218283890143, + "grad_norm": 0.8748064637184143, + "learning_rate": 9.728033321943977e-06, + "loss": 0.7734, + "step": 3922 + }, + { + "epoch": 0.2159172216412571, + "grad_norm": 0.8878050446510315, + "learning_rate": 9.727892292192166e-06, + "loss": 0.9226, + "step": 3923 + }, + { + "epoch": 0.21597226044361276, + "grad_norm": 0.8156047463417053, + "learning_rate": 9.72775122690678e-06, + "loss": 0.8111, + "step": 3924 + }, + { + "epoch": 0.21602729924596842, + "grad_norm": 0.7392945885658264, + "learning_rate": 9.727610126088883e-06, + "loss": 0.758, + "step": 3925 + }, + { + "epoch": 0.21608233804832408, + "grad_norm": 0.7573148608207703, + "learning_rate": 9.727468989739532e-06, + "loss": 0.8142, + "step": 3926 + }, + { + "epoch": 0.21613737685067974, + "grad_norm": 0.831847608089447, + "learning_rate": 9.727327817859792e-06, + "loss": 0.7337, + "step": 3927 + }, + { + "epoch": 0.2161924156530354, + "grad_norm": 0.8012371063232422, + "learning_rate": 9.72718661045072e-06, + "loss": 0.8128, + "step": 3928 + }, + { + "epoch": 0.21624745445539106, + "grad_norm": 0.7985890507698059, + "learning_rate": 9.72704536751338e-06, + "loss": 0.8549, + "step": 3929 + }, + { + "epoch": 0.21630249325774673, + "grad_norm": 0.7194695472717285, + "learning_rate": 9.726904089048832e-06, + "loss": 0.775, + "step": 3930 + }, + { + "epoch": 0.21635753206010236, + "grad_norm": 0.8029330968856812, + "learning_rate": 9.726762775058138e-06, + "loss": 0.9167, + "step": 3931 + }, + { + "epoch": 0.21641257086245802, + "grad_norm": 0.7388954162597656, + "learning_rate": 9.72662142554236e-06, + "loss": 0.7295, + "step": 3932 + }, + { + "epoch": 0.21646760966481368, + "grad_norm": 0.798796534538269, + "learning_rate": 9.726480040502559e-06, + "loss": 0.8686, + "step": 3933 + }, + { + "epoch": 0.21652264846716934, + "grad_norm": 0.9977202415466309, + "learning_rate": 9.726338619939802e-06, + "loss": 0.8387, + "step": 3934 + }, + { + "epoch": 0.216577687269525, + "grad_norm": 0.8173295855522156, + "learning_rate": 9.726197163855148e-06, + "loss": 0.7773, + "step": 3935 + }, + { + "epoch": 0.21663272607188067, + "grad_norm": 0.6519538760185242, + "learning_rate": 9.72605567224966e-06, + "loss": 0.6319, + "step": 3936 + }, + { + "epoch": 0.21668776487423633, + "grad_norm": 0.8004894852638245, + "learning_rate": 9.725914145124404e-06, + "loss": 0.8281, + "step": 3937 + }, + { + "epoch": 0.216742803676592, + "grad_norm": 0.7327558398246765, + "learning_rate": 9.725772582480442e-06, + "loss": 0.7105, + "step": 3938 + }, + { + "epoch": 0.21679784247894765, + "grad_norm": 0.7624199986457825, + "learning_rate": 9.725630984318839e-06, + "loss": 0.7823, + "step": 3939 + }, + { + "epoch": 0.2168528812813033, + "grad_norm": 0.7750238180160522, + "learning_rate": 9.725489350640658e-06, + "loss": 0.8147, + "step": 3940 + }, + { + "epoch": 0.21690792008365897, + "grad_norm": 0.6886566877365112, + "learning_rate": 9.725347681446964e-06, + "loss": 0.7263, + "step": 3941 + }, + { + "epoch": 0.21696295888601463, + "grad_norm": 0.882060170173645, + "learning_rate": 9.725205976738821e-06, + "loss": 0.8931, + "step": 3942 + }, + { + "epoch": 0.2170179976883703, + "grad_norm": 0.7946881055831909, + "learning_rate": 9.725064236517297e-06, + "loss": 0.8036, + "step": 3943 + }, + { + "epoch": 0.21707303649072596, + "grad_norm": 0.7062187194824219, + "learning_rate": 9.724922460783453e-06, + "loss": 0.6915, + "step": 3944 + }, + { + "epoch": 0.21712807529308162, + "grad_norm": 0.7978640794754028, + "learning_rate": 9.724780649538356e-06, + "loss": 0.8873, + "step": 3945 + }, + { + "epoch": 0.21718311409543728, + "grad_norm": 0.8828096389770508, + "learning_rate": 9.724638802783073e-06, + "loss": 0.7114, + "step": 3946 + }, + { + "epoch": 0.21723815289779294, + "grad_norm": 0.7301073670387268, + "learning_rate": 9.724496920518672e-06, + "loss": 0.8107, + "step": 3947 + }, + { + "epoch": 0.2172931917001486, + "grad_norm": 0.7944212555885315, + "learning_rate": 9.724355002746213e-06, + "loss": 0.8135, + "step": 3948 + }, + { + "epoch": 0.21734823050250426, + "grad_norm": 0.7988898754119873, + "learning_rate": 9.724213049466768e-06, + "loss": 0.7173, + "step": 3949 + }, + { + "epoch": 0.21740326930485993, + "grad_norm": 0.7734915018081665, + "learning_rate": 9.724071060681401e-06, + "loss": 0.8131, + "step": 3950 + }, + { + "epoch": 0.2174583081072156, + "grad_norm": 0.6856646537780762, + "learning_rate": 9.723929036391183e-06, + "loss": 0.6873, + "step": 3951 + }, + { + "epoch": 0.21751334690957125, + "grad_norm": 0.8652976751327515, + "learning_rate": 9.723786976597179e-06, + "loss": 0.7908, + "step": 3952 + }, + { + "epoch": 0.2175683857119269, + "grad_norm": 0.7325445413589478, + "learning_rate": 9.723644881300453e-06, + "loss": 0.7389, + "step": 3953 + }, + { + "epoch": 0.21762342451428257, + "grad_norm": 0.8596270084381104, + "learning_rate": 9.723502750502079e-06, + "loss": 0.7785, + "step": 3954 + }, + { + "epoch": 0.21767846331663823, + "grad_norm": 0.739248514175415, + "learning_rate": 9.723360584203123e-06, + "loss": 0.8125, + "step": 3955 + }, + { + "epoch": 0.2177335021189939, + "grad_norm": 0.815617561340332, + "learning_rate": 9.723218382404652e-06, + "loss": 0.8682, + "step": 3956 + }, + { + "epoch": 0.21778854092134955, + "grad_norm": 0.758756160736084, + "learning_rate": 9.723076145107738e-06, + "loss": 0.7717, + "step": 3957 + }, + { + "epoch": 0.21784357972370522, + "grad_norm": 0.9007643461227417, + "learning_rate": 9.722933872313445e-06, + "loss": 0.7901, + "step": 3958 + }, + { + "epoch": 0.21789861852606088, + "grad_norm": 0.781548023223877, + "learning_rate": 9.722791564022846e-06, + "loss": 0.8338, + "step": 3959 + }, + { + "epoch": 0.21795365732841654, + "grad_norm": 0.7730190753936768, + "learning_rate": 9.722649220237011e-06, + "loss": 0.8032, + "step": 3960 + }, + { + "epoch": 0.2180086961307722, + "grad_norm": 0.8737791776657104, + "learning_rate": 9.722506840957009e-06, + "loss": 0.8436, + "step": 3961 + }, + { + "epoch": 0.21806373493312786, + "grad_norm": 0.8151329159736633, + "learning_rate": 9.722364426183908e-06, + "loss": 0.8115, + "step": 3962 + }, + { + "epoch": 0.21811877373548352, + "grad_norm": 0.7852860689163208, + "learning_rate": 9.722221975918782e-06, + "loss": 0.7977, + "step": 3963 + }, + { + "epoch": 0.21817381253783918, + "grad_norm": 0.9064140319824219, + "learning_rate": 9.722079490162698e-06, + "loss": 0.8799, + "step": 3964 + }, + { + "epoch": 0.21822885134019485, + "grad_norm": 0.8579906821250916, + "learning_rate": 9.72193696891673e-06, + "loss": 0.7825, + "step": 3965 + }, + { + "epoch": 0.2182838901425505, + "grad_norm": 0.8005900382995605, + "learning_rate": 9.721794412181946e-06, + "loss": 0.8601, + "step": 3966 + }, + { + "epoch": 0.21833892894490617, + "grad_norm": 0.7661529183387756, + "learning_rate": 9.721651819959421e-06, + "loss": 0.7446, + "step": 3967 + }, + { + "epoch": 0.21839396774726183, + "grad_norm": 0.7558436989784241, + "learning_rate": 9.721509192250224e-06, + "loss": 0.7484, + "step": 3968 + }, + { + "epoch": 0.2184490065496175, + "grad_norm": 0.765446126461029, + "learning_rate": 9.721366529055427e-06, + "loss": 0.7727, + "step": 3969 + }, + { + "epoch": 0.21850404535197315, + "grad_norm": 0.7329973578453064, + "learning_rate": 9.721223830376103e-06, + "loss": 0.797, + "step": 3970 + }, + { + "epoch": 0.21855908415432881, + "grad_norm": 0.8881974220275879, + "learning_rate": 9.721081096213324e-06, + "loss": 0.9199, + "step": 3971 + }, + { + "epoch": 0.21861412295668448, + "grad_norm": 0.8246786594390869, + "learning_rate": 9.720938326568165e-06, + "loss": 0.9108, + "step": 3972 + }, + { + "epoch": 0.21866916175904014, + "grad_norm": 0.7187291979789734, + "learning_rate": 9.720795521441697e-06, + "loss": 0.7756, + "step": 3973 + }, + { + "epoch": 0.21872420056139577, + "grad_norm": 0.7880695462226868, + "learning_rate": 9.720652680834995e-06, + "loss": 0.8548, + "step": 3974 + }, + { + "epoch": 0.21877923936375143, + "grad_norm": 0.8841108679771423, + "learning_rate": 9.720509804749128e-06, + "loss": 0.8477, + "step": 3975 + }, + { + "epoch": 0.2188342781661071, + "grad_norm": 0.9061402678489685, + "learning_rate": 9.720366893185173e-06, + "loss": 0.8235, + "step": 3976 + }, + { + "epoch": 0.21888931696846275, + "grad_norm": 0.8342392444610596, + "learning_rate": 9.720223946144206e-06, + "loss": 0.7777, + "step": 3977 + }, + { + "epoch": 0.21894435577081842, + "grad_norm": 0.7933762073516846, + "learning_rate": 9.720080963627299e-06, + "loss": 0.7943, + "step": 3978 + }, + { + "epoch": 0.21899939457317408, + "grad_norm": 0.8358896374702454, + "learning_rate": 9.719937945635527e-06, + "loss": 0.8932, + "step": 3979 + }, + { + "epoch": 0.21905443337552974, + "grad_norm": 0.7479808926582336, + "learning_rate": 9.719794892169964e-06, + "loss": 0.7446, + "step": 3980 + }, + { + "epoch": 0.2191094721778854, + "grad_norm": 0.7920958399772644, + "learning_rate": 9.719651803231685e-06, + "loss": 0.7489, + "step": 3981 + }, + { + "epoch": 0.21916451098024106, + "grad_norm": 0.7098824977874756, + "learning_rate": 9.719508678821768e-06, + "loss": 0.7763, + "step": 3982 + }, + { + "epoch": 0.21921954978259672, + "grad_norm": 0.8733491897583008, + "learning_rate": 9.719365518941288e-06, + "loss": 0.7325, + "step": 3983 + }, + { + "epoch": 0.21927458858495238, + "grad_norm": 0.8328796029090881, + "learning_rate": 9.719222323591318e-06, + "loss": 0.9097, + "step": 3984 + }, + { + "epoch": 0.21932962738730805, + "grad_norm": 0.7869352698326111, + "learning_rate": 9.719079092772936e-06, + "loss": 0.759, + "step": 3985 + }, + { + "epoch": 0.2193846661896637, + "grad_norm": 0.8278539180755615, + "learning_rate": 9.718935826487221e-06, + "loss": 0.8545, + "step": 3986 + }, + { + "epoch": 0.21943970499201937, + "grad_norm": 0.8122449517250061, + "learning_rate": 9.718792524735246e-06, + "loss": 0.7646, + "step": 3987 + }, + { + "epoch": 0.21949474379437503, + "grad_norm": 1.072253942489624, + "learning_rate": 9.71864918751809e-06, + "loss": 0.915, + "step": 3988 + }, + { + "epoch": 0.2195497825967307, + "grad_norm": 0.7770013213157654, + "learning_rate": 9.718505814836829e-06, + "loss": 0.7561, + "step": 3989 + }, + { + "epoch": 0.21960482139908635, + "grad_norm": 0.9011678695678711, + "learning_rate": 9.718362406692544e-06, + "loss": 0.7532, + "step": 3990 + }, + { + "epoch": 0.21965986020144201, + "grad_norm": 0.8867584466934204, + "learning_rate": 9.718218963086307e-06, + "loss": 0.8732, + "step": 3991 + }, + { + "epoch": 0.21971489900379768, + "grad_norm": 0.8884773850440979, + "learning_rate": 9.718075484019201e-06, + "loss": 0.7403, + "step": 3992 + }, + { + "epoch": 0.21976993780615334, + "grad_norm": 0.8995673060417175, + "learning_rate": 9.7179319694923e-06, + "loss": 0.9283, + "step": 3993 + }, + { + "epoch": 0.219824976608509, + "grad_norm": 0.7875818014144897, + "learning_rate": 9.717788419506688e-06, + "loss": 0.8633, + "step": 3994 + }, + { + "epoch": 0.21988001541086466, + "grad_norm": 0.7693219184875488, + "learning_rate": 9.71764483406344e-06, + "loss": 0.8073, + "step": 3995 + }, + { + "epoch": 0.21993505421322032, + "grad_norm": 0.7932817339897156, + "learning_rate": 9.717501213163636e-06, + "loss": 0.7537, + "step": 3996 + }, + { + "epoch": 0.21999009301557598, + "grad_norm": 0.8274912238121033, + "learning_rate": 9.717357556808358e-06, + "loss": 0.7715, + "step": 3997 + }, + { + "epoch": 0.22004513181793164, + "grad_norm": 0.7533993124961853, + "learning_rate": 9.71721386499868e-06, + "loss": 0.7482, + "step": 3998 + }, + { + "epoch": 0.2201001706202873, + "grad_norm": 1.028228759765625, + "learning_rate": 9.717070137735687e-06, + "loss": 0.9897, + "step": 3999 + }, + { + "epoch": 0.22015520942264297, + "grad_norm": 1.1093978881835938, + "learning_rate": 9.716926375020457e-06, + "loss": 0.8701, + "step": 4000 + }, + { + "epoch": 0.22021024822499863, + "grad_norm": 0.7891124486923218, + "learning_rate": 9.716782576854073e-06, + "loss": 0.8533, + "step": 4001 + }, + { + "epoch": 0.2202652870273543, + "grad_norm": 1.1783788204193115, + "learning_rate": 9.716638743237611e-06, + "loss": 0.8088, + "step": 4002 + }, + { + "epoch": 0.22032032582970995, + "grad_norm": 0.8713383078575134, + "learning_rate": 9.716494874172157e-06, + "loss": 0.8382, + "step": 4003 + }, + { + "epoch": 0.2203753646320656, + "grad_norm": 0.7821565270423889, + "learning_rate": 9.716350969658787e-06, + "loss": 0.8168, + "step": 4004 + }, + { + "epoch": 0.22043040343442127, + "grad_norm": 0.7642589211463928, + "learning_rate": 9.716207029698589e-06, + "loss": 0.7209, + "step": 4005 + }, + { + "epoch": 0.22048544223677694, + "grad_norm": 0.935625433921814, + "learning_rate": 9.716063054292639e-06, + "loss": 0.8436, + "step": 4006 + }, + { + "epoch": 0.2205404810391326, + "grad_norm": 0.7064627408981323, + "learning_rate": 9.715919043442024e-06, + "loss": 0.7651, + "step": 4007 + }, + { + "epoch": 0.22059551984148826, + "grad_norm": 0.6980876326560974, + "learning_rate": 9.715774997147823e-06, + "loss": 0.7842, + "step": 4008 + }, + { + "epoch": 0.22065055864384392, + "grad_norm": 0.7691119313240051, + "learning_rate": 9.715630915411118e-06, + "loss": 0.7345, + "step": 4009 + }, + { + "epoch": 0.22070559744619958, + "grad_norm": 0.8870186805725098, + "learning_rate": 9.715486798232994e-06, + "loss": 0.7531, + "step": 4010 + }, + { + "epoch": 0.22076063624855524, + "grad_norm": 0.7225383520126343, + "learning_rate": 9.715342645614533e-06, + "loss": 0.8543, + "step": 4011 + }, + { + "epoch": 0.2208156750509109, + "grad_norm": 0.7517428994178772, + "learning_rate": 9.71519845755682e-06, + "loss": 0.84, + "step": 4012 + }, + { + "epoch": 0.22087071385326656, + "grad_norm": 0.8115549087524414, + "learning_rate": 9.715054234060937e-06, + "loss": 0.7823, + "step": 4013 + }, + { + "epoch": 0.22092575265562223, + "grad_norm": 1.6656148433685303, + "learning_rate": 9.714909975127968e-06, + "loss": 0.8951, + "step": 4014 + }, + { + "epoch": 0.2209807914579779, + "grad_norm": 0.906508207321167, + "learning_rate": 9.714765680758997e-06, + "loss": 0.8599, + "step": 4015 + }, + { + "epoch": 0.22103583026033355, + "grad_norm": 0.8274093866348267, + "learning_rate": 9.71462135095511e-06, + "loss": 0.9568, + "step": 4016 + }, + { + "epoch": 0.22109086906268918, + "grad_norm": 0.7745386958122253, + "learning_rate": 9.714476985717393e-06, + "loss": 0.8641, + "step": 4017 + }, + { + "epoch": 0.22114590786504484, + "grad_norm": 0.8112689256668091, + "learning_rate": 9.714332585046928e-06, + "loss": 0.834, + "step": 4018 + }, + { + "epoch": 0.2212009466674005, + "grad_norm": 0.916847825050354, + "learning_rate": 9.714188148944799e-06, + "loss": 0.8546, + "step": 4019 + }, + { + "epoch": 0.22125598546975617, + "grad_norm": 0.8595414161682129, + "learning_rate": 9.714043677412096e-06, + "loss": 0.9388, + "step": 4020 + }, + { + "epoch": 0.22131102427211183, + "grad_norm": 0.8672438263893127, + "learning_rate": 9.713899170449901e-06, + "loss": 0.8151, + "step": 4021 + }, + { + "epoch": 0.2213660630744675, + "grad_norm": 0.699749767780304, + "learning_rate": 9.713754628059304e-06, + "loss": 0.7433, + "step": 4022 + }, + { + "epoch": 0.22142110187682315, + "grad_norm": 0.8071898818016052, + "learning_rate": 9.713610050241387e-06, + "loss": 0.7663, + "step": 4023 + }, + { + "epoch": 0.2214761406791788, + "grad_norm": 0.745030403137207, + "learning_rate": 9.713465436997239e-06, + "loss": 0.7733, + "step": 4024 + }, + { + "epoch": 0.22153117948153447, + "grad_norm": 0.8034930229187012, + "learning_rate": 9.713320788327947e-06, + "loss": 0.9015, + "step": 4025 + }, + { + "epoch": 0.22158621828389014, + "grad_norm": 0.8549708724021912, + "learning_rate": 9.713176104234597e-06, + "loss": 0.7127, + "step": 4026 + }, + { + "epoch": 0.2216412570862458, + "grad_norm": 0.8432256579399109, + "learning_rate": 9.713031384718277e-06, + "loss": 0.8163, + "step": 4027 + }, + { + "epoch": 0.22169629588860146, + "grad_norm": 0.7623703479766846, + "learning_rate": 9.712886629780075e-06, + "loss": 0.8272, + "step": 4028 + }, + { + "epoch": 0.22175133469095712, + "grad_norm": 0.8425806760787964, + "learning_rate": 9.712741839421079e-06, + "loss": 0.7907, + "step": 4029 + }, + { + "epoch": 0.22180637349331278, + "grad_norm": 0.7477750778198242, + "learning_rate": 9.712597013642376e-06, + "loss": 0.7662, + "step": 4030 + }, + { + "epoch": 0.22186141229566844, + "grad_norm": 0.7761805057525635, + "learning_rate": 9.712452152445056e-06, + "loss": 0.7999, + "step": 4031 + }, + { + "epoch": 0.2219164510980241, + "grad_norm": 0.8604531288146973, + "learning_rate": 9.712307255830207e-06, + "loss": 0.812, + "step": 4032 + }, + { + "epoch": 0.22197148990037976, + "grad_norm": 0.8113332986831665, + "learning_rate": 9.712162323798918e-06, + "loss": 0.8092, + "step": 4033 + }, + { + "epoch": 0.22202652870273543, + "grad_norm": 0.7980128526687622, + "learning_rate": 9.71201735635228e-06, + "loss": 0.6934, + "step": 4034 + }, + { + "epoch": 0.2220815675050911, + "grad_norm": 0.7819470763206482, + "learning_rate": 9.711872353491377e-06, + "loss": 0.8531, + "step": 4035 + }, + { + "epoch": 0.22213660630744675, + "grad_norm": 0.8283445835113525, + "learning_rate": 9.711727315217305e-06, + "loss": 0.8594, + "step": 4036 + }, + { + "epoch": 0.2221916451098024, + "grad_norm": 0.7282612919807434, + "learning_rate": 9.711582241531153e-06, + "loss": 0.7374, + "step": 4037 + }, + { + "epoch": 0.22224668391215807, + "grad_norm": 0.9564353823661804, + "learning_rate": 9.711437132434007e-06, + "loss": 0.7996, + "step": 4038 + }, + { + "epoch": 0.22230172271451373, + "grad_norm": 0.8559701442718506, + "learning_rate": 9.711291987926963e-06, + "loss": 0.949, + "step": 4039 + }, + { + "epoch": 0.2223567615168694, + "grad_norm": 0.7515334486961365, + "learning_rate": 9.71114680801111e-06, + "loss": 0.7188, + "step": 4040 + }, + { + "epoch": 0.22241180031922506, + "grad_norm": 0.7685608863830566, + "learning_rate": 9.711001592687537e-06, + "loss": 0.7679, + "step": 4041 + }, + { + "epoch": 0.22246683912158072, + "grad_norm": 0.6848913431167603, + "learning_rate": 9.710856341957337e-06, + "loss": 0.7666, + "step": 4042 + }, + { + "epoch": 0.22252187792393638, + "grad_norm": 0.7270542979240417, + "learning_rate": 9.710711055821602e-06, + "loss": 0.7563, + "step": 4043 + }, + { + "epoch": 0.22257691672629204, + "grad_norm": 0.7965164184570312, + "learning_rate": 9.710565734281424e-06, + "loss": 0.7586, + "step": 4044 + }, + { + "epoch": 0.2226319555286477, + "grad_norm": 0.7872949242591858, + "learning_rate": 9.710420377337895e-06, + "loss": 0.8423, + "step": 4045 + }, + { + "epoch": 0.22268699433100336, + "grad_norm": 0.7466526627540588, + "learning_rate": 9.710274984992107e-06, + "loss": 0.7578, + "step": 4046 + }, + { + "epoch": 0.22274203313335902, + "grad_norm": 0.7208731770515442, + "learning_rate": 9.710129557245154e-06, + "loss": 0.7019, + "step": 4047 + }, + { + "epoch": 0.22279707193571469, + "grad_norm": 0.6953400373458862, + "learning_rate": 9.709984094098127e-06, + "loss": 0.7234, + "step": 4048 + }, + { + "epoch": 0.22285211073807035, + "grad_norm": 0.7866283059120178, + "learning_rate": 9.709838595552122e-06, + "loss": 0.785, + "step": 4049 + }, + { + "epoch": 0.222907149540426, + "grad_norm": 0.7404114007949829, + "learning_rate": 9.709693061608227e-06, + "loss": 0.7706, + "step": 4050 + }, + { + "epoch": 0.22296218834278167, + "grad_norm": 0.8788254857063293, + "learning_rate": 9.709547492267544e-06, + "loss": 0.8392, + "step": 4051 + }, + { + "epoch": 0.22301722714513733, + "grad_norm": 0.7493161559104919, + "learning_rate": 9.70940188753116e-06, + "loss": 0.8346, + "step": 4052 + }, + { + "epoch": 0.223072265947493, + "grad_norm": 0.7340379357337952, + "learning_rate": 9.709256247400174e-06, + "loss": 0.7715, + "step": 4053 + }, + { + "epoch": 0.22312730474984865, + "grad_norm": 0.7291178107261658, + "learning_rate": 9.709110571875677e-06, + "loss": 0.866, + "step": 4054 + }, + { + "epoch": 0.22318234355220432, + "grad_norm": 0.8046013712882996, + "learning_rate": 9.708964860958765e-06, + "loss": 0.7885, + "step": 4055 + }, + { + "epoch": 0.22323738235455998, + "grad_norm": 0.832941472530365, + "learning_rate": 9.708819114650535e-06, + "loss": 0.873, + "step": 4056 + }, + { + "epoch": 0.22329242115691564, + "grad_norm": 0.6933377981185913, + "learning_rate": 9.70867333295208e-06, + "loss": 0.7944, + "step": 4057 + }, + { + "epoch": 0.2233474599592713, + "grad_norm": 0.7976044416427612, + "learning_rate": 9.708527515864499e-06, + "loss": 0.72, + "step": 4058 + }, + { + "epoch": 0.22340249876162696, + "grad_norm": 0.7698904871940613, + "learning_rate": 9.708381663388884e-06, + "loss": 0.7603, + "step": 4059 + }, + { + "epoch": 0.2234575375639826, + "grad_norm": 0.7554401159286499, + "learning_rate": 9.708235775526331e-06, + "loss": 0.7488, + "step": 4060 + }, + { + "epoch": 0.22351257636633826, + "grad_norm": 0.7382954359054565, + "learning_rate": 9.70808985227794e-06, + "loss": 0.7418, + "step": 4061 + }, + { + "epoch": 0.22356761516869392, + "grad_norm": 0.7220499515533447, + "learning_rate": 9.707943893644806e-06, + "loss": 0.7691, + "step": 4062 + }, + { + "epoch": 0.22362265397104958, + "grad_norm": 0.727542519569397, + "learning_rate": 9.707797899628027e-06, + "loss": 0.7603, + "step": 4063 + }, + { + "epoch": 0.22367769277340524, + "grad_norm": 0.7857500910758972, + "learning_rate": 9.707651870228697e-06, + "loss": 0.8633, + "step": 4064 + }, + { + "epoch": 0.2237327315757609, + "grad_norm": 0.7975600361824036, + "learning_rate": 9.707505805447917e-06, + "loss": 0.8591, + "step": 4065 + }, + { + "epoch": 0.22378777037811656, + "grad_norm": 1.0063475370407104, + "learning_rate": 9.707359705286784e-06, + "loss": 0.7935, + "step": 4066 + }, + { + "epoch": 0.22384280918047222, + "grad_norm": 0.7307062745094299, + "learning_rate": 9.707213569746393e-06, + "loss": 0.797, + "step": 4067 + }, + { + "epoch": 0.22389784798282789, + "grad_norm": 0.7891914248466492, + "learning_rate": 9.707067398827847e-06, + "loss": 0.853, + "step": 4068 + }, + { + "epoch": 0.22395288678518355, + "grad_norm": 0.7479422092437744, + "learning_rate": 9.706921192532242e-06, + "loss": 0.7359, + "step": 4069 + }, + { + "epoch": 0.2240079255875392, + "grad_norm": 0.8436065912246704, + "learning_rate": 9.706774950860676e-06, + "loss": 0.7916, + "step": 4070 + }, + { + "epoch": 0.22406296438989487, + "grad_norm": 0.7586960196495056, + "learning_rate": 9.706628673814252e-06, + "loss": 0.7871, + "step": 4071 + }, + { + "epoch": 0.22411800319225053, + "grad_norm": 0.8181111812591553, + "learning_rate": 9.706482361394064e-06, + "loss": 0.7782, + "step": 4072 + }, + { + "epoch": 0.2241730419946062, + "grad_norm": 0.7205253839492798, + "learning_rate": 9.706336013601217e-06, + "loss": 0.7912, + "step": 4073 + }, + { + "epoch": 0.22422808079696185, + "grad_norm": 0.9823397397994995, + "learning_rate": 9.706189630436806e-06, + "loss": 0.8393, + "step": 4074 + }, + { + "epoch": 0.22428311959931752, + "grad_norm": 0.7360854148864746, + "learning_rate": 9.706043211901935e-06, + "loss": 0.8239, + "step": 4075 + }, + { + "epoch": 0.22433815840167318, + "grad_norm": 0.7590144872665405, + "learning_rate": 9.705896757997701e-06, + "loss": 0.7177, + "step": 4076 + }, + { + "epoch": 0.22439319720402884, + "grad_norm": 0.7691343426704407, + "learning_rate": 9.70575026872521e-06, + "loss": 0.7731, + "step": 4077 + }, + { + "epoch": 0.2244482360063845, + "grad_norm": 0.7057286500930786, + "learning_rate": 9.705603744085556e-06, + "loss": 0.7746, + "step": 4078 + }, + { + "epoch": 0.22450327480874016, + "grad_norm": 0.7954769134521484, + "learning_rate": 9.705457184079847e-06, + "loss": 0.8215, + "step": 4079 + }, + { + "epoch": 0.22455831361109582, + "grad_norm": 0.7089072465896606, + "learning_rate": 9.70531058870918e-06, + "loss": 0.7263, + "step": 4080 + }, + { + "epoch": 0.22461335241345148, + "grad_norm": 0.9847552180290222, + "learning_rate": 9.705163957974657e-06, + "loss": 0.8948, + "step": 4081 + }, + { + "epoch": 0.22466839121580715, + "grad_norm": 0.7977012395858765, + "learning_rate": 9.705017291877383e-06, + "loss": 0.7518, + "step": 4082 + }, + { + "epoch": 0.2247234300181628, + "grad_norm": 0.8084518909454346, + "learning_rate": 9.704870590418458e-06, + "loss": 0.8711, + "step": 4083 + }, + { + "epoch": 0.22477846882051847, + "grad_norm": 0.9151536822319031, + "learning_rate": 9.704723853598986e-06, + "loss": 0.8217, + "step": 4084 + }, + { + "epoch": 0.22483350762287413, + "grad_norm": 0.908136248588562, + "learning_rate": 9.704577081420065e-06, + "loss": 0.6961, + "step": 4085 + }, + { + "epoch": 0.2248885464252298, + "grad_norm": 0.8569996953010559, + "learning_rate": 9.704430273882806e-06, + "loss": 0.8405, + "step": 4086 + }, + { + "epoch": 0.22494358522758545, + "grad_norm": 0.7687774300575256, + "learning_rate": 9.704283430988307e-06, + "loss": 0.6903, + "step": 4087 + }, + { + "epoch": 0.2249986240299411, + "grad_norm": 0.863203763961792, + "learning_rate": 9.704136552737673e-06, + "loss": 0.8927, + "step": 4088 + }, + { + "epoch": 0.22505366283229677, + "grad_norm": 1.252581238746643, + "learning_rate": 9.703989639132008e-06, + "loss": 0.8792, + "step": 4089 + }, + { + "epoch": 0.22510870163465244, + "grad_norm": 0.7844160795211792, + "learning_rate": 9.703842690172415e-06, + "loss": 0.844, + "step": 4090 + }, + { + "epoch": 0.2251637404370081, + "grad_norm": 0.8669766187667847, + "learning_rate": 9.703695705860002e-06, + "loss": 0.7008, + "step": 4091 + }, + { + "epoch": 0.22521877923936376, + "grad_norm": 0.7180137634277344, + "learning_rate": 9.703548686195869e-06, + "loss": 0.8242, + "step": 4092 + }, + { + "epoch": 0.22527381804171942, + "grad_norm": 0.7225000858306885, + "learning_rate": 9.703401631181124e-06, + "loss": 0.724, + "step": 4093 + }, + { + "epoch": 0.22532885684407508, + "grad_norm": 0.8348065614700317, + "learning_rate": 9.70325454081687e-06, + "loss": 0.7996, + "step": 4094 + }, + { + "epoch": 0.22538389564643074, + "grad_norm": 0.8099488019943237, + "learning_rate": 9.703107415104216e-06, + "loss": 0.7498, + "step": 4095 + }, + { + "epoch": 0.2254389344487864, + "grad_norm": 0.7051188945770264, + "learning_rate": 9.702960254044264e-06, + "loss": 0.7322, + "step": 4096 + }, + { + "epoch": 0.22549397325114207, + "grad_norm": 0.742859423160553, + "learning_rate": 9.702813057638122e-06, + "loss": 0.746, + "step": 4097 + }, + { + "epoch": 0.22554901205349773, + "grad_norm": 0.7981536984443665, + "learning_rate": 9.702665825886897e-06, + "loss": 0.8705, + "step": 4098 + }, + { + "epoch": 0.2256040508558534, + "grad_norm": 1.0317178964614868, + "learning_rate": 9.702518558791693e-06, + "loss": 0.8261, + "step": 4099 + }, + { + "epoch": 0.22565908965820905, + "grad_norm": 0.7811983823776245, + "learning_rate": 9.702371256353618e-06, + "loss": 0.7633, + "step": 4100 + }, + { + "epoch": 0.2257141284605647, + "grad_norm": 0.8288078308105469, + "learning_rate": 9.702223918573782e-06, + "loss": 0.7974, + "step": 4101 + }, + { + "epoch": 0.22576916726292034, + "grad_norm": 0.8932577967643738, + "learning_rate": 9.702076545453286e-06, + "loss": 0.7517, + "step": 4102 + }, + { + "epoch": 0.225824206065276, + "grad_norm": 0.8342248201370239, + "learning_rate": 9.701929136993243e-06, + "loss": 0.8634, + "step": 4103 + }, + { + "epoch": 0.22587924486763167, + "grad_norm": 0.790392279624939, + "learning_rate": 9.701781693194761e-06, + "loss": 0.7705, + "step": 4104 + }, + { + "epoch": 0.22593428366998733, + "grad_norm": 0.824691891670227, + "learning_rate": 9.701634214058944e-06, + "loss": 0.877, + "step": 4105 + }, + { + "epoch": 0.225989322472343, + "grad_norm": 0.9237051010131836, + "learning_rate": 9.701486699586904e-06, + "loss": 0.842, + "step": 4106 + }, + { + "epoch": 0.22604436127469865, + "grad_norm": 0.7453535199165344, + "learning_rate": 9.701339149779747e-06, + "loss": 0.8217, + "step": 4107 + }, + { + "epoch": 0.2260994000770543, + "grad_norm": 0.727872371673584, + "learning_rate": 9.701191564638586e-06, + "loss": 0.849, + "step": 4108 + }, + { + "epoch": 0.22615443887940997, + "grad_norm": 0.966585636138916, + "learning_rate": 9.701043944164526e-06, + "loss": 0.7742, + "step": 4109 + }, + { + "epoch": 0.22620947768176564, + "grad_norm": 0.7556117177009583, + "learning_rate": 9.700896288358678e-06, + "loss": 0.7498, + "step": 4110 + }, + { + "epoch": 0.2262645164841213, + "grad_norm": 0.848143458366394, + "learning_rate": 9.700748597222151e-06, + "loss": 0.7237, + "step": 4111 + }, + { + "epoch": 0.22631955528647696, + "grad_norm": 0.9046787619590759, + "learning_rate": 9.700600870756056e-06, + "loss": 0.8066, + "step": 4112 + }, + { + "epoch": 0.22637459408883262, + "grad_norm": 0.923159658908844, + "learning_rate": 9.700453108961505e-06, + "loss": 0.8404, + "step": 4113 + }, + { + "epoch": 0.22642963289118828, + "grad_norm": 0.8697664737701416, + "learning_rate": 9.700305311839606e-06, + "loss": 0.7269, + "step": 4114 + }, + { + "epoch": 0.22648467169354394, + "grad_norm": 0.8179994821548462, + "learning_rate": 9.70015747939147e-06, + "loss": 0.8083, + "step": 4115 + }, + { + "epoch": 0.2265397104958996, + "grad_norm": 0.7961694002151489, + "learning_rate": 9.700009611618208e-06, + "loss": 0.7327, + "step": 4116 + }, + { + "epoch": 0.22659474929825527, + "grad_norm": 0.7317802309989929, + "learning_rate": 9.699861708520934e-06, + "loss": 0.8273, + "step": 4117 + }, + { + "epoch": 0.22664978810061093, + "grad_norm": 0.9190557599067688, + "learning_rate": 9.699713770100757e-06, + "loss": 0.8027, + "step": 4118 + }, + { + "epoch": 0.2267048269029666, + "grad_norm": 0.7618072628974915, + "learning_rate": 9.699565796358788e-06, + "loss": 0.7669, + "step": 4119 + }, + { + "epoch": 0.22675986570532225, + "grad_norm": 1.0236154794692993, + "learning_rate": 9.699417787296139e-06, + "loss": 0.7511, + "step": 4120 + }, + { + "epoch": 0.2268149045076779, + "grad_norm": 0.8011670708656311, + "learning_rate": 9.699269742913927e-06, + "loss": 0.7644, + "step": 4121 + }, + { + "epoch": 0.22686994331003357, + "grad_norm": 0.7808024287223816, + "learning_rate": 9.69912166321326e-06, + "loss": 0.7894, + "step": 4122 + }, + { + "epoch": 0.22692498211238923, + "grad_norm": 0.8645655512809753, + "learning_rate": 9.698973548195252e-06, + "loss": 0.7989, + "step": 4123 + }, + { + "epoch": 0.2269800209147449, + "grad_norm": 0.7478770613670349, + "learning_rate": 9.698825397861017e-06, + "loss": 0.7758, + "step": 4124 + }, + { + "epoch": 0.22703505971710056, + "grad_norm": 0.8988361954689026, + "learning_rate": 9.698677212211668e-06, + "loss": 0.8312, + "step": 4125 + }, + { + "epoch": 0.22709009851945622, + "grad_norm": 0.773028552532196, + "learning_rate": 9.69852899124832e-06, + "loss": 0.7415, + "step": 4126 + }, + { + "epoch": 0.22714513732181188, + "grad_norm": 0.8173778653144836, + "learning_rate": 9.698380734972085e-06, + "loss": 0.8241, + "step": 4127 + }, + { + "epoch": 0.22720017612416754, + "grad_norm": 0.7868672013282776, + "learning_rate": 9.698232443384078e-06, + "loss": 0.7294, + "step": 4128 + }, + { + "epoch": 0.2272552149265232, + "grad_norm": 0.8662189841270447, + "learning_rate": 9.698084116485413e-06, + "loss": 0.9307, + "step": 4129 + }, + { + "epoch": 0.22731025372887886, + "grad_norm": 0.7571321129798889, + "learning_rate": 9.697935754277207e-06, + "loss": 0.7756, + "step": 4130 + }, + { + "epoch": 0.22736529253123453, + "grad_norm": 0.8222649097442627, + "learning_rate": 9.697787356760574e-06, + "loss": 0.8689, + "step": 4131 + }, + { + "epoch": 0.2274203313335902, + "grad_norm": 0.8302241563796997, + "learning_rate": 9.697638923936626e-06, + "loss": 0.8139, + "step": 4132 + }, + { + "epoch": 0.22747537013594585, + "grad_norm": 0.779951274394989, + "learning_rate": 9.697490455806482e-06, + "loss": 0.7493, + "step": 4133 + }, + { + "epoch": 0.2275304089383015, + "grad_norm": 0.8409813046455383, + "learning_rate": 9.697341952371257e-06, + "loss": 0.777, + "step": 4134 + }, + { + "epoch": 0.22758544774065717, + "grad_norm": 0.8599729537963867, + "learning_rate": 9.697193413632068e-06, + "loss": 0.7678, + "step": 4135 + }, + { + "epoch": 0.22764048654301283, + "grad_norm": 0.7505115270614624, + "learning_rate": 9.69704483959003e-06, + "loss": 0.787, + "step": 4136 + }, + { + "epoch": 0.2276955253453685, + "grad_norm": 0.7326868176460266, + "learning_rate": 9.696896230246262e-06, + "loss": 0.7066, + "step": 4137 + }, + { + "epoch": 0.22775056414772415, + "grad_norm": 0.8269753456115723, + "learning_rate": 9.696747585601878e-06, + "loss": 0.7379, + "step": 4138 + }, + { + "epoch": 0.22780560295007982, + "grad_norm": 0.7841970324516296, + "learning_rate": 9.696598905657997e-06, + "loss": 0.764, + "step": 4139 + }, + { + "epoch": 0.22786064175243548, + "grad_norm": 0.7131417989730835, + "learning_rate": 9.696450190415735e-06, + "loss": 0.7629, + "step": 4140 + }, + { + "epoch": 0.22791568055479114, + "grad_norm": 0.7922703623771667, + "learning_rate": 9.69630143987621e-06, + "loss": 0.8354, + "step": 4141 + }, + { + "epoch": 0.2279707193571468, + "grad_norm": 0.9628629684448242, + "learning_rate": 9.696152654040543e-06, + "loss": 0.8077, + "step": 4142 + }, + { + "epoch": 0.22802575815950246, + "grad_norm": 0.8566663265228271, + "learning_rate": 9.696003832909847e-06, + "loss": 0.685, + "step": 4143 + }, + { + "epoch": 0.22808079696185812, + "grad_norm": 0.7181339859962463, + "learning_rate": 9.695854976485244e-06, + "loss": 0.8135, + "step": 4144 + }, + { + "epoch": 0.22813583576421376, + "grad_norm": 0.9119813442230225, + "learning_rate": 9.695706084767853e-06, + "loss": 0.7276, + "step": 4145 + }, + { + "epoch": 0.22819087456656942, + "grad_norm": 0.8547400832176208, + "learning_rate": 9.69555715775879e-06, + "loss": 0.8656, + "step": 4146 + }, + { + "epoch": 0.22824591336892508, + "grad_norm": 0.77585768699646, + "learning_rate": 9.695408195459179e-06, + "loss": 0.8218, + "step": 4147 + }, + { + "epoch": 0.22830095217128074, + "grad_norm": 0.7832447290420532, + "learning_rate": 9.695259197870135e-06, + "loss": 0.8002, + "step": 4148 + }, + { + "epoch": 0.2283559909736364, + "grad_norm": 0.9184865355491638, + "learning_rate": 9.69511016499278e-06, + "loss": 0.8651, + "step": 4149 + }, + { + "epoch": 0.22841102977599206, + "grad_norm": 0.8663797974586487, + "learning_rate": 9.694961096828235e-06, + "loss": 0.7381, + "step": 4150 + }, + { + "epoch": 0.22846606857834773, + "grad_norm": 0.843265950679779, + "learning_rate": 9.694811993377617e-06, + "loss": 0.8546, + "step": 4151 + }, + { + "epoch": 0.2285211073807034, + "grad_norm": 0.8021818399429321, + "learning_rate": 9.694662854642049e-06, + "loss": 0.9166, + "step": 4152 + }, + { + "epoch": 0.22857614618305905, + "grad_norm": 0.7762879729270935, + "learning_rate": 9.694513680622653e-06, + "loss": 0.7055, + "step": 4153 + }, + { + "epoch": 0.2286311849854147, + "grad_norm": 0.809352457523346, + "learning_rate": 9.694364471320548e-06, + "loss": 0.7988, + "step": 4154 + }, + { + "epoch": 0.22868622378777037, + "grad_norm": 0.7239902019500732, + "learning_rate": 9.694215226736858e-06, + "loss": 0.7783, + "step": 4155 + }, + { + "epoch": 0.22874126259012603, + "grad_norm": 0.7072625160217285, + "learning_rate": 9.694065946872702e-06, + "loss": 0.7607, + "step": 4156 + }, + { + "epoch": 0.2287963013924817, + "grad_norm": 0.7696169018745422, + "learning_rate": 9.693916631729201e-06, + "loss": 0.7519, + "step": 4157 + }, + { + "epoch": 0.22885134019483735, + "grad_norm": 0.9198557734489441, + "learning_rate": 9.69376728130748e-06, + "loss": 0.7754, + "step": 4158 + }, + { + "epoch": 0.22890637899719302, + "grad_norm": 0.7589097619056702, + "learning_rate": 9.693617895608662e-06, + "loss": 0.7258, + "step": 4159 + }, + { + "epoch": 0.22896141779954868, + "grad_norm": 0.8351333141326904, + "learning_rate": 9.693468474633867e-06, + "loss": 0.8633, + "step": 4160 + }, + { + "epoch": 0.22901645660190434, + "grad_norm": 0.8331828713417053, + "learning_rate": 9.69331901838422e-06, + "loss": 0.7361, + "step": 4161 + }, + { + "epoch": 0.22907149540426, + "grad_norm": 0.8810774087905884, + "learning_rate": 9.693169526860843e-06, + "loss": 0.7651, + "step": 4162 + }, + { + "epoch": 0.22912653420661566, + "grad_norm": 0.8151684999465942, + "learning_rate": 9.69302000006486e-06, + "loss": 0.8533, + "step": 4163 + }, + { + "epoch": 0.22918157300897132, + "grad_norm": 0.8683320879936218, + "learning_rate": 9.692870437997394e-06, + "loss": 0.8323, + "step": 4164 + }, + { + "epoch": 0.22923661181132698, + "grad_norm": 0.7488875389099121, + "learning_rate": 9.692720840659572e-06, + "loss": 0.8414, + "step": 4165 + }, + { + "epoch": 0.22929165061368265, + "grad_norm": 0.7916452288627625, + "learning_rate": 9.692571208052515e-06, + "loss": 0.7058, + "step": 4166 + }, + { + "epoch": 0.2293466894160383, + "grad_norm": 0.8228384256362915, + "learning_rate": 9.69242154017735e-06, + "loss": 0.7667, + "step": 4167 + }, + { + "epoch": 0.22940172821839397, + "grad_norm": 0.7395613789558411, + "learning_rate": 9.692271837035202e-06, + "loss": 0.7649, + "step": 4168 + }, + { + "epoch": 0.22945676702074963, + "grad_norm": 0.7187666893005371, + "learning_rate": 9.692122098627192e-06, + "loss": 0.7575, + "step": 4169 + }, + { + "epoch": 0.2295118058231053, + "grad_norm": 0.7060030102729797, + "learning_rate": 9.691972324954449e-06, + "loss": 0.8309, + "step": 4170 + }, + { + "epoch": 0.22956684462546095, + "grad_norm": 0.7807210087776184, + "learning_rate": 9.691822516018099e-06, + "loss": 0.8185, + "step": 4171 + }, + { + "epoch": 0.22962188342781661, + "grad_norm": 0.6918593645095825, + "learning_rate": 9.691672671819265e-06, + "loss": 0.6983, + "step": 4172 + }, + { + "epoch": 0.22967692223017228, + "grad_norm": 0.7947858572006226, + "learning_rate": 9.691522792359077e-06, + "loss": 0.8098, + "step": 4173 + }, + { + "epoch": 0.22973196103252794, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.691372877638658e-06, + "loss": 0.8, + "step": 4174 + }, + { + "epoch": 0.2297869998348836, + "grad_norm": 0.7669435739517212, + "learning_rate": 9.691222927659137e-06, + "loss": 0.8121, + "step": 4175 + }, + { + "epoch": 0.22984203863723926, + "grad_norm": 0.8128299117088318, + "learning_rate": 9.691072942421642e-06, + "loss": 0.7554, + "step": 4176 + }, + { + "epoch": 0.22989707743959492, + "grad_norm": 0.9043960571289062, + "learning_rate": 9.690922921927295e-06, + "loss": 0.8601, + "step": 4177 + }, + { + "epoch": 0.22995211624195058, + "grad_norm": 0.835445761680603, + "learning_rate": 9.690772866177229e-06, + "loss": 0.8185, + "step": 4178 + }, + { + "epoch": 0.23000715504430624, + "grad_norm": 0.734601616859436, + "learning_rate": 9.69062277517257e-06, + "loss": 0.6486, + "step": 4179 + }, + { + "epoch": 0.2300621938466619, + "grad_norm": 0.8252671957015991, + "learning_rate": 9.690472648914445e-06, + "loss": 0.8455, + "step": 4180 + }, + { + "epoch": 0.23011723264901757, + "grad_norm": 0.8266329169273376, + "learning_rate": 9.690322487403984e-06, + "loss": 0.7348, + "step": 4181 + }, + { + "epoch": 0.23017227145137323, + "grad_norm": 0.8280256390571594, + "learning_rate": 9.690172290642314e-06, + "loss": 0.8191, + "step": 4182 + }, + { + "epoch": 0.2302273102537289, + "grad_norm": 0.8854276537895203, + "learning_rate": 9.690022058630564e-06, + "loss": 0.9327, + "step": 4183 + }, + { + "epoch": 0.23028234905608455, + "grad_norm": 0.7308807969093323, + "learning_rate": 9.689871791369865e-06, + "loss": 0.8144, + "step": 4184 + }, + { + "epoch": 0.2303373878584402, + "grad_norm": 0.7171719670295715, + "learning_rate": 9.689721488861344e-06, + "loss": 0.8265, + "step": 4185 + }, + { + "epoch": 0.23039242666079587, + "grad_norm": 0.7955548763275146, + "learning_rate": 9.689571151106131e-06, + "loss": 0.7313, + "step": 4186 + }, + { + "epoch": 0.23044746546315154, + "grad_norm": 0.8218876123428345, + "learning_rate": 9.689420778105359e-06, + "loss": 0.883, + "step": 4187 + }, + { + "epoch": 0.23050250426550717, + "grad_norm": 0.79570072889328, + "learning_rate": 9.689270369860154e-06, + "loss": 0.8898, + "step": 4188 + }, + { + "epoch": 0.23055754306786283, + "grad_norm": 0.8163344264030457, + "learning_rate": 9.689119926371649e-06, + "loss": 0.8638, + "step": 4189 + }, + { + "epoch": 0.2306125818702185, + "grad_norm": 0.7767764329910278, + "learning_rate": 9.688969447640972e-06, + "loss": 0.7822, + "step": 4190 + }, + { + "epoch": 0.23066762067257415, + "grad_norm": 0.9357114434242249, + "learning_rate": 9.688818933669258e-06, + "loss": 0.8031, + "step": 4191 + }, + { + "epoch": 0.23072265947492981, + "grad_norm": 0.8340080380439758, + "learning_rate": 9.688668384457635e-06, + "loss": 0.8947, + "step": 4192 + }, + { + "epoch": 0.23077769827728548, + "grad_norm": 0.8187471628189087, + "learning_rate": 9.688517800007235e-06, + "loss": 0.7989, + "step": 4193 + }, + { + "epoch": 0.23083273707964114, + "grad_norm": 0.8131871819496155, + "learning_rate": 9.688367180319191e-06, + "loss": 0.8377, + "step": 4194 + }, + { + "epoch": 0.2308877758819968, + "grad_norm": 0.7933448553085327, + "learning_rate": 9.688216525394634e-06, + "loss": 0.8723, + "step": 4195 + }, + { + "epoch": 0.23094281468435246, + "grad_norm": 0.7262325286865234, + "learning_rate": 9.688065835234695e-06, + "loss": 0.7802, + "step": 4196 + }, + { + "epoch": 0.23099785348670812, + "grad_norm": 0.8289293050765991, + "learning_rate": 9.68791510984051e-06, + "loss": 0.642, + "step": 4197 + }, + { + "epoch": 0.23105289228906378, + "grad_norm": 0.8835988640785217, + "learning_rate": 9.687764349213211e-06, + "loss": 0.9002, + "step": 4198 + }, + { + "epoch": 0.23110793109141944, + "grad_norm": 0.9478649497032166, + "learning_rate": 9.687613553353927e-06, + "loss": 0.8668, + "step": 4199 + }, + { + "epoch": 0.2311629698937751, + "grad_norm": 0.872936487197876, + "learning_rate": 9.687462722263796e-06, + "loss": 0.8312, + "step": 4200 + }, + { + "epoch": 0.23121800869613077, + "grad_norm": 0.7073879241943359, + "learning_rate": 9.68731185594395e-06, + "loss": 0.776, + "step": 4201 + }, + { + "epoch": 0.23127304749848643, + "grad_norm": 0.8265218734741211, + "learning_rate": 9.687160954395522e-06, + "loss": 0.8152, + "step": 4202 + }, + { + "epoch": 0.2313280863008421, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.687010017619649e-06, + "loss": 0.9514, + "step": 4203 + }, + { + "epoch": 0.23138312510319775, + "grad_norm": 0.7416790127754211, + "learning_rate": 9.68685904561746e-06, + "loss": 0.7708, + "step": 4204 + }, + { + "epoch": 0.2314381639055534, + "grad_norm": 0.7916150689125061, + "learning_rate": 9.686708038390096e-06, + "loss": 0.7753, + "step": 4205 + }, + { + "epoch": 0.23149320270790907, + "grad_norm": 0.7213300466537476, + "learning_rate": 9.686556995938688e-06, + "loss": 0.83, + "step": 4206 + }, + { + "epoch": 0.23154824151026474, + "grad_norm": 0.7595892548561096, + "learning_rate": 9.68640591826437e-06, + "loss": 0.8186, + "step": 4207 + }, + { + "epoch": 0.2316032803126204, + "grad_norm": 0.7042104601860046, + "learning_rate": 9.686254805368282e-06, + "loss": 0.7126, + "step": 4208 + }, + { + "epoch": 0.23165831911497606, + "grad_norm": 0.7416805028915405, + "learning_rate": 9.686103657251558e-06, + "loss": 0.7791, + "step": 4209 + }, + { + "epoch": 0.23171335791733172, + "grad_norm": 0.9868568181991577, + "learning_rate": 9.685952473915333e-06, + "loss": 0.8453, + "step": 4210 + }, + { + "epoch": 0.23176839671968738, + "grad_norm": 0.7133191823959351, + "learning_rate": 9.68580125536074e-06, + "loss": 0.6061, + "step": 4211 + }, + { + "epoch": 0.23182343552204304, + "grad_norm": 0.8307366967201233, + "learning_rate": 9.685650001588921e-06, + "loss": 0.8403, + "step": 4212 + }, + { + "epoch": 0.2318784743243987, + "grad_norm": 0.8395226001739502, + "learning_rate": 9.685498712601014e-06, + "loss": 0.7945, + "step": 4213 + }, + { + "epoch": 0.23193351312675436, + "grad_norm": 0.7557219862937927, + "learning_rate": 9.68534738839815e-06, + "loss": 0.7765, + "step": 4214 + }, + { + "epoch": 0.23198855192911003, + "grad_norm": 0.7003554105758667, + "learning_rate": 9.68519602898147e-06, + "loss": 0.7228, + "step": 4215 + }, + { + "epoch": 0.2320435907314657, + "grad_norm": 0.8422999382019043, + "learning_rate": 9.68504463435211e-06, + "loss": 0.8524, + "step": 4216 + }, + { + "epoch": 0.23209862953382135, + "grad_norm": 0.9369016289710999, + "learning_rate": 9.68489320451121e-06, + "loss": 0.7646, + "step": 4217 + }, + { + "epoch": 0.232153668336177, + "grad_norm": 0.8456607460975647, + "learning_rate": 9.684741739459905e-06, + "loss": 0.7481, + "step": 4218 + }, + { + "epoch": 0.23220870713853267, + "grad_norm": 0.9284812211990356, + "learning_rate": 9.684590239199336e-06, + "loss": 0.8192, + "step": 4219 + }, + { + "epoch": 0.23226374594088833, + "grad_norm": 0.8474242687225342, + "learning_rate": 9.68443870373064e-06, + "loss": 0.7143, + "step": 4220 + }, + { + "epoch": 0.232318784743244, + "grad_norm": 0.8259334564208984, + "learning_rate": 9.684287133054957e-06, + "loss": 0.8667, + "step": 4221 + }, + { + "epoch": 0.23237382354559966, + "grad_norm": 0.8016416430473328, + "learning_rate": 9.684135527173427e-06, + "loss": 0.8694, + "step": 4222 + }, + { + "epoch": 0.23242886234795532, + "grad_norm": 0.7575937509536743, + "learning_rate": 9.683983886087186e-06, + "loss": 0.7591, + "step": 4223 + }, + { + "epoch": 0.23248390115031098, + "grad_norm": 0.7004683613777161, + "learning_rate": 9.683832209797377e-06, + "loss": 0.739, + "step": 4224 + }, + { + "epoch": 0.23253893995266664, + "grad_norm": 0.8265832662582397, + "learning_rate": 9.68368049830514e-06, + "loss": 0.7705, + "step": 4225 + }, + { + "epoch": 0.2325939787550223, + "grad_norm": 0.7705711722373962, + "learning_rate": 9.683528751611612e-06, + "loss": 0.7896, + "step": 4226 + }, + { + "epoch": 0.23264901755737796, + "grad_norm": 0.7426978349685669, + "learning_rate": 9.683376969717937e-06, + "loss": 0.8217, + "step": 4227 + }, + { + "epoch": 0.23270405635973362, + "grad_norm": 0.7425839304924011, + "learning_rate": 9.683225152625255e-06, + "loss": 0.7426, + "step": 4228 + }, + { + "epoch": 0.23275909516208929, + "grad_norm": 1.0415440797805786, + "learning_rate": 9.683073300334705e-06, + "loss": 0.8585, + "step": 4229 + }, + { + "epoch": 0.23281413396444495, + "grad_norm": 0.7706055045127869, + "learning_rate": 9.68292141284743e-06, + "loss": 0.8349, + "step": 4230 + }, + { + "epoch": 0.23286917276680058, + "grad_norm": 0.8407607674598694, + "learning_rate": 9.682769490164572e-06, + "loss": 0.8592, + "step": 4231 + }, + { + "epoch": 0.23292421156915624, + "grad_norm": 0.6830767393112183, + "learning_rate": 9.68261753228727e-06, + "loss": 0.6773, + "step": 4232 + }, + { + "epoch": 0.2329792503715119, + "grad_norm": 1.6661429405212402, + "learning_rate": 9.68246553921667e-06, + "loss": 1.005, + "step": 4233 + }, + { + "epoch": 0.23303428917386756, + "grad_norm": 0.7677092552185059, + "learning_rate": 9.682313510953912e-06, + "loss": 0.7689, + "step": 4234 + }, + { + "epoch": 0.23308932797622323, + "grad_norm": 0.7232248187065125, + "learning_rate": 9.682161447500139e-06, + "loss": 0.7765, + "step": 4235 + }, + { + "epoch": 0.2331443667785789, + "grad_norm": 0.8667388558387756, + "learning_rate": 9.682009348856494e-06, + "loss": 0.8099, + "step": 4236 + }, + { + "epoch": 0.23319940558093455, + "grad_norm": 0.8220446705818176, + "learning_rate": 9.68185721502412e-06, + "loss": 0.8078, + "step": 4237 + }, + { + "epoch": 0.2332544443832902, + "grad_norm": 0.9670323133468628, + "learning_rate": 9.68170504600416e-06, + "loss": 0.8912, + "step": 4238 + }, + { + "epoch": 0.23330948318564587, + "grad_norm": 0.7950771450996399, + "learning_rate": 9.68155284179776e-06, + "loss": 0.8165, + "step": 4239 + }, + { + "epoch": 0.23336452198800153, + "grad_norm": 0.7606233358383179, + "learning_rate": 9.68140060240606e-06, + "loss": 0.7795, + "step": 4240 + }, + { + "epoch": 0.2334195607903572, + "grad_norm": 0.9580656886100769, + "learning_rate": 9.681248327830205e-06, + "loss": 0.7949, + "step": 4241 + }, + { + "epoch": 0.23347459959271286, + "grad_norm": 0.6878347992897034, + "learning_rate": 9.681096018071341e-06, + "loss": 0.7776, + "step": 4242 + }, + { + "epoch": 0.23352963839506852, + "grad_norm": 0.8449816107749939, + "learning_rate": 9.680943673130614e-06, + "loss": 0.8456, + "step": 4243 + }, + { + "epoch": 0.23358467719742418, + "grad_norm": 0.77314692735672, + "learning_rate": 9.680791293009167e-06, + "loss": 0.7915, + "step": 4244 + }, + { + "epoch": 0.23363971599977984, + "grad_norm": 0.8034142255783081, + "learning_rate": 9.680638877708146e-06, + "loss": 0.7377, + "step": 4245 + }, + { + "epoch": 0.2336947548021355, + "grad_norm": 0.8754952549934387, + "learning_rate": 9.680486427228695e-06, + "loss": 0.8072, + "step": 4246 + }, + { + "epoch": 0.23374979360449116, + "grad_norm": 0.8169820308685303, + "learning_rate": 9.680333941571963e-06, + "loss": 0.8253, + "step": 4247 + }, + { + "epoch": 0.23380483240684682, + "grad_norm": 0.7848341464996338, + "learning_rate": 9.680181420739092e-06, + "loss": 0.8243, + "step": 4248 + }, + { + "epoch": 0.23385987120920249, + "grad_norm": 0.7599799036979675, + "learning_rate": 9.68002886473123e-06, + "loss": 0.781, + "step": 4249 + }, + { + "epoch": 0.23391491001155815, + "grad_norm": 0.8920254707336426, + "learning_rate": 9.679876273549524e-06, + "loss": 0.8199, + "step": 4250 + }, + { + "epoch": 0.2339699488139138, + "grad_norm": 0.7813586592674255, + "learning_rate": 9.679723647195121e-06, + "loss": 0.7758, + "step": 4251 + }, + { + "epoch": 0.23402498761626947, + "grad_norm": 0.735282838344574, + "learning_rate": 9.679570985669168e-06, + "loss": 0.7651, + "step": 4252 + }, + { + "epoch": 0.23408002641862513, + "grad_norm": 0.7305853962898254, + "learning_rate": 9.679418288972813e-06, + "loss": 0.8202, + "step": 4253 + }, + { + "epoch": 0.2341350652209808, + "grad_norm": 0.8331005573272705, + "learning_rate": 9.6792655571072e-06, + "loss": 0.8784, + "step": 4254 + }, + { + "epoch": 0.23419010402333645, + "grad_norm": 0.8526305556297302, + "learning_rate": 9.679112790073481e-06, + "loss": 0.8116, + "step": 4255 + }, + { + "epoch": 0.23424514282569212, + "grad_norm": 0.741073489189148, + "learning_rate": 9.678959987872805e-06, + "loss": 0.6928, + "step": 4256 + }, + { + "epoch": 0.23430018162804778, + "grad_norm": 0.727859616279602, + "learning_rate": 9.678807150506315e-06, + "loss": 0.7571, + "step": 4257 + }, + { + "epoch": 0.23435522043040344, + "grad_norm": 0.8890698552131653, + "learning_rate": 9.678654277975165e-06, + "loss": 0.8145, + "step": 4258 + }, + { + "epoch": 0.2344102592327591, + "grad_norm": 0.7372937798500061, + "learning_rate": 9.6785013702805e-06, + "loss": 0.7104, + "step": 4259 + }, + { + "epoch": 0.23446529803511476, + "grad_norm": 0.7205008268356323, + "learning_rate": 9.678348427423472e-06, + "loss": 0.7498, + "step": 4260 + }, + { + "epoch": 0.23452033683747042, + "grad_norm": 0.7766392230987549, + "learning_rate": 9.67819544940523e-06, + "loss": 0.7814, + "step": 4261 + }, + { + "epoch": 0.23457537563982608, + "grad_norm": 0.7441498637199402, + "learning_rate": 9.678042436226922e-06, + "loss": 0.7429, + "step": 4262 + }, + { + "epoch": 0.23463041444218175, + "grad_norm": 0.8838522434234619, + "learning_rate": 9.677889387889701e-06, + "loss": 0.8719, + "step": 4263 + }, + { + "epoch": 0.2346854532445374, + "grad_norm": 1.2349655628204346, + "learning_rate": 9.677736304394716e-06, + "loss": 0.8491, + "step": 4264 + }, + { + "epoch": 0.23474049204689307, + "grad_norm": 0.8050087690353394, + "learning_rate": 9.677583185743116e-06, + "loss": 0.795, + "step": 4265 + }, + { + "epoch": 0.23479553084924873, + "grad_norm": 0.7885709404945374, + "learning_rate": 9.677430031936051e-06, + "loss": 0.8594, + "step": 4266 + }, + { + "epoch": 0.2348505696516044, + "grad_norm": 0.7753557562828064, + "learning_rate": 9.677276842974676e-06, + "loss": 0.8196, + "step": 4267 + }, + { + "epoch": 0.23490560845396005, + "grad_norm": 0.7325392961502075, + "learning_rate": 9.67712361886014e-06, + "loss": 0.7905, + "step": 4268 + }, + { + "epoch": 0.2349606472563157, + "grad_norm": 0.7925617694854736, + "learning_rate": 9.676970359593594e-06, + "loss": 0.7416, + "step": 4269 + }, + { + "epoch": 0.23501568605867137, + "grad_norm": 0.7981371283531189, + "learning_rate": 9.676817065176192e-06, + "loss": 0.81, + "step": 4270 + }, + { + "epoch": 0.23507072486102704, + "grad_norm": 0.7490524053573608, + "learning_rate": 9.676663735609084e-06, + "loss": 0.8347, + "step": 4271 + }, + { + "epoch": 0.2351257636633827, + "grad_norm": 1.000349521636963, + "learning_rate": 9.676510370893424e-06, + "loss": 0.7469, + "step": 4272 + }, + { + "epoch": 0.23518080246573836, + "grad_norm": 0.9310774207115173, + "learning_rate": 9.676356971030364e-06, + "loss": 0.8088, + "step": 4273 + }, + { + "epoch": 0.235235841268094, + "grad_norm": 0.8868544101715088, + "learning_rate": 9.676203536021055e-06, + "loss": 0.7472, + "step": 4274 + }, + { + "epoch": 0.23529088007044965, + "grad_norm": 0.7702255845069885, + "learning_rate": 9.676050065866653e-06, + "loss": 0.8395, + "step": 4275 + }, + { + "epoch": 0.23534591887280532, + "grad_norm": 0.7138833999633789, + "learning_rate": 9.675896560568311e-06, + "loss": 0.8529, + "step": 4276 + }, + { + "epoch": 0.23540095767516098, + "grad_norm": 0.8399729132652283, + "learning_rate": 9.675743020127182e-06, + "loss": 0.7844, + "step": 4277 + }, + { + "epoch": 0.23545599647751664, + "grad_norm": 0.8500726819038391, + "learning_rate": 9.67558944454442e-06, + "loss": 0.8209, + "step": 4278 + }, + { + "epoch": 0.2355110352798723, + "grad_norm": 0.766638994216919, + "learning_rate": 9.675435833821178e-06, + "loss": 0.7834, + "step": 4279 + }, + { + "epoch": 0.23556607408222796, + "grad_norm": 0.9121370315551758, + "learning_rate": 9.675282187958613e-06, + "loss": 0.8697, + "step": 4280 + }, + { + "epoch": 0.23562111288458362, + "grad_norm": 0.7862319946289062, + "learning_rate": 9.675128506957879e-06, + "loss": 0.8262, + "step": 4281 + }, + { + "epoch": 0.23567615168693928, + "grad_norm": 1.072777509689331, + "learning_rate": 9.67497479082013e-06, + "loss": 0.7963, + "step": 4282 + }, + { + "epoch": 0.23573119048929495, + "grad_norm": 0.7574695944786072, + "learning_rate": 9.67482103954652e-06, + "loss": 0.8178, + "step": 4283 + }, + { + "epoch": 0.2357862292916506, + "grad_norm": 0.7996877431869507, + "learning_rate": 9.674667253138209e-06, + "loss": 0.8465, + "step": 4284 + }, + { + "epoch": 0.23584126809400627, + "grad_norm": 0.711513340473175, + "learning_rate": 9.674513431596349e-06, + "loss": 0.7445, + "step": 4285 + }, + { + "epoch": 0.23589630689636193, + "grad_norm": 0.7431296706199646, + "learning_rate": 9.674359574922098e-06, + "loss": 0.8102, + "step": 4286 + }, + { + "epoch": 0.2359513456987176, + "grad_norm": 0.7745676040649414, + "learning_rate": 9.674205683116612e-06, + "loss": 0.8733, + "step": 4287 + }, + { + "epoch": 0.23600638450107325, + "grad_norm": 1.0117937326431274, + "learning_rate": 9.674051756181046e-06, + "loss": 0.9035, + "step": 4288 + }, + { + "epoch": 0.2360614233034289, + "grad_norm": 0.7848078608512878, + "learning_rate": 9.67389779411656e-06, + "loss": 0.8486, + "step": 4289 + }, + { + "epoch": 0.23611646210578457, + "grad_norm": 0.8439378142356873, + "learning_rate": 9.673743796924307e-06, + "loss": 0.8032, + "step": 4290 + }, + { + "epoch": 0.23617150090814024, + "grad_norm": 0.8268104791641235, + "learning_rate": 9.673589764605449e-06, + "loss": 0.8182, + "step": 4291 + }, + { + "epoch": 0.2362265397104959, + "grad_norm": 0.8896234631538391, + "learning_rate": 9.67343569716114e-06, + "loss": 0.8081, + "step": 4292 + }, + { + "epoch": 0.23628157851285156, + "grad_norm": 0.8515019416809082, + "learning_rate": 9.67328159459254e-06, + "loss": 0.8239, + "step": 4293 + }, + { + "epoch": 0.23633661731520722, + "grad_norm": 0.7779792547225952, + "learning_rate": 9.673127456900806e-06, + "loss": 0.8437, + "step": 4294 + }, + { + "epoch": 0.23639165611756288, + "grad_norm": 0.7782402634620667, + "learning_rate": 9.672973284087097e-06, + "loss": 0.8498, + "step": 4295 + }, + { + "epoch": 0.23644669491991854, + "grad_norm": 0.7588973641395569, + "learning_rate": 9.67281907615257e-06, + "loss": 0.7034, + "step": 4296 + }, + { + "epoch": 0.2365017337222742, + "grad_norm": 0.8426640629768372, + "learning_rate": 9.67266483309839e-06, + "loss": 0.803, + "step": 4297 + }, + { + "epoch": 0.23655677252462987, + "grad_norm": 0.8945889472961426, + "learning_rate": 9.672510554925707e-06, + "loss": 0.8971, + "step": 4298 + }, + { + "epoch": 0.23661181132698553, + "grad_norm": 0.8604227304458618, + "learning_rate": 9.672356241635688e-06, + "loss": 0.7548, + "step": 4299 + }, + { + "epoch": 0.2366668501293412, + "grad_norm": 0.7277490496635437, + "learning_rate": 9.672201893229489e-06, + "loss": 0.8083, + "step": 4300 + }, + { + "epoch": 0.23672188893169685, + "grad_norm": 0.9089379906654358, + "learning_rate": 9.672047509708273e-06, + "loss": 0.9717, + "step": 4301 + }, + { + "epoch": 0.2367769277340525, + "grad_norm": 0.7207155823707581, + "learning_rate": 9.671893091073198e-06, + "loss": 0.6794, + "step": 4302 + }, + { + "epoch": 0.23683196653640817, + "grad_norm": 0.7319806814193726, + "learning_rate": 9.671738637325425e-06, + "loss": 0.6821, + "step": 4303 + }, + { + "epoch": 0.23688700533876383, + "grad_norm": 0.7339589595794678, + "learning_rate": 9.671584148466112e-06, + "loss": 0.7895, + "step": 4304 + }, + { + "epoch": 0.2369420441411195, + "grad_norm": 0.7725476622581482, + "learning_rate": 9.671429624496428e-06, + "loss": 0.7414, + "step": 4305 + }, + { + "epoch": 0.23699708294347516, + "grad_norm": 0.7040137648582458, + "learning_rate": 9.671275065417527e-06, + "loss": 0.696, + "step": 4306 + }, + { + "epoch": 0.23705212174583082, + "grad_norm": 0.8804189562797546, + "learning_rate": 9.671120471230572e-06, + "loss": 0.8184, + "step": 4307 + }, + { + "epoch": 0.23710716054818648, + "grad_norm": 0.8062872886657715, + "learning_rate": 9.670965841936728e-06, + "loss": 0.7856, + "step": 4308 + }, + { + "epoch": 0.23716219935054214, + "grad_norm": 0.7537097930908203, + "learning_rate": 9.670811177537154e-06, + "loss": 0.7562, + "step": 4309 + }, + { + "epoch": 0.2372172381528978, + "grad_norm": 0.8168618083000183, + "learning_rate": 9.670656478033013e-06, + "loss": 0.7416, + "step": 4310 + }, + { + "epoch": 0.23727227695525346, + "grad_norm": 0.8367040157318115, + "learning_rate": 9.670501743425469e-06, + "loss": 0.7759, + "step": 4311 + }, + { + "epoch": 0.23732731575760913, + "grad_norm": 0.860418975353241, + "learning_rate": 9.670346973715683e-06, + "loss": 0.9013, + "step": 4312 + }, + { + "epoch": 0.2373823545599648, + "grad_norm": 0.8736678957939148, + "learning_rate": 9.67019216890482e-06, + "loss": 0.8677, + "step": 4313 + }, + { + "epoch": 0.23743739336232045, + "grad_norm": 0.8258964419364929, + "learning_rate": 9.670037328994044e-06, + "loss": 0.8208, + "step": 4314 + }, + { + "epoch": 0.2374924321646761, + "grad_norm": 0.7936292886734009, + "learning_rate": 9.669882453984516e-06, + "loss": 0.8643, + "step": 4315 + }, + { + "epoch": 0.23754747096703177, + "grad_norm": 0.805500864982605, + "learning_rate": 9.669727543877401e-06, + "loss": 0.779, + "step": 4316 + }, + { + "epoch": 0.2376025097693874, + "grad_norm": 0.8072311282157898, + "learning_rate": 9.669572598673866e-06, + "loss": 0.8258, + "step": 4317 + }, + { + "epoch": 0.23765754857174307, + "grad_norm": 0.8917607665061951, + "learning_rate": 9.669417618375072e-06, + "loss": 0.7528, + "step": 4318 + }, + { + "epoch": 0.23771258737409873, + "grad_norm": 0.7054246068000793, + "learning_rate": 9.669262602982186e-06, + "loss": 0.86, + "step": 4319 + }, + { + "epoch": 0.2377676261764544, + "grad_norm": 0.8600299954414368, + "learning_rate": 9.66910755249637e-06, + "loss": 0.8165, + "step": 4320 + }, + { + "epoch": 0.23782266497881005, + "grad_norm": 0.8685561418533325, + "learning_rate": 9.668952466918793e-06, + "loss": 0.8129, + "step": 4321 + }, + { + "epoch": 0.2378777037811657, + "grad_norm": 0.7859770655632019, + "learning_rate": 9.668797346250618e-06, + "loss": 0.8703, + "step": 4322 + }, + { + "epoch": 0.23793274258352137, + "grad_norm": 0.8128730058670044, + "learning_rate": 9.668642190493015e-06, + "loss": 0.7595, + "step": 4323 + }, + { + "epoch": 0.23798778138587703, + "grad_norm": 0.8223204612731934, + "learning_rate": 9.668486999647143e-06, + "loss": 0.825, + "step": 4324 + }, + { + "epoch": 0.2380428201882327, + "grad_norm": 0.859619677066803, + "learning_rate": 9.668331773714175e-06, + "loss": 0.8239, + "step": 4325 + }, + { + "epoch": 0.23809785899058836, + "grad_norm": 0.9861679673194885, + "learning_rate": 9.668176512695273e-06, + "loss": 0.8409, + "step": 4326 + }, + { + "epoch": 0.23815289779294402, + "grad_norm": 0.7178627252578735, + "learning_rate": 9.668021216591607e-06, + "loss": 0.818, + "step": 4327 + }, + { + "epoch": 0.23820793659529968, + "grad_norm": 0.9160923957824707, + "learning_rate": 9.667865885404343e-06, + "loss": 0.8703, + "step": 4328 + }, + { + "epoch": 0.23826297539765534, + "grad_norm": 0.7043942213058472, + "learning_rate": 9.667710519134648e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 0.238318014200011, + "grad_norm": 1.213121771812439, + "learning_rate": 9.667555117783691e-06, + "loss": 0.7843, + "step": 4330 + }, + { + "epoch": 0.23837305300236666, + "grad_norm": 0.8008033037185669, + "learning_rate": 9.66739968135264e-06, + "loss": 0.9312, + "step": 4331 + }, + { + "epoch": 0.23842809180472233, + "grad_norm": 0.7862009406089783, + "learning_rate": 9.667244209842662e-06, + "loss": 0.6965, + "step": 4332 + }, + { + "epoch": 0.238483130607078, + "grad_norm": 1.081398844718933, + "learning_rate": 9.667088703254923e-06, + "loss": 0.9793, + "step": 4333 + }, + { + "epoch": 0.23853816940943365, + "grad_norm": 0.7672395706176758, + "learning_rate": 9.666933161590597e-06, + "loss": 0.813, + "step": 4334 + }, + { + "epoch": 0.2385932082117893, + "grad_norm": 0.6955493092536926, + "learning_rate": 9.66677758485085e-06, + "loss": 0.7778, + "step": 4335 + }, + { + "epoch": 0.23864824701414497, + "grad_norm": 0.8609682321548462, + "learning_rate": 9.666621973036854e-06, + "loss": 0.7817, + "step": 4336 + }, + { + "epoch": 0.23870328581650063, + "grad_norm": 0.7312196493148804, + "learning_rate": 9.666466326149774e-06, + "loss": 0.7368, + "step": 4337 + }, + { + "epoch": 0.2387583246188563, + "grad_norm": 0.7964538931846619, + "learning_rate": 9.666310644190782e-06, + "loss": 0.8124, + "step": 4338 + }, + { + "epoch": 0.23881336342121195, + "grad_norm": 1.1138910055160522, + "learning_rate": 9.66615492716105e-06, + "loss": 0.8886, + "step": 4339 + }, + { + "epoch": 0.23886840222356762, + "grad_norm": 0.8789949417114258, + "learning_rate": 9.665999175061747e-06, + "loss": 0.7854, + "step": 4340 + }, + { + "epoch": 0.23892344102592328, + "grad_norm": 0.7761380076408386, + "learning_rate": 9.665843387894041e-06, + "loss": 0.7915, + "step": 4341 + }, + { + "epoch": 0.23897847982827894, + "grad_norm": 0.888482928276062, + "learning_rate": 9.665687565659106e-06, + "loss": 0.8799, + "step": 4342 + }, + { + "epoch": 0.2390335186306346, + "grad_norm": 0.7799200415611267, + "learning_rate": 9.665531708358111e-06, + "loss": 0.8519, + "step": 4343 + }, + { + "epoch": 0.23908855743299026, + "grad_norm": 0.7407697439193726, + "learning_rate": 9.665375815992231e-06, + "loss": 0.7637, + "step": 4344 + }, + { + "epoch": 0.23914359623534592, + "grad_norm": 0.8098278045654297, + "learning_rate": 9.665219888562634e-06, + "loss": 0.7991, + "step": 4345 + }, + { + "epoch": 0.23919863503770158, + "grad_norm": 0.7585136294364929, + "learning_rate": 9.665063926070493e-06, + "loss": 0.8478, + "step": 4346 + }, + { + "epoch": 0.23925367384005725, + "grad_norm": 0.7294817566871643, + "learning_rate": 9.66490792851698e-06, + "loss": 0.8312, + "step": 4347 + }, + { + "epoch": 0.2393087126424129, + "grad_norm": 0.8325762748718262, + "learning_rate": 9.664751895903269e-06, + "loss": 0.9365, + "step": 4348 + }, + { + "epoch": 0.23936375144476857, + "grad_norm": 0.9992470741271973, + "learning_rate": 9.66459582823053e-06, + "loss": 0.8649, + "step": 4349 + }, + { + "epoch": 0.23941879024712423, + "grad_norm": 0.7206875681877136, + "learning_rate": 9.664439725499938e-06, + "loss": 0.7013, + "step": 4350 + }, + { + "epoch": 0.2394738290494799, + "grad_norm": 0.946657657623291, + "learning_rate": 9.664283587712665e-06, + "loss": 0.7953, + "step": 4351 + }, + { + "epoch": 0.23952886785183555, + "grad_norm": 0.7684911489486694, + "learning_rate": 9.664127414869887e-06, + "loss": 0.8403, + "step": 4352 + }, + { + "epoch": 0.23958390665419121, + "grad_norm": 0.7875770926475525, + "learning_rate": 9.663971206972773e-06, + "loss": 0.7961, + "step": 4353 + }, + { + "epoch": 0.23963894545654688, + "grad_norm": 0.7387273907661438, + "learning_rate": 9.663814964022502e-06, + "loss": 0.8265, + "step": 4354 + }, + { + "epoch": 0.23969398425890254, + "grad_norm": 0.7413492202758789, + "learning_rate": 9.663658686020245e-06, + "loss": 0.8458, + "step": 4355 + }, + { + "epoch": 0.2397490230612582, + "grad_norm": 0.7563235759735107, + "learning_rate": 9.663502372967177e-06, + "loss": 0.8498, + "step": 4356 + }, + { + "epoch": 0.23980406186361386, + "grad_norm": 0.7529472708702087, + "learning_rate": 9.663346024864475e-06, + "loss": 0.7597, + "step": 4357 + }, + { + "epoch": 0.23985910066596952, + "grad_norm": 0.7582191824913025, + "learning_rate": 9.663189641713314e-06, + "loss": 0.804, + "step": 4358 + }, + { + "epoch": 0.23991413946832518, + "grad_norm": 0.8394485712051392, + "learning_rate": 9.663033223514865e-06, + "loss": 0.8329, + "step": 4359 + }, + { + "epoch": 0.23996917827068082, + "grad_norm": 0.7088292241096497, + "learning_rate": 9.662876770270308e-06, + "loss": 0.7131, + "step": 4360 + }, + { + "epoch": 0.24002421707303648, + "grad_norm": 0.8548080325126648, + "learning_rate": 9.662720281980817e-06, + "loss": 0.8925, + "step": 4361 + }, + { + "epoch": 0.24007925587539214, + "grad_norm": 0.8027567267417908, + "learning_rate": 9.662563758647568e-06, + "loss": 0.8652, + "step": 4362 + }, + { + "epoch": 0.2401342946777478, + "grad_norm": 0.7471736669540405, + "learning_rate": 9.662407200271738e-06, + "loss": 0.7722, + "step": 4363 + }, + { + "epoch": 0.24018933348010346, + "grad_norm": 0.7358804941177368, + "learning_rate": 9.662250606854504e-06, + "loss": 0.767, + "step": 4364 + }, + { + "epoch": 0.24024437228245912, + "grad_norm": 0.7948476672172546, + "learning_rate": 9.662093978397042e-06, + "loss": 0.961, + "step": 4365 + }, + { + "epoch": 0.24029941108481478, + "grad_norm": 0.7030961513519287, + "learning_rate": 9.66193731490053e-06, + "loss": 0.7826, + "step": 4366 + }, + { + "epoch": 0.24035444988717045, + "grad_norm": 0.8376098871231079, + "learning_rate": 9.661780616366145e-06, + "loss": 0.7697, + "step": 4367 + }, + { + "epoch": 0.2404094886895261, + "grad_norm": 0.7449594140052795, + "learning_rate": 9.661623882795065e-06, + "loss": 0.7944, + "step": 4368 + }, + { + "epoch": 0.24046452749188177, + "grad_norm": 0.7317184805870056, + "learning_rate": 9.661467114188468e-06, + "loss": 0.7059, + "step": 4369 + }, + { + "epoch": 0.24051956629423743, + "grad_norm": 0.843912661075592, + "learning_rate": 9.661310310547531e-06, + "loss": 0.7889, + "step": 4370 + }, + { + "epoch": 0.2405746050965931, + "grad_norm": 0.8673211336135864, + "learning_rate": 9.661153471873435e-06, + "loss": 0.7234, + "step": 4371 + }, + { + "epoch": 0.24062964389894875, + "grad_norm": 0.8179688453674316, + "learning_rate": 9.660996598167354e-06, + "loss": 0.8937, + "step": 4372 + }, + { + "epoch": 0.24068468270130441, + "grad_norm": 0.7800211906433105, + "learning_rate": 9.660839689430473e-06, + "loss": 0.8596, + "step": 4373 + }, + { + "epoch": 0.24073972150366008, + "grad_norm": 0.8781671524047852, + "learning_rate": 9.660682745663967e-06, + "loss": 0.8507, + "step": 4374 + }, + { + "epoch": 0.24079476030601574, + "grad_norm": 0.7701708674430847, + "learning_rate": 9.660525766869019e-06, + "loss": 0.8212, + "step": 4375 + }, + { + "epoch": 0.2408497991083714, + "grad_norm": 0.7721084356307983, + "learning_rate": 9.660368753046806e-06, + "loss": 0.7493, + "step": 4376 + }, + { + "epoch": 0.24090483791072706, + "grad_norm": 0.8126489520072937, + "learning_rate": 9.660211704198508e-06, + "loss": 0.8527, + "step": 4377 + }, + { + "epoch": 0.24095987671308272, + "grad_norm": 0.8172717690467834, + "learning_rate": 9.660054620325307e-06, + "loss": 0.8448, + "step": 4378 + }, + { + "epoch": 0.24101491551543838, + "grad_norm": 0.8293611407279968, + "learning_rate": 9.659897501428384e-06, + "loss": 0.9318, + "step": 4379 + }, + { + "epoch": 0.24106995431779404, + "grad_norm": 0.7445098161697388, + "learning_rate": 9.659740347508917e-06, + "loss": 0.7358, + "step": 4380 + }, + { + "epoch": 0.2411249931201497, + "grad_norm": 0.7778907418251038, + "learning_rate": 9.659583158568088e-06, + "loss": 0.7671, + "step": 4381 + }, + { + "epoch": 0.24118003192250537, + "grad_norm": 0.7828608751296997, + "learning_rate": 9.659425934607082e-06, + "loss": 0.8141, + "step": 4382 + }, + { + "epoch": 0.24123507072486103, + "grad_norm": 0.9433113932609558, + "learning_rate": 9.659268675627075e-06, + "loss": 0.7904, + "step": 4383 + }, + { + "epoch": 0.2412901095272167, + "grad_norm": 0.7097491025924683, + "learning_rate": 9.659111381629255e-06, + "loss": 0.7445, + "step": 4384 + }, + { + "epoch": 0.24134514832957235, + "grad_norm": 0.7450230717658997, + "learning_rate": 9.6589540526148e-06, + "loss": 0.6869, + "step": 4385 + }, + { + "epoch": 0.241400187131928, + "grad_norm": 0.7429760694503784, + "learning_rate": 9.658796688584893e-06, + "loss": 0.7367, + "step": 4386 + }, + { + "epoch": 0.24145522593428367, + "grad_norm": 0.7250030040740967, + "learning_rate": 9.658639289540716e-06, + "loss": 0.7502, + "step": 4387 + }, + { + "epoch": 0.24151026473663934, + "grad_norm": 0.6577159762382507, + "learning_rate": 9.658481855483455e-06, + "loss": 0.5785, + "step": 4388 + }, + { + "epoch": 0.241565303538995, + "grad_norm": 0.7846524119377136, + "learning_rate": 9.65832438641429e-06, + "loss": 0.7435, + "step": 4389 + }, + { + "epoch": 0.24162034234135066, + "grad_norm": 0.8370404839515686, + "learning_rate": 9.658166882334408e-06, + "loss": 0.8536, + "step": 4390 + }, + { + "epoch": 0.24167538114370632, + "grad_norm": 0.7451018691062927, + "learning_rate": 9.658009343244987e-06, + "loss": 0.8443, + "step": 4391 + }, + { + "epoch": 0.24173041994606198, + "grad_norm": 0.7629074454307556, + "learning_rate": 9.657851769147218e-06, + "loss": 0.7394, + "step": 4392 + }, + { + "epoch": 0.24178545874841764, + "grad_norm": 0.7767705321311951, + "learning_rate": 9.657694160042282e-06, + "loss": 0.8497, + "step": 4393 + }, + { + "epoch": 0.2418404975507733, + "grad_norm": 0.8635357022285461, + "learning_rate": 9.65753651593136e-06, + "loss": 0.8495, + "step": 4394 + }, + { + "epoch": 0.24189553635312896, + "grad_norm": 0.7652365565299988, + "learning_rate": 9.657378836815643e-06, + "loss": 0.7967, + "step": 4395 + }, + { + "epoch": 0.24195057515548463, + "grad_norm": 0.7721680402755737, + "learning_rate": 9.657221122696313e-06, + "loss": 0.8227, + "step": 4396 + }, + { + "epoch": 0.2420056139578403, + "grad_norm": 1.016366720199585, + "learning_rate": 9.657063373574555e-06, + "loss": 0.8291, + "step": 4397 + }, + { + "epoch": 0.24206065276019595, + "grad_norm": 0.7770145535469055, + "learning_rate": 9.656905589451555e-06, + "loss": 0.8335, + "step": 4398 + }, + { + "epoch": 0.2421156915625516, + "grad_norm": 0.812882125377655, + "learning_rate": 9.6567477703285e-06, + "loss": 0.8189, + "step": 4399 + }, + { + "epoch": 0.24217073036490727, + "grad_norm": 0.7253247499465942, + "learning_rate": 9.656589916206576e-06, + "loss": 0.8418, + "step": 4400 + }, + { + "epoch": 0.24222576916726293, + "grad_norm": 0.7784958481788635, + "learning_rate": 9.656432027086969e-06, + "loss": 0.8541, + "step": 4401 + }, + { + "epoch": 0.2422808079696186, + "grad_norm": 0.8001978397369385, + "learning_rate": 9.656274102970865e-06, + "loss": 0.8888, + "step": 4402 + }, + { + "epoch": 0.24233584677197423, + "grad_norm": 0.7535765767097473, + "learning_rate": 9.656116143859448e-06, + "loss": 0.7691, + "step": 4403 + }, + { + "epoch": 0.2423908855743299, + "grad_norm": 0.6554346680641174, + "learning_rate": 9.655958149753913e-06, + "loss": 0.7592, + "step": 4404 + }, + { + "epoch": 0.24244592437668555, + "grad_norm": 0.8599995374679565, + "learning_rate": 9.655800120655439e-06, + "loss": 0.8396, + "step": 4405 + }, + { + "epoch": 0.2425009631790412, + "grad_norm": 0.8172232508659363, + "learning_rate": 9.65564205656522e-06, + "loss": 0.6931, + "step": 4406 + }, + { + "epoch": 0.24255600198139687, + "grad_norm": 0.8005852699279785, + "learning_rate": 9.65548395748444e-06, + "loss": 0.8344, + "step": 4407 + }, + { + "epoch": 0.24261104078375254, + "grad_norm": 0.7823762893676758, + "learning_rate": 9.65532582341429e-06, + "loss": 0.7991, + "step": 4408 + }, + { + "epoch": 0.2426660795861082, + "grad_norm": 0.7743250727653503, + "learning_rate": 9.655167654355957e-06, + "loss": 0.9048, + "step": 4409 + }, + { + "epoch": 0.24272111838846386, + "grad_norm": 0.9825221300125122, + "learning_rate": 9.655009450310629e-06, + "loss": 0.7491, + "step": 4410 + }, + { + "epoch": 0.24277615719081952, + "grad_norm": 1.2921068668365479, + "learning_rate": 9.654851211279496e-06, + "loss": 0.8175, + "step": 4411 + }, + { + "epoch": 0.24283119599317518, + "grad_norm": 0.8267684578895569, + "learning_rate": 9.65469293726375e-06, + "loss": 0.8896, + "step": 4412 + }, + { + "epoch": 0.24288623479553084, + "grad_norm": 0.8020186424255371, + "learning_rate": 9.654534628264576e-06, + "loss": 0.7145, + "step": 4413 + }, + { + "epoch": 0.2429412735978865, + "grad_norm": 0.8192574977874756, + "learning_rate": 9.654376284283166e-06, + "loss": 0.7451, + "step": 4414 + }, + { + "epoch": 0.24299631240024216, + "grad_norm": 0.7733662128448486, + "learning_rate": 9.65421790532071e-06, + "loss": 0.768, + "step": 4415 + }, + { + "epoch": 0.24305135120259783, + "grad_norm": 0.8342406153678894, + "learning_rate": 9.654059491378396e-06, + "loss": 0.8137, + "step": 4416 + }, + { + "epoch": 0.2431063900049535, + "grad_norm": 1.014755368232727, + "learning_rate": 9.653901042457418e-06, + "loss": 0.8922, + "step": 4417 + }, + { + "epoch": 0.24316142880730915, + "grad_norm": 0.864608645439148, + "learning_rate": 9.653742558558967e-06, + "loss": 0.9412, + "step": 4418 + }, + { + "epoch": 0.2432164676096648, + "grad_norm": 0.7383908033370972, + "learning_rate": 9.65358403968423e-06, + "loss": 0.8261, + "step": 4419 + }, + { + "epoch": 0.24327150641202047, + "grad_norm": 0.7464672923088074, + "learning_rate": 9.653425485834403e-06, + "loss": 0.7074, + "step": 4420 + }, + { + "epoch": 0.24332654521437613, + "grad_norm": 0.7010141611099243, + "learning_rate": 9.653266897010676e-06, + "loss": 0.6849, + "step": 4421 + }, + { + "epoch": 0.2433815840167318, + "grad_norm": 0.7135268449783325, + "learning_rate": 9.653108273214239e-06, + "loss": 0.8228, + "step": 4422 + }, + { + "epoch": 0.24343662281908746, + "grad_norm": 0.8061006665229797, + "learning_rate": 9.652949614446287e-06, + "loss": 0.8345, + "step": 4423 + }, + { + "epoch": 0.24349166162144312, + "grad_norm": 0.6954759955406189, + "learning_rate": 9.652790920708011e-06, + "loss": 0.7189, + "step": 4424 + }, + { + "epoch": 0.24354670042379878, + "grad_norm": 0.8669333457946777, + "learning_rate": 9.652632192000603e-06, + "loss": 0.8872, + "step": 4425 + }, + { + "epoch": 0.24360173922615444, + "grad_norm": 0.7445051670074463, + "learning_rate": 9.652473428325258e-06, + "loss": 0.826, + "step": 4426 + }, + { + "epoch": 0.2436567780285101, + "grad_norm": 0.7444632649421692, + "learning_rate": 9.652314629683165e-06, + "loss": 0.8568, + "step": 4427 + }, + { + "epoch": 0.24371181683086576, + "grad_norm": 0.7160165309906006, + "learning_rate": 9.652155796075524e-06, + "loss": 0.799, + "step": 4428 + }, + { + "epoch": 0.24376685563322142, + "grad_norm": 0.7098904252052307, + "learning_rate": 9.651996927503526e-06, + "loss": 0.8148, + "step": 4429 + }, + { + "epoch": 0.24382189443557709, + "grad_norm": 0.7911115288734436, + "learning_rate": 9.651838023968363e-06, + "loss": 0.8279, + "step": 4430 + }, + { + "epoch": 0.24387693323793275, + "grad_norm": 0.8887501955032349, + "learning_rate": 9.651679085471229e-06, + "loss": 0.8464, + "step": 4431 + }, + { + "epoch": 0.2439319720402884, + "grad_norm": 0.8343196511268616, + "learning_rate": 9.651520112013321e-06, + "loss": 0.7364, + "step": 4432 + }, + { + "epoch": 0.24398701084264407, + "grad_norm": 0.7279361486434937, + "learning_rate": 9.651361103595835e-06, + "loss": 0.7958, + "step": 4433 + }, + { + "epoch": 0.24404204964499973, + "grad_norm": 0.8221089243888855, + "learning_rate": 9.651202060219962e-06, + "loss": 0.7753, + "step": 4434 + }, + { + "epoch": 0.2440970884473554, + "grad_norm": 0.7205086350440979, + "learning_rate": 9.6510429818869e-06, + "loss": 0.7411, + "step": 4435 + }, + { + "epoch": 0.24415212724971105, + "grad_norm": 0.854967474937439, + "learning_rate": 9.650883868597845e-06, + "loss": 0.8192, + "step": 4436 + }, + { + "epoch": 0.24420716605206672, + "grad_norm": 0.7622473835945129, + "learning_rate": 9.65072472035399e-06, + "loss": 0.7645, + "step": 4437 + }, + { + "epoch": 0.24426220485442238, + "grad_norm": 0.7430302500724792, + "learning_rate": 9.650565537156533e-06, + "loss": 0.7817, + "step": 4438 + }, + { + "epoch": 0.24431724365677804, + "grad_norm": 0.8022677898406982, + "learning_rate": 9.650406319006672e-06, + "loss": 0.8035, + "step": 4439 + }, + { + "epoch": 0.2443722824591337, + "grad_norm": 0.7346476912498474, + "learning_rate": 9.6502470659056e-06, + "loss": 0.826, + "step": 4440 + }, + { + "epoch": 0.24442732126148936, + "grad_norm": 0.8393376469612122, + "learning_rate": 9.650087777854517e-06, + "loss": 0.8073, + "step": 4441 + }, + { + "epoch": 0.24448236006384502, + "grad_norm": 0.7920215129852295, + "learning_rate": 9.649928454854618e-06, + "loss": 0.7774, + "step": 4442 + }, + { + "epoch": 0.24453739886620068, + "grad_norm": 0.8192804455757141, + "learning_rate": 9.649769096907102e-06, + "loss": 0.7817, + "step": 4443 + }, + { + "epoch": 0.24459243766855635, + "grad_norm": 0.7727654576301575, + "learning_rate": 9.649609704013167e-06, + "loss": 0.8201, + "step": 4444 + }, + { + "epoch": 0.244647476470912, + "grad_norm": 0.8005746603012085, + "learning_rate": 9.649450276174008e-06, + "loss": 0.8893, + "step": 4445 + }, + { + "epoch": 0.24470251527326764, + "grad_norm": 0.9029125571250916, + "learning_rate": 9.649290813390828e-06, + "loss": 0.7735, + "step": 4446 + }, + { + "epoch": 0.2447575540756233, + "grad_norm": 0.8336170315742493, + "learning_rate": 9.64913131566482e-06, + "loss": 0.7505, + "step": 4447 + }, + { + "epoch": 0.24481259287797896, + "grad_norm": 1.0272265672683716, + "learning_rate": 9.648971782997188e-06, + "loss": 0.8371, + "step": 4448 + }, + { + "epoch": 0.24486763168033462, + "grad_norm": 0.8095843195915222, + "learning_rate": 9.648812215389128e-06, + "loss": 0.7599, + "step": 4449 + }, + { + "epoch": 0.24492267048269029, + "grad_norm": 0.7690166234970093, + "learning_rate": 9.648652612841837e-06, + "loss": 0.8172, + "step": 4450 + }, + { + "epoch": 0.24497770928504595, + "grad_norm": 0.8282617926597595, + "learning_rate": 9.64849297535652e-06, + "loss": 0.8477, + "step": 4451 + }, + { + "epoch": 0.2450327480874016, + "grad_norm": 0.8307822346687317, + "learning_rate": 9.648333302934373e-06, + "loss": 0.7744, + "step": 4452 + }, + { + "epoch": 0.24508778688975727, + "grad_norm": 0.7619080543518066, + "learning_rate": 9.6481735955766e-06, + "loss": 0.8417, + "step": 4453 + }, + { + "epoch": 0.24514282569211293, + "grad_norm": 0.7879447937011719, + "learning_rate": 9.648013853284396e-06, + "loss": 0.7799, + "step": 4454 + }, + { + "epoch": 0.2451978644944686, + "grad_norm": 0.7352256774902344, + "learning_rate": 9.647854076058965e-06, + "loss": 0.8386, + "step": 4455 + }, + { + "epoch": 0.24525290329682425, + "grad_norm": 0.8318933248519897, + "learning_rate": 9.647694263901507e-06, + "loss": 0.7631, + "step": 4456 + }, + { + "epoch": 0.24530794209917992, + "grad_norm": 0.8609912395477295, + "learning_rate": 9.647534416813221e-06, + "loss": 0.7479, + "step": 4457 + }, + { + "epoch": 0.24536298090153558, + "grad_norm": 0.9590480327606201, + "learning_rate": 9.647374534795311e-06, + "loss": 0.8543, + "step": 4458 + }, + { + "epoch": 0.24541801970389124, + "grad_norm": 0.7902723550796509, + "learning_rate": 9.647214617848979e-06, + "loss": 0.6796, + "step": 4459 + }, + { + "epoch": 0.2454730585062469, + "grad_norm": 0.7725642919540405, + "learning_rate": 9.647054665975427e-06, + "loss": 0.7563, + "step": 4460 + }, + { + "epoch": 0.24552809730860256, + "grad_norm": 0.8387014269828796, + "learning_rate": 9.646894679175853e-06, + "loss": 0.8184, + "step": 4461 + }, + { + "epoch": 0.24558313611095822, + "grad_norm": 0.9200852513313293, + "learning_rate": 9.646734657451464e-06, + "loss": 0.8436, + "step": 4462 + }, + { + "epoch": 0.24563817491331388, + "grad_norm": 0.7565840482711792, + "learning_rate": 9.646574600803462e-06, + "loss": 0.7393, + "step": 4463 + }, + { + "epoch": 0.24569321371566955, + "grad_norm": 0.7685559988021851, + "learning_rate": 9.646414509233048e-06, + "loss": 0.7836, + "step": 4464 + }, + { + "epoch": 0.2457482525180252, + "grad_norm": 0.8172003030776978, + "learning_rate": 9.646254382741428e-06, + "loss": 0.787, + "step": 4465 + }, + { + "epoch": 0.24580329132038087, + "grad_norm": 0.902632474899292, + "learning_rate": 9.646094221329802e-06, + "loss": 0.7139, + "step": 4466 + }, + { + "epoch": 0.24585833012273653, + "grad_norm": 0.7810692191123962, + "learning_rate": 9.645934024999374e-06, + "loss": 0.6904, + "step": 4467 + }, + { + "epoch": 0.2459133689250922, + "grad_norm": 0.7242134213447571, + "learning_rate": 9.645773793751352e-06, + "loss": 0.7035, + "step": 4468 + }, + { + "epoch": 0.24596840772744785, + "grad_norm": 0.7192920446395874, + "learning_rate": 9.645613527586938e-06, + "loss": 0.7081, + "step": 4469 + }, + { + "epoch": 0.2460234465298035, + "grad_norm": 0.7613840103149414, + "learning_rate": 9.645453226507336e-06, + "loss": 0.8066, + "step": 4470 + }, + { + "epoch": 0.24607848533215917, + "grad_norm": 0.8154922127723694, + "learning_rate": 9.64529289051375e-06, + "loss": 0.812, + "step": 4471 + }, + { + "epoch": 0.24613352413451484, + "grad_norm": 0.9521573185920715, + "learning_rate": 9.645132519607387e-06, + "loss": 0.7456, + "step": 4472 + }, + { + "epoch": 0.2461885629368705, + "grad_norm": 0.785943329334259, + "learning_rate": 9.64497211378945e-06, + "loss": 0.832, + "step": 4473 + }, + { + "epoch": 0.24624360173922616, + "grad_norm": 0.7675127983093262, + "learning_rate": 9.644811673061148e-06, + "loss": 0.7984, + "step": 4474 + }, + { + "epoch": 0.24629864054158182, + "grad_norm": 0.7317580580711365, + "learning_rate": 9.644651197423683e-06, + "loss": 0.7634, + "step": 4475 + }, + { + "epoch": 0.24635367934393748, + "grad_norm": 0.744937539100647, + "learning_rate": 9.644490686878265e-06, + "loss": 0.729, + "step": 4476 + }, + { + "epoch": 0.24640871814629314, + "grad_norm": 0.7472458481788635, + "learning_rate": 9.644330141426097e-06, + "loss": 0.7517, + "step": 4477 + }, + { + "epoch": 0.2464637569486488, + "grad_norm": 0.8379414677619934, + "learning_rate": 9.644169561068387e-06, + "loss": 0.8008, + "step": 4478 + }, + { + "epoch": 0.24651879575100447, + "grad_norm": 0.8845154047012329, + "learning_rate": 9.64400894580634e-06, + "loss": 0.8135, + "step": 4479 + }, + { + "epoch": 0.24657383455336013, + "grad_norm": 0.7394443154335022, + "learning_rate": 9.643848295641167e-06, + "loss": 0.7697, + "step": 4480 + }, + { + "epoch": 0.2466288733557158, + "grad_norm": 0.8840840458869934, + "learning_rate": 9.643687610574073e-06, + "loss": 0.825, + "step": 4481 + }, + { + "epoch": 0.24668391215807145, + "grad_norm": 0.7924874424934387, + "learning_rate": 9.643526890606265e-06, + "loss": 0.793, + "step": 4482 + }, + { + "epoch": 0.2467389509604271, + "grad_norm": 0.7966769933700562, + "learning_rate": 9.643366135738951e-06, + "loss": 0.8042, + "step": 4483 + }, + { + "epoch": 0.24679398976278277, + "grad_norm": 0.911756694316864, + "learning_rate": 9.643205345973343e-06, + "loss": 0.7801, + "step": 4484 + }, + { + "epoch": 0.24684902856513843, + "grad_norm": 0.903378963470459, + "learning_rate": 9.643044521310645e-06, + "loss": 0.7863, + "step": 4485 + }, + { + "epoch": 0.2469040673674941, + "grad_norm": 0.9021226167678833, + "learning_rate": 9.642883661752067e-06, + "loss": 0.8005, + "step": 4486 + }, + { + "epoch": 0.24695910616984976, + "grad_norm": 0.8853413462638855, + "learning_rate": 9.64272276729882e-06, + "loss": 0.8371, + "step": 4487 + }, + { + "epoch": 0.24701414497220542, + "grad_norm": 1.0654630661010742, + "learning_rate": 9.642561837952108e-06, + "loss": 0.92, + "step": 4488 + }, + { + "epoch": 0.24706918377456105, + "grad_norm": 0.8663573265075684, + "learning_rate": 9.642400873713146e-06, + "loss": 0.8066, + "step": 4489 + }, + { + "epoch": 0.2471242225769167, + "grad_norm": 0.7483134269714355, + "learning_rate": 9.642239874583143e-06, + "loss": 0.9013, + "step": 4490 + }, + { + "epoch": 0.24717926137927237, + "grad_norm": 0.7582293748855591, + "learning_rate": 9.642078840563306e-06, + "loss": 0.7795, + "step": 4491 + }, + { + "epoch": 0.24723430018162804, + "grad_norm": 0.8276637196540833, + "learning_rate": 9.641917771654848e-06, + "loss": 0.7756, + "step": 4492 + }, + { + "epoch": 0.2472893389839837, + "grad_norm": 0.697088360786438, + "learning_rate": 9.641756667858976e-06, + "loss": 0.7092, + "step": 4493 + }, + { + "epoch": 0.24734437778633936, + "grad_norm": 0.8960816860198975, + "learning_rate": 9.641595529176907e-06, + "loss": 0.8835, + "step": 4494 + }, + { + "epoch": 0.24739941658869502, + "grad_norm": 0.9210898280143738, + "learning_rate": 9.641434355609846e-06, + "loss": 0.7881, + "step": 4495 + }, + { + "epoch": 0.24745445539105068, + "grad_norm": 0.7205467820167542, + "learning_rate": 9.64127314715901e-06, + "loss": 0.7204, + "step": 4496 + }, + { + "epoch": 0.24750949419340634, + "grad_norm": 0.7313701510429382, + "learning_rate": 9.641111903825603e-06, + "loss": 0.8296, + "step": 4497 + }, + { + "epoch": 0.247564532995762, + "grad_norm": 0.771159827709198, + "learning_rate": 9.640950625610845e-06, + "loss": 0.7974, + "step": 4498 + }, + { + "epoch": 0.24761957179811767, + "grad_norm": 0.9227705597877502, + "learning_rate": 9.64078931251594e-06, + "loss": 0.9215, + "step": 4499 + }, + { + "epoch": 0.24767461060047333, + "grad_norm": 0.7569915652275085, + "learning_rate": 9.64062796454211e-06, + "loss": 0.83, + "step": 4500 + }, + { + "epoch": 0.247729649402829, + "grad_norm": 0.7453131675720215, + "learning_rate": 9.64046658169056e-06, + "loss": 0.6747, + "step": 4501 + }, + { + "epoch": 0.24778468820518465, + "grad_norm": 0.7228132486343384, + "learning_rate": 9.640305163962504e-06, + "loss": 0.7535, + "step": 4502 + }, + { + "epoch": 0.2478397270075403, + "grad_norm": 0.8160690069198608, + "learning_rate": 9.640143711359159e-06, + "loss": 0.8655, + "step": 4503 + }, + { + "epoch": 0.24789476580989597, + "grad_norm": 0.7641691565513611, + "learning_rate": 9.639982223881735e-06, + "loss": 0.8353, + "step": 4504 + }, + { + "epoch": 0.24794980461225163, + "grad_norm": 0.8669107556343079, + "learning_rate": 9.639820701531445e-06, + "loss": 0.8614, + "step": 4505 + }, + { + "epoch": 0.2480048434146073, + "grad_norm": 0.7433111667633057, + "learning_rate": 9.639659144309508e-06, + "loss": 0.6891, + "step": 4506 + }, + { + "epoch": 0.24805988221696296, + "grad_norm": 1.4303346872329712, + "learning_rate": 9.639497552217131e-06, + "loss": 0.8016, + "step": 4507 + }, + { + "epoch": 0.24811492101931862, + "grad_norm": 0.8684772253036499, + "learning_rate": 9.639335925255535e-06, + "loss": 0.8324, + "step": 4508 + }, + { + "epoch": 0.24816995982167428, + "grad_norm": 0.9222162365913391, + "learning_rate": 9.639174263425932e-06, + "loss": 0.8715, + "step": 4509 + }, + { + "epoch": 0.24822499862402994, + "grad_norm": 0.9789180755615234, + "learning_rate": 9.639012566729535e-06, + "loss": 0.823, + "step": 4510 + }, + { + "epoch": 0.2482800374263856, + "grad_norm": 0.8475140333175659, + "learning_rate": 9.638850835167564e-06, + "loss": 0.768, + "step": 4511 + }, + { + "epoch": 0.24833507622874126, + "grad_norm": 0.7943722605705261, + "learning_rate": 9.63868906874123e-06, + "loss": 0.788, + "step": 4512 + }, + { + "epoch": 0.24839011503109693, + "grad_norm": 0.8723915815353394, + "learning_rate": 9.63852726745175e-06, + "loss": 0.7865, + "step": 4513 + }, + { + "epoch": 0.2484451538334526, + "grad_norm": 0.837001383304596, + "learning_rate": 9.638365431300342e-06, + "loss": 0.7799, + "step": 4514 + }, + { + "epoch": 0.24850019263580825, + "grad_norm": 0.7992665767669678, + "learning_rate": 9.638203560288222e-06, + "loss": 0.8951, + "step": 4515 + }, + { + "epoch": 0.2485552314381639, + "grad_norm": 0.8712993264198303, + "learning_rate": 9.638041654416603e-06, + "loss": 0.8157, + "step": 4516 + }, + { + "epoch": 0.24861027024051957, + "grad_norm": 0.7176356911659241, + "learning_rate": 9.637879713686706e-06, + "loss": 0.8197, + "step": 4517 + }, + { + "epoch": 0.24866530904287523, + "grad_norm": 0.7624368071556091, + "learning_rate": 9.637717738099747e-06, + "loss": 0.7545, + "step": 4518 + }, + { + "epoch": 0.2487203478452309, + "grad_norm": 0.857222318649292, + "learning_rate": 9.637555727656943e-06, + "loss": 0.8146, + "step": 4519 + }, + { + "epoch": 0.24877538664758655, + "grad_norm": 0.7461313605308533, + "learning_rate": 9.637393682359511e-06, + "loss": 0.8569, + "step": 4520 + }, + { + "epoch": 0.24883042544994222, + "grad_norm": 0.8491896986961365, + "learning_rate": 9.637231602208668e-06, + "loss": 0.863, + "step": 4521 + }, + { + "epoch": 0.24888546425229788, + "grad_norm": 0.8139386177062988, + "learning_rate": 9.637069487205635e-06, + "loss": 0.7105, + "step": 4522 + }, + { + "epoch": 0.24894050305465354, + "grad_norm": 0.7782894968986511, + "learning_rate": 9.636907337351629e-06, + "loss": 0.8044, + "step": 4523 + }, + { + "epoch": 0.2489955418570092, + "grad_norm": 0.8225486874580383, + "learning_rate": 9.636745152647868e-06, + "loss": 0.7877, + "step": 4524 + }, + { + "epoch": 0.24905058065936486, + "grad_norm": 0.9087927341461182, + "learning_rate": 9.636582933095573e-06, + "loss": 0.8017, + "step": 4525 + }, + { + "epoch": 0.24910561946172052, + "grad_norm": 0.7392508387565613, + "learning_rate": 9.636420678695962e-06, + "loss": 0.7953, + "step": 4526 + }, + { + "epoch": 0.24916065826407618, + "grad_norm": 0.7906273007392883, + "learning_rate": 9.636258389450253e-06, + "loss": 0.9491, + "step": 4527 + }, + { + "epoch": 0.24921569706643185, + "grad_norm": 0.840394139289856, + "learning_rate": 9.636096065359666e-06, + "loss": 0.8621, + "step": 4528 + }, + { + "epoch": 0.2492707358687875, + "grad_norm": 0.7923862934112549, + "learning_rate": 9.635933706425424e-06, + "loss": 0.8215, + "step": 4529 + }, + { + "epoch": 0.24932577467114317, + "grad_norm": 0.8372805714607239, + "learning_rate": 9.635771312648744e-06, + "loss": 0.8845, + "step": 4530 + }, + { + "epoch": 0.24938081347349883, + "grad_norm": 0.7569165229797363, + "learning_rate": 9.635608884030848e-06, + "loss": 0.8406, + "step": 4531 + }, + { + "epoch": 0.24943585227585446, + "grad_norm": 0.8260865807533264, + "learning_rate": 9.635446420572956e-06, + "loss": 0.8418, + "step": 4532 + }, + { + "epoch": 0.24949089107821013, + "grad_norm": 0.6841318607330322, + "learning_rate": 9.635283922276291e-06, + "loss": 0.6732, + "step": 4533 + }, + { + "epoch": 0.2495459298805658, + "grad_norm": 0.7055326104164124, + "learning_rate": 9.635121389142072e-06, + "loss": 0.7702, + "step": 4534 + }, + { + "epoch": 0.24960096868292145, + "grad_norm": 0.7293457388877869, + "learning_rate": 9.63495882117152e-06, + "loss": 0.6836, + "step": 4535 + }, + { + "epoch": 0.2496560074852771, + "grad_norm": 0.7411924004554749, + "learning_rate": 9.63479621836586e-06, + "loss": 0.8686, + "step": 4536 + }, + { + "epoch": 0.24971104628763277, + "grad_norm": 0.7864643931388855, + "learning_rate": 9.634633580726313e-06, + "loss": 0.7801, + "step": 4537 + }, + { + "epoch": 0.24976608508998843, + "grad_norm": 0.9730797410011292, + "learning_rate": 9.634470908254099e-06, + "loss": 0.8362, + "step": 4538 + }, + { + "epoch": 0.2498211238923441, + "grad_norm": 0.8390370011329651, + "learning_rate": 9.634308200950442e-06, + "loss": 0.8079, + "step": 4539 + }, + { + "epoch": 0.24987616269469975, + "grad_norm": 0.8951246738433838, + "learning_rate": 9.634145458816566e-06, + "loss": 0.7662, + "step": 4540 + }, + { + "epoch": 0.24993120149705542, + "grad_norm": 0.7654157280921936, + "learning_rate": 9.633982681853693e-06, + "loss": 0.8699, + "step": 4541 + }, + { + "epoch": 0.24998624029941108, + "grad_norm": 0.8152109980583191, + "learning_rate": 9.633819870063046e-06, + "loss": 0.7875, + "step": 4542 + }, + { + "epoch": 0.25004127910176677, + "grad_norm": 0.9407321214675903, + "learning_rate": 9.63365702344585e-06, + "loss": 0.7708, + "step": 4543 + }, + { + "epoch": 0.2500963179041224, + "grad_norm": 0.8169927597045898, + "learning_rate": 9.633494142003327e-06, + "loss": 0.8078, + "step": 4544 + }, + { + "epoch": 0.2501513567064781, + "grad_norm": 0.7380755543708801, + "learning_rate": 9.633331225736704e-06, + "loss": 0.7818, + "step": 4545 + }, + { + "epoch": 0.2502063955088337, + "grad_norm": 0.8124812841415405, + "learning_rate": 9.633168274647203e-06, + "loss": 0.8133, + "step": 4546 + }, + { + "epoch": 0.2502614343111894, + "grad_norm": 0.8511367440223694, + "learning_rate": 9.63300528873605e-06, + "loss": 0.7747, + "step": 4547 + }, + { + "epoch": 0.25031647311354505, + "grad_norm": 0.7305121421813965, + "learning_rate": 9.632842268004469e-06, + "loss": 0.8479, + "step": 4548 + }, + { + "epoch": 0.25037151191590074, + "grad_norm": 0.7127692103385925, + "learning_rate": 9.632679212453686e-06, + "loss": 0.8514, + "step": 4549 + }, + { + "epoch": 0.25042655071825637, + "grad_norm": 0.8251872062683105, + "learning_rate": 9.632516122084926e-06, + "loss": 0.7686, + "step": 4550 + }, + { + "epoch": 0.25048158952061206, + "grad_norm": 0.6756613850593567, + "learning_rate": 9.632352996899413e-06, + "loss": 0.5959, + "step": 4551 + }, + { + "epoch": 0.2505366283229677, + "grad_norm": 0.9266120791435242, + "learning_rate": 9.632189836898377e-06, + "loss": 0.7889, + "step": 4552 + }, + { + "epoch": 0.2505916671253233, + "grad_norm": 0.769890546798706, + "learning_rate": 9.63202664208304e-06, + "loss": 0.7864, + "step": 4553 + }, + { + "epoch": 0.250646705927679, + "grad_norm": 0.7314025163650513, + "learning_rate": 9.631863412454634e-06, + "loss": 0.8088, + "step": 4554 + }, + { + "epoch": 0.25070174473003465, + "grad_norm": 0.818317711353302, + "learning_rate": 9.63170014801438e-06, + "loss": 0.7096, + "step": 4555 + }, + { + "epoch": 0.25075678353239034, + "grad_norm": 0.7538807392120361, + "learning_rate": 9.631536848763508e-06, + "loss": 0.7779, + "step": 4556 + }, + { + "epoch": 0.25081182233474597, + "grad_norm": 0.7658100128173828, + "learning_rate": 9.631373514703247e-06, + "loss": 0.8535, + "step": 4557 + }, + { + "epoch": 0.25086686113710166, + "grad_norm": 0.8019290566444397, + "learning_rate": 9.631210145834819e-06, + "loss": 0.8141, + "step": 4558 + }, + { + "epoch": 0.2509218999394573, + "grad_norm": 0.7257653474807739, + "learning_rate": 9.631046742159456e-06, + "loss": 0.7451, + "step": 4559 + }, + { + "epoch": 0.250976938741813, + "grad_norm": 0.7546024918556213, + "learning_rate": 9.630883303678386e-06, + "loss": 0.7707, + "step": 4560 + }, + { + "epoch": 0.2510319775441686, + "grad_norm": 0.7288938760757446, + "learning_rate": 9.630719830392835e-06, + "loss": 0.7362, + "step": 4561 + }, + { + "epoch": 0.2510870163465243, + "grad_norm": 0.7814223170280457, + "learning_rate": 9.630556322304036e-06, + "loss": 0.8514, + "step": 4562 + }, + { + "epoch": 0.25114205514887994, + "grad_norm": 0.7561381459236145, + "learning_rate": 9.630392779413214e-06, + "loss": 0.7659, + "step": 4563 + }, + { + "epoch": 0.25119709395123563, + "grad_norm": 0.750641942024231, + "learning_rate": 9.6302292017216e-06, + "loss": 0.8496, + "step": 4564 + }, + { + "epoch": 0.25125213275359126, + "grad_norm": 0.832155704498291, + "learning_rate": 9.630065589230422e-06, + "loss": 0.7778, + "step": 4565 + }, + { + "epoch": 0.25130717155594695, + "grad_norm": 0.8202440142631531, + "learning_rate": 9.62990194194091e-06, + "loss": 0.8962, + "step": 4566 + }, + { + "epoch": 0.2513622103583026, + "grad_norm": 0.8777977824211121, + "learning_rate": 9.629738259854295e-06, + "loss": 0.7215, + "step": 4567 + }, + { + "epoch": 0.2514172491606583, + "grad_norm": 1.1868599653244019, + "learning_rate": 9.629574542971806e-06, + "loss": 0.8238, + "step": 4568 + }, + { + "epoch": 0.2514722879630139, + "grad_norm": 0.9128753542900085, + "learning_rate": 9.629410791294675e-06, + "loss": 0.7638, + "step": 4569 + }, + { + "epoch": 0.2515273267653696, + "grad_norm": 0.7350082993507385, + "learning_rate": 9.629247004824132e-06, + "loss": 0.8041, + "step": 4570 + }, + { + "epoch": 0.25158236556772523, + "grad_norm": 0.7279660701751709, + "learning_rate": 9.629083183561407e-06, + "loss": 0.7377, + "step": 4571 + }, + { + "epoch": 0.2516374043700809, + "grad_norm": 0.8570461273193359, + "learning_rate": 9.628919327507732e-06, + "loss": 0.8106, + "step": 4572 + }, + { + "epoch": 0.25169244317243655, + "grad_norm": 0.8998312950134277, + "learning_rate": 9.62875543666434e-06, + "loss": 0.8171, + "step": 4573 + }, + { + "epoch": 0.25174748197479224, + "grad_norm": 0.7631624937057495, + "learning_rate": 9.628591511032456e-06, + "loss": 0.7871, + "step": 4574 + }, + { + "epoch": 0.2518025207771479, + "grad_norm": 0.7752320766448975, + "learning_rate": 9.628427550613322e-06, + "loss": 0.8241, + "step": 4575 + }, + { + "epoch": 0.25185755957950356, + "grad_norm": 0.8741563558578491, + "learning_rate": 9.628263555408163e-06, + "loss": 0.7312, + "step": 4576 + }, + { + "epoch": 0.2519125983818592, + "grad_norm": 0.8615008592605591, + "learning_rate": 9.628099525418216e-06, + "loss": 0.8586, + "step": 4577 + }, + { + "epoch": 0.2519676371842149, + "grad_norm": 0.8273662328720093, + "learning_rate": 9.62793546064471e-06, + "loss": 0.7838, + "step": 4578 + }, + { + "epoch": 0.2520226759865705, + "grad_norm": 0.7454090118408203, + "learning_rate": 9.627771361088882e-06, + "loss": 0.8461, + "step": 4579 + }, + { + "epoch": 0.2520777147889262, + "grad_norm": 0.8225379586219788, + "learning_rate": 9.627607226751962e-06, + "loss": 0.7792, + "step": 4580 + }, + { + "epoch": 0.25213275359128184, + "grad_norm": 0.8655416369438171, + "learning_rate": 9.627443057635184e-06, + "loss": 0.8165, + "step": 4581 + }, + { + "epoch": 0.25218779239363753, + "grad_norm": 0.7735984921455383, + "learning_rate": 9.627278853739783e-06, + "loss": 0.8208, + "step": 4582 + }, + { + "epoch": 0.25224283119599317, + "grad_norm": 0.8293350338935852, + "learning_rate": 9.627114615066994e-06, + "loss": 0.7394, + "step": 4583 + }, + { + "epoch": 0.25229786999834886, + "grad_norm": 0.7840214371681213, + "learning_rate": 9.626950341618048e-06, + "loss": 0.8522, + "step": 4584 + }, + { + "epoch": 0.2523529088007045, + "grad_norm": 0.7724186182022095, + "learning_rate": 9.626786033394185e-06, + "loss": 0.8175, + "step": 4585 + }, + { + "epoch": 0.2524079476030602, + "grad_norm": 1.0751588344573975, + "learning_rate": 9.626621690396634e-06, + "loss": 0.9229, + "step": 4586 + }, + { + "epoch": 0.2524629864054158, + "grad_norm": 0.7016913294792175, + "learning_rate": 9.626457312626634e-06, + "loss": 0.6883, + "step": 4587 + }, + { + "epoch": 0.2525180252077715, + "grad_norm": 0.918377697467804, + "learning_rate": 9.626292900085419e-06, + "loss": 0.7889, + "step": 4588 + }, + { + "epoch": 0.25257306401012714, + "grad_norm": 1.006564736366272, + "learning_rate": 9.626128452774226e-06, + "loss": 0.7888, + "step": 4589 + }, + { + "epoch": 0.2526281028124828, + "grad_norm": 1.0214998722076416, + "learning_rate": 9.625963970694287e-06, + "loss": 0.768, + "step": 4590 + }, + { + "epoch": 0.25268314161483846, + "grad_norm": 0.7980843186378479, + "learning_rate": 9.625799453846844e-06, + "loss": 0.8662, + "step": 4591 + }, + { + "epoch": 0.25273818041719415, + "grad_norm": 0.734582245349884, + "learning_rate": 9.625634902233128e-06, + "loss": 0.759, + "step": 4592 + }, + { + "epoch": 0.2527932192195498, + "grad_norm": 0.7185904383659363, + "learning_rate": 9.62547031585438e-06, + "loss": 0.774, + "step": 4593 + }, + { + "epoch": 0.25284825802190547, + "grad_norm": 0.7356622219085693, + "learning_rate": 9.625305694711835e-06, + "loss": 0.7435, + "step": 4594 + }, + { + "epoch": 0.2529032968242611, + "grad_norm": 0.7589355707168579, + "learning_rate": 9.62514103880673e-06, + "loss": 0.807, + "step": 4595 + }, + { + "epoch": 0.25295833562661674, + "grad_norm": 0.889228880405426, + "learning_rate": 9.624976348140305e-06, + "loss": 0.8609, + "step": 4596 + }, + { + "epoch": 0.2530133744289724, + "grad_norm": 0.7546125650405884, + "learning_rate": 9.624811622713793e-06, + "loss": 0.8379, + "step": 4597 + }, + { + "epoch": 0.25306841323132806, + "grad_norm": 0.8262770175933838, + "learning_rate": 9.624646862528436e-06, + "loss": 0.7611, + "step": 4598 + }, + { + "epoch": 0.25312345203368375, + "grad_norm": 0.8876076936721802, + "learning_rate": 9.624482067585472e-06, + "loss": 0.8106, + "step": 4599 + }, + { + "epoch": 0.2531784908360394, + "grad_norm": 0.7045544981956482, + "learning_rate": 9.624317237886137e-06, + "loss": 0.7121, + "step": 4600 + }, + { + "epoch": 0.25323352963839507, + "grad_norm": 0.7693355083465576, + "learning_rate": 9.624152373431672e-06, + "loss": 0.8052, + "step": 4601 + }, + { + "epoch": 0.2532885684407507, + "grad_norm": 0.8072683811187744, + "learning_rate": 9.623987474223316e-06, + "loss": 0.8543, + "step": 4602 + }, + { + "epoch": 0.2533436072431064, + "grad_norm": 0.8158687949180603, + "learning_rate": 9.62382254026231e-06, + "loss": 0.6922, + "step": 4603 + }, + { + "epoch": 0.25339864604546203, + "grad_norm": 0.7688641548156738, + "learning_rate": 9.623657571549887e-06, + "loss": 0.7198, + "step": 4604 + }, + { + "epoch": 0.2534536848478177, + "grad_norm": 0.7806578278541565, + "learning_rate": 9.623492568087293e-06, + "loss": 0.8539, + "step": 4605 + }, + { + "epoch": 0.25350872365017335, + "grad_norm": 0.9557347893714905, + "learning_rate": 9.623327529875769e-06, + "loss": 0.6996, + "step": 4606 + }, + { + "epoch": 0.25356376245252904, + "grad_norm": 0.9465067386627197, + "learning_rate": 9.62316245691655e-06, + "loss": 0.8756, + "step": 4607 + }, + { + "epoch": 0.2536188012548847, + "grad_norm": 0.8029165863990784, + "learning_rate": 9.62299734921088e-06, + "loss": 0.8573, + "step": 4608 + }, + { + "epoch": 0.25367384005724036, + "grad_norm": 0.7530128955841064, + "learning_rate": 9.62283220676e-06, + "loss": 0.7466, + "step": 4609 + }, + { + "epoch": 0.253728878859596, + "grad_norm": 0.6704453825950623, + "learning_rate": 9.622667029565151e-06, + "loss": 0.6512, + "step": 4610 + }, + { + "epoch": 0.2537839176619517, + "grad_norm": 0.7162728309631348, + "learning_rate": 9.622501817627574e-06, + "loss": 0.7615, + "step": 4611 + }, + { + "epoch": 0.2538389564643073, + "grad_norm": 0.7599188089370728, + "learning_rate": 9.622336570948509e-06, + "loss": 0.8463, + "step": 4612 + }, + { + "epoch": 0.253893995266663, + "grad_norm": 0.7922326922416687, + "learning_rate": 9.6221712895292e-06, + "loss": 0.9221, + "step": 4613 + }, + { + "epoch": 0.25394903406901864, + "grad_norm": 1.4635218381881714, + "learning_rate": 9.622005973370892e-06, + "loss": 0.9159, + "step": 4614 + }, + { + "epoch": 0.25400407287137433, + "grad_norm": 0.8695057034492493, + "learning_rate": 9.62184062247482e-06, + "loss": 0.6792, + "step": 4615 + }, + { + "epoch": 0.25405911167372996, + "grad_norm": 0.8070930242538452, + "learning_rate": 9.621675236842235e-06, + "loss": 0.8257, + "step": 4616 + }, + { + "epoch": 0.25411415047608565, + "grad_norm": 0.8642075061798096, + "learning_rate": 9.621509816474372e-06, + "loss": 0.8223, + "step": 4617 + }, + { + "epoch": 0.2541691892784413, + "grad_norm": 0.7131080031394958, + "learning_rate": 9.621344361372483e-06, + "loss": 0.6831, + "step": 4618 + }, + { + "epoch": 0.254224228080797, + "grad_norm": 0.7582216262817383, + "learning_rate": 9.621178871537804e-06, + "loss": 0.8091, + "step": 4619 + }, + { + "epoch": 0.2542792668831526, + "grad_norm": 0.7705016732215881, + "learning_rate": 9.62101334697158e-06, + "loss": 0.7537, + "step": 4620 + }, + { + "epoch": 0.2543343056855083, + "grad_norm": 0.7638342976570129, + "learning_rate": 9.62084778767506e-06, + "loss": 0.7661, + "step": 4621 + }, + { + "epoch": 0.25438934448786393, + "grad_norm": 0.9296607971191406, + "learning_rate": 9.620682193649482e-06, + "loss": 0.8875, + "step": 4622 + }, + { + "epoch": 0.2544443832902196, + "grad_norm": 0.795394778251648, + "learning_rate": 9.620516564896096e-06, + "loss": 0.6884, + "step": 4623 + }, + { + "epoch": 0.25449942209257526, + "grad_norm": 0.9164957404136658, + "learning_rate": 9.620350901416142e-06, + "loss": 0.8693, + "step": 4624 + }, + { + "epoch": 0.25455446089493095, + "grad_norm": 0.8306281566619873, + "learning_rate": 9.62018520321087e-06, + "loss": 0.8972, + "step": 4625 + }, + { + "epoch": 0.2546094996972866, + "grad_norm": 0.778831422328949, + "learning_rate": 9.620019470281521e-06, + "loss": 0.7574, + "step": 4626 + }, + { + "epoch": 0.25466453849964227, + "grad_norm": 0.9326225519180298, + "learning_rate": 9.619853702629343e-06, + "loss": 0.7712, + "step": 4627 + }, + { + "epoch": 0.2547195773019979, + "grad_norm": 0.8772255182266235, + "learning_rate": 9.619687900255581e-06, + "loss": 0.8241, + "step": 4628 + }, + { + "epoch": 0.2547746161043536, + "grad_norm": 0.8777550458908081, + "learning_rate": 9.619522063161482e-06, + "loss": 0.8724, + "step": 4629 + }, + { + "epoch": 0.2548296549067092, + "grad_norm": 0.8332602381706238, + "learning_rate": 9.61935619134829e-06, + "loss": 0.8716, + "step": 4630 + }, + { + "epoch": 0.2548846937090649, + "grad_norm": 0.8246355056762695, + "learning_rate": 9.619190284817255e-06, + "loss": 0.7789, + "step": 4631 + }, + { + "epoch": 0.25493973251142055, + "grad_norm": 0.7200644612312317, + "learning_rate": 9.61902434356962e-06, + "loss": 0.7956, + "step": 4632 + }, + { + "epoch": 0.25499477131377624, + "grad_norm": 0.827756404876709, + "learning_rate": 9.618858367606638e-06, + "loss": 0.7925, + "step": 4633 + }, + { + "epoch": 0.25504981011613187, + "grad_norm": 0.7749341726303101, + "learning_rate": 9.618692356929551e-06, + "loss": 0.8706, + "step": 4634 + }, + { + "epoch": 0.25510484891848756, + "grad_norm": 0.7233432531356812, + "learning_rate": 9.618526311539608e-06, + "loss": 0.7725, + "step": 4635 + }, + { + "epoch": 0.2551598877208432, + "grad_norm": 0.846340537071228, + "learning_rate": 9.618360231438058e-06, + "loss": 0.8758, + "step": 4636 + }, + { + "epoch": 0.2552149265231989, + "grad_norm": 0.8262908458709717, + "learning_rate": 9.61819411662615e-06, + "loss": 0.7758, + "step": 4637 + }, + { + "epoch": 0.2552699653255545, + "grad_norm": 0.7829110026359558, + "learning_rate": 9.61802796710513e-06, + "loss": 0.8494, + "step": 4638 + }, + { + "epoch": 0.25532500412791015, + "grad_norm": 0.7480815649032593, + "learning_rate": 9.617861782876247e-06, + "loss": 0.7639, + "step": 4639 + }, + { + "epoch": 0.25538004293026584, + "grad_norm": 0.8782994747161865, + "learning_rate": 9.617695563940752e-06, + "loss": 0.9651, + "step": 4640 + }, + { + "epoch": 0.25543508173262147, + "grad_norm": 0.7215868234634399, + "learning_rate": 9.617529310299895e-06, + "loss": 0.7833, + "step": 4641 + }, + { + "epoch": 0.25549012053497716, + "grad_norm": 0.8287535905838013, + "learning_rate": 9.617363021954922e-06, + "loss": 0.901, + "step": 4642 + }, + { + "epoch": 0.2555451593373328, + "grad_norm": 0.7679935097694397, + "learning_rate": 9.617196698907084e-06, + "loss": 0.761, + "step": 4643 + }, + { + "epoch": 0.2556001981396885, + "grad_norm": 0.7765942811965942, + "learning_rate": 9.617030341157632e-06, + "loss": 0.7356, + "step": 4644 + }, + { + "epoch": 0.2556552369420441, + "grad_norm": 0.6964583396911621, + "learning_rate": 9.616863948707816e-06, + "loss": 0.7683, + "step": 4645 + }, + { + "epoch": 0.2557102757443998, + "grad_norm": 0.8031953573226929, + "learning_rate": 9.616697521558886e-06, + "loss": 0.7875, + "step": 4646 + }, + { + "epoch": 0.25576531454675544, + "grad_norm": 0.7155965566635132, + "learning_rate": 9.616531059712094e-06, + "loss": 0.6516, + "step": 4647 + }, + { + "epoch": 0.25582035334911113, + "grad_norm": 0.6870070099830627, + "learning_rate": 9.61636456316869e-06, + "loss": 0.7217, + "step": 4648 + }, + { + "epoch": 0.25587539215146676, + "grad_norm": 0.7686315774917603, + "learning_rate": 9.616198031929926e-06, + "loss": 0.8136, + "step": 4649 + }, + { + "epoch": 0.25593043095382245, + "grad_norm": 0.7532772421836853, + "learning_rate": 9.616031465997054e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.2559854697561781, + "grad_norm": 0.8111574053764343, + "learning_rate": 9.615864865371323e-06, + "loss": 0.8501, + "step": 4651 + }, + { + "epoch": 0.2560405085585338, + "grad_norm": 0.771065890789032, + "learning_rate": 9.615698230053989e-06, + "loss": 0.7417, + "step": 4652 + }, + { + "epoch": 0.2560955473608894, + "grad_norm": 0.7468003034591675, + "learning_rate": 9.6155315600463e-06, + "loss": 0.7303, + "step": 4653 + }, + { + "epoch": 0.2561505861632451, + "grad_norm": 0.8041057586669922, + "learning_rate": 9.615364855349514e-06, + "loss": 0.8689, + "step": 4654 + }, + { + "epoch": 0.25620562496560073, + "grad_norm": 0.8439033627510071, + "learning_rate": 9.61519811596488e-06, + "loss": 0.8654, + "step": 4655 + }, + { + "epoch": 0.2562606637679564, + "grad_norm": 0.7768430113792419, + "learning_rate": 9.615031341893653e-06, + "loss": 0.8789, + "step": 4656 + }, + { + "epoch": 0.25631570257031205, + "grad_norm": 0.712876558303833, + "learning_rate": 9.614864533137086e-06, + "loss": 0.7497, + "step": 4657 + }, + { + "epoch": 0.25637074137266774, + "grad_norm": 0.7586949467658997, + "learning_rate": 9.614697689696431e-06, + "loss": 0.81, + "step": 4658 + }, + { + "epoch": 0.2564257801750234, + "grad_norm": 0.717078447341919, + "learning_rate": 9.614530811572946e-06, + "loss": 0.8023, + "step": 4659 + }, + { + "epoch": 0.25648081897737907, + "grad_norm": 0.7369407415390015, + "learning_rate": 9.61436389876788e-06, + "loss": 0.784, + "step": 4660 + }, + { + "epoch": 0.2565358577797347, + "grad_norm": 0.7536265850067139, + "learning_rate": 9.61419695128249e-06, + "loss": 0.7687, + "step": 4661 + }, + { + "epoch": 0.2565908965820904, + "grad_norm": 0.9718124866485596, + "learning_rate": 9.614029969118033e-06, + "loss": 0.8495, + "step": 4662 + }, + { + "epoch": 0.256645935384446, + "grad_norm": 1.1578630208969116, + "learning_rate": 9.613862952275762e-06, + "loss": 0.9189, + "step": 4663 + }, + { + "epoch": 0.2567009741868017, + "grad_norm": 0.7752498984336853, + "learning_rate": 9.613695900756929e-06, + "loss": 0.7677, + "step": 4664 + }, + { + "epoch": 0.25675601298915735, + "grad_norm": 0.9640393257141113, + "learning_rate": 9.613528814562795e-06, + "loss": 0.719, + "step": 4665 + }, + { + "epoch": 0.25681105179151303, + "grad_norm": 0.7690972089767456, + "learning_rate": 9.613361693694614e-06, + "loss": 0.7977, + "step": 4666 + }, + { + "epoch": 0.25686609059386867, + "grad_norm": 0.8390190601348877, + "learning_rate": 9.61319453815364e-06, + "loss": 0.8032, + "step": 4667 + }, + { + "epoch": 0.25692112939622436, + "grad_norm": 0.8293220400810242, + "learning_rate": 9.613027347941131e-06, + "loss": 0.8645, + "step": 4668 + }, + { + "epoch": 0.25697616819858, + "grad_norm": 0.8020731210708618, + "learning_rate": 9.612860123058344e-06, + "loss": 0.8374, + "step": 4669 + }, + { + "epoch": 0.2570312070009357, + "grad_norm": 0.7756736278533936, + "learning_rate": 9.612692863506534e-06, + "loss": 0.7318, + "step": 4670 + }, + { + "epoch": 0.2570862458032913, + "grad_norm": 0.895416259765625, + "learning_rate": 9.61252556928696e-06, + "loss": 0.9654, + "step": 4671 + }, + { + "epoch": 0.257141284605647, + "grad_norm": 0.8647375106811523, + "learning_rate": 9.61235824040088e-06, + "loss": 0.7411, + "step": 4672 + }, + { + "epoch": 0.25719632340800264, + "grad_norm": 0.6927250623703003, + "learning_rate": 9.612190876849546e-06, + "loss": 0.7558, + "step": 4673 + }, + { + "epoch": 0.2572513622103583, + "grad_norm": 0.7614898085594177, + "learning_rate": 9.612023478634222e-06, + "loss": 0.7696, + "step": 4674 + }, + { + "epoch": 0.25730640101271396, + "grad_norm": 0.7910586595535278, + "learning_rate": 9.611856045756166e-06, + "loss": 0.8207, + "step": 4675 + }, + { + "epoch": 0.25736143981506965, + "grad_norm": 0.7330125570297241, + "learning_rate": 9.611688578216632e-06, + "loss": 0.8615, + "step": 4676 + }, + { + "epoch": 0.2574164786174253, + "grad_norm": 0.7703417539596558, + "learning_rate": 9.611521076016882e-06, + "loss": 0.8321, + "step": 4677 + }, + { + "epoch": 0.25747151741978097, + "grad_norm": 0.7121796607971191, + "learning_rate": 9.611353539158174e-06, + "loss": 0.8228, + "step": 4678 + }, + { + "epoch": 0.2575265562221366, + "grad_norm": 0.8313117027282715, + "learning_rate": 9.611185967641768e-06, + "loss": 0.9012, + "step": 4679 + }, + { + "epoch": 0.2575815950244923, + "grad_norm": 0.806776225566864, + "learning_rate": 9.61101836146892e-06, + "loss": 0.769, + "step": 4680 + }, + { + "epoch": 0.2576366338268479, + "grad_norm": 0.7049515843391418, + "learning_rate": 9.610850720640894e-06, + "loss": 0.7938, + "step": 4681 + }, + { + "epoch": 0.25769167262920356, + "grad_norm": 0.7286638021469116, + "learning_rate": 9.610683045158948e-06, + "loss": 0.8168, + "step": 4682 + }, + { + "epoch": 0.25774671143155925, + "grad_norm": 0.7916898727416992, + "learning_rate": 9.610515335024345e-06, + "loss": 0.7681, + "step": 4683 + }, + { + "epoch": 0.2578017502339149, + "grad_norm": 0.7649673819541931, + "learning_rate": 9.61034759023834e-06, + "loss": 0.7273, + "step": 4684 + }, + { + "epoch": 0.2578567890362706, + "grad_norm": 0.8280686736106873, + "learning_rate": 9.610179810802196e-06, + "loss": 0.7968, + "step": 4685 + }, + { + "epoch": 0.2579118278386262, + "grad_norm": 0.7206569910049438, + "learning_rate": 9.610011996717175e-06, + "loss": 0.7359, + "step": 4686 + }, + { + "epoch": 0.2579668666409819, + "grad_norm": 0.7365424036979675, + "learning_rate": 9.60984414798454e-06, + "loss": 0.7962, + "step": 4687 + }, + { + "epoch": 0.25802190544333753, + "grad_norm": 0.8030344247817993, + "learning_rate": 9.609676264605549e-06, + "loss": 0.7931, + "step": 4688 + }, + { + "epoch": 0.2580769442456932, + "grad_norm": 0.8812693357467651, + "learning_rate": 9.609508346581464e-06, + "loss": 0.8493, + "step": 4689 + }, + { + "epoch": 0.25813198304804885, + "grad_norm": 0.8026734590530396, + "learning_rate": 9.60934039391355e-06, + "loss": 0.8368, + "step": 4690 + }, + { + "epoch": 0.25818702185040454, + "grad_norm": 0.8270768523216248, + "learning_rate": 9.609172406603067e-06, + "loss": 0.9077, + "step": 4691 + }, + { + "epoch": 0.2582420606527602, + "grad_norm": 0.7362856864929199, + "learning_rate": 9.609004384651276e-06, + "loss": 0.7384, + "step": 4692 + }, + { + "epoch": 0.25829709945511586, + "grad_norm": 0.7195929288864136, + "learning_rate": 9.608836328059444e-06, + "loss": 0.8475, + "step": 4693 + }, + { + "epoch": 0.2583521382574715, + "grad_norm": 0.7653167843818665, + "learning_rate": 9.60866823682883e-06, + "loss": 0.7704, + "step": 4694 + }, + { + "epoch": 0.2584071770598272, + "grad_norm": 0.7056792974472046, + "learning_rate": 9.6085001109607e-06, + "loss": 0.7835, + "step": 4695 + }, + { + "epoch": 0.2584622158621828, + "grad_norm": 0.7299804091453552, + "learning_rate": 9.60833195045632e-06, + "loss": 0.7894, + "step": 4696 + }, + { + "epoch": 0.2585172546645385, + "grad_norm": 0.7235645055770874, + "learning_rate": 9.608163755316948e-06, + "loss": 0.8113, + "step": 4697 + }, + { + "epoch": 0.25857229346689414, + "grad_norm": 0.7066782116889954, + "learning_rate": 9.60799552554385e-06, + "loss": 0.739, + "step": 4698 + }, + { + "epoch": 0.25862733226924983, + "grad_norm": 0.769930362701416, + "learning_rate": 9.607827261138291e-06, + "loss": 0.7565, + "step": 4699 + }, + { + "epoch": 0.25868237107160547, + "grad_norm": 0.8875935077667236, + "learning_rate": 9.607658962101538e-06, + "loss": 0.849, + "step": 4700 + }, + { + "epoch": 0.25873740987396115, + "grad_norm": 0.7887380123138428, + "learning_rate": 9.60749062843485e-06, + "loss": 0.8795, + "step": 4701 + }, + { + "epoch": 0.2587924486763168, + "grad_norm": 0.7600420117378235, + "learning_rate": 9.607322260139499e-06, + "loss": 0.7581, + "step": 4702 + }, + { + "epoch": 0.2588474874786725, + "grad_norm": 0.7431491017341614, + "learning_rate": 9.607153857216746e-06, + "loss": 0.7119, + "step": 4703 + }, + { + "epoch": 0.2589025262810281, + "grad_norm": 0.7444193363189697, + "learning_rate": 9.606985419667858e-06, + "loss": 0.7492, + "step": 4704 + }, + { + "epoch": 0.2589575650833838, + "grad_norm": 0.8348917365074158, + "learning_rate": 9.6068169474941e-06, + "loss": 0.7656, + "step": 4705 + }, + { + "epoch": 0.25901260388573943, + "grad_norm": 0.6790240406990051, + "learning_rate": 9.60664844069674e-06, + "loss": 0.6354, + "step": 4706 + }, + { + "epoch": 0.2590676426880951, + "grad_norm": 0.8425769805908203, + "learning_rate": 9.606479899277044e-06, + "loss": 0.7927, + "step": 4707 + }, + { + "epoch": 0.25912268149045076, + "grad_norm": 0.7234740853309631, + "learning_rate": 9.606311323236277e-06, + "loss": 0.8122, + "step": 4708 + }, + { + "epoch": 0.25917772029280645, + "grad_norm": 0.839507520198822, + "learning_rate": 9.606142712575707e-06, + "loss": 0.8807, + "step": 4709 + }, + { + "epoch": 0.2592327590951621, + "grad_norm": 0.7155291438102722, + "learning_rate": 9.605974067296601e-06, + "loss": 0.7852, + "step": 4710 + }, + { + "epoch": 0.25928779789751777, + "grad_norm": 0.7222152352333069, + "learning_rate": 9.605805387400228e-06, + "loss": 0.7362, + "step": 4711 + }, + { + "epoch": 0.2593428366998734, + "grad_norm": 0.8350114226341248, + "learning_rate": 9.605636672887854e-06, + "loss": 0.7201, + "step": 4712 + }, + { + "epoch": 0.2593978755022291, + "grad_norm": 0.6805943250656128, + "learning_rate": 9.605467923760747e-06, + "loss": 0.6936, + "step": 4713 + }, + { + "epoch": 0.2594529143045847, + "grad_norm": 0.7863980531692505, + "learning_rate": 9.605299140020177e-06, + "loss": 0.9079, + "step": 4714 + }, + { + "epoch": 0.2595079531069404, + "grad_norm": 0.838843584060669, + "learning_rate": 9.60513032166741e-06, + "loss": 0.839, + "step": 4715 + }, + { + "epoch": 0.25956299190929605, + "grad_norm": 0.7872797250747681, + "learning_rate": 9.60496146870372e-06, + "loss": 0.9164, + "step": 4716 + }, + { + "epoch": 0.25961803071165174, + "grad_norm": 0.7300794720649719, + "learning_rate": 9.604792581130369e-06, + "loss": 0.8227, + "step": 4717 + }, + { + "epoch": 0.25967306951400737, + "grad_norm": 0.8420879244804382, + "learning_rate": 9.60462365894863e-06, + "loss": 0.7865, + "step": 4718 + }, + { + "epoch": 0.25972810831636306, + "grad_norm": 0.807697057723999, + "learning_rate": 9.604454702159771e-06, + "loss": 0.9081, + "step": 4719 + }, + { + "epoch": 0.2597831471187187, + "grad_norm": 0.9041245579719543, + "learning_rate": 9.604285710765064e-06, + "loss": 0.8102, + "step": 4720 + }, + { + "epoch": 0.2598381859210744, + "grad_norm": 0.7061690092086792, + "learning_rate": 9.604116684765779e-06, + "loss": 0.762, + "step": 4721 + }, + { + "epoch": 0.25989322472343, + "grad_norm": 0.7790346741676331, + "learning_rate": 9.603947624163186e-06, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.2599482635257857, + "grad_norm": 0.8109704256057739, + "learning_rate": 9.603778528958553e-06, + "loss": 0.9105, + "step": 4723 + }, + { + "epoch": 0.26000330232814134, + "grad_norm": 0.7396997213363647, + "learning_rate": 9.603609399153153e-06, + "loss": 0.8384, + "step": 4724 + }, + { + "epoch": 0.260058341130497, + "grad_norm": 0.8594317436218262, + "learning_rate": 9.603440234748257e-06, + "loss": 0.8301, + "step": 4725 + }, + { + "epoch": 0.26011337993285266, + "grad_norm": 0.7087241411209106, + "learning_rate": 9.603271035745138e-06, + "loss": 0.6652, + "step": 4726 + }, + { + "epoch": 0.2601684187352083, + "grad_norm": 0.7405440211296082, + "learning_rate": 9.603101802145065e-06, + "loss": 0.7804, + "step": 4727 + }, + { + "epoch": 0.260223457537564, + "grad_norm": 0.8637508749961853, + "learning_rate": 9.602932533949312e-06, + "loss": 0.8509, + "step": 4728 + }, + { + "epoch": 0.2602784963399196, + "grad_norm": 0.7040451765060425, + "learning_rate": 9.60276323115915e-06, + "loss": 0.7842, + "step": 4729 + }, + { + "epoch": 0.2603335351422753, + "grad_norm": 0.7743955254554749, + "learning_rate": 9.602593893775852e-06, + "loss": 0.8492, + "step": 4730 + }, + { + "epoch": 0.26038857394463094, + "grad_norm": 0.7110480070114136, + "learning_rate": 9.602424521800688e-06, + "loss": 0.7227, + "step": 4731 + }, + { + "epoch": 0.26044361274698663, + "grad_norm": 1.0066583156585693, + "learning_rate": 9.602255115234936e-06, + "loss": 0.8825, + "step": 4732 + }, + { + "epoch": 0.26049865154934226, + "grad_norm": 0.7746492624282837, + "learning_rate": 9.602085674079864e-06, + "loss": 0.8316, + "step": 4733 + }, + { + "epoch": 0.26055369035169795, + "grad_norm": 0.7394356727600098, + "learning_rate": 9.60191619833675e-06, + "loss": 0.746, + "step": 4734 + }, + { + "epoch": 0.2606087291540536, + "grad_norm": 0.7140582203865051, + "learning_rate": 9.601746688006866e-06, + "loss": 0.7204, + "step": 4735 + }, + { + "epoch": 0.2606637679564093, + "grad_norm": 0.753399133682251, + "learning_rate": 9.601577143091483e-06, + "loss": 0.8157, + "step": 4736 + }, + { + "epoch": 0.2607188067587649, + "grad_norm": 0.674320638179779, + "learning_rate": 9.601407563591881e-06, + "loss": 0.7279, + "step": 4737 + }, + { + "epoch": 0.2607738455611206, + "grad_norm": 0.855944037437439, + "learning_rate": 9.60123794950933e-06, + "loss": 0.804, + "step": 4738 + }, + { + "epoch": 0.26082888436347623, + "grad_norm": 0.6833948493003845, + "learning_rate": 9.601068300845106e-06, + "loss": 0.701, + "step": 4739 + }, + { + "epoch": 0.2608839231658319, + "grad_norm": 0.8085536360740662, + "learning_rate": 9.600898617600485e-06, + "loss": 0.8435, + "step": 4740 + }, + { + "epoch": 0.26093896196818755, + "grad_norm": 0.752849817276001, + "learning_rate": 9.600728899776741e-06, + "loss": 0.7205, + "step": 4741 + }, + { + "epoch": 0.26099400077054324, + "grad_norm": 0.7320554852485657, + "learning_rate": 9.600559147375151e-06, + "loss": 0.7556, + "step": 4742 + }, + { + "epoch": 0.2610490395728989, + "grad_norm": 0.7789202928543091, + "learning_rate": 9.600389360396988e-06, + "loss": 0.8467, + "step": 4743 + }, + { + "epoch": 0.26110407837525457, + "grad_norm": 0.8480898141860962, + "learning_rate": 9.600219538843532e-06, + "loss": 0.7762, + "step": 4744 + }, + { + "epoch": 0.2611591171776102, + "grad_norm": 0.8382542133331299, + "learning_rate": 9.600049682716055e-06, + "loss": 0.9051, + "step": 4745 + }, + { + "epoch": 0.2612141559799659, + "grad_norm": 0.8319274187088013, + "learning_rate": 9.599879792015838e-06, + "loss": 0.8221, + "step": 4746 + }, + { + "epoch": 0.2612691947823215, + "grad_norm": 0.7325875163078308, + "learning_rate": 9.599709866744156e-06, + "loss": 0.7968, + "step": 4747 + }, + { + "epoch": 0.2613242335846772, + "grad_norm": 0.7053360342979431, + "learning_rate": 9.599539906902285e-06, + "loss": 0.7073, + "step": 4748 + }, + { + "epoch": 0.26137927238703285, + "grad_norm": 0.763017475605011, + "learning_rate": 9.599369912491503e-06, + "loss": 0.7031, + "step": 4749 + }, + { + "epoch": 0.26143431118938854, + "grad_norm": 0.6816151738166809, + "learning_rate": 9.599199883513088e-06, + "loss": 0.7295, + "step": 4750 + }, + { + "epoch": 0.26148934999174417, + "grad_norm": 0.8143941164016724, + "learning_rate": 9.599029819968319e-06, + "loss": 0.8449, + "step": 4751 + }, + { + "epoch": 0.26154438879409986, + "grad_norm": 0.8093858361244202, + "learning_rate": 9.598859721858471e-06, + "loss": 0.8397, + "step": 4752 + }, + { + "epoch": 0.2615994275964555, + "grad_norm": 0.7431835532188416, + "learning_rate": 9.598689589184827e-06, + "loss": 0.7299, + "step": 4753 + }, + { + "epoch": 0.2616544663988112, + "grad_norm": 0.9871510863304138, + "learning_rate": 9.59851942194866e-06, + "loss": 0.7992, + "step": 4754 + }, + { + "epoch": 0.2617095052011668, + "grad_norm": 0.9304273724555969, + "learning_rate": 9.598349220151254e-06, + "loss": 0.7519, + "step": 4755 + }, + { + "epoch": 0.2617645440035225, + "grad_norm": 0.9361812472343445, + "learning_rate": 9.598178983793886e-06, + "loss": 0.8131, + "step": 4756 + }, + { + "epoch": 0.26181958280587814, + "grad_norm": 0.7783429622650146, + "learning_rate": 9.598008712877835e-06, + "loss": 0.7351, + "step": 4757 + }, + { + "epoch": 0.2618746216082338, + "grad_norm": 0.8739376068115234, + "learning_rate": 9.597838407404381e-06, + "loss": 0.9458, + "step": 4758 + }, + { + "epoch": 0.26192966041058946, + "grad_norm": 0.7076277732849121, + "learning_rate": 9.597668067374805e-06, + "loss": 0.7632, + "step": 4759 + }, + { + "epoch": 0.26198469921294515, + "grad_norm": 0.7652345299720764, + "learning_rate": 9.597497692790386e-06, + "loss": 0.8018, + "step": 4760 + }, + { + "epoch": 0.2620397380153008, + "grad_norm": 0.7332149147987366, + "learning_rate": 9.597327283652405e-06, + "loss": 0.8223, + "step": 4761 + }, + { + "epoch": 0.26209477681765647, + "grad_norm": 0.8361638784408569, + "learning_rate": 9.597156839962145e-06, + "loss": 0.8784, + "step": 4762 + }, + { + "epoch": 0.2621498156200121, + "grad_norm": 1.183772325515747, + "learning_rate": 9.596986361720882e-06, + "loss": 0.8768, + "step": 4763 + }, + { + "epoch": 0.2622048544223678, + "grad_norm": 0.9895418882369995, + "learning_rate": 9.596815848929902e-06, + "loss": 0.714, + "step": 4764 + }, + { + "epoch": 0.26225989322472343, + "grad_norm": 0.8210558295249939, + "learning_rate": 9.59664530159048e-06, + "loss": 0.7246, + "step": 4765 + }, + { + "epoch": 0.2623149320270791, + "grad_norm": 0.8003455996513367, + "learning_rate": 9.596474719703908e-06, + "loss": 0.8385, + "step": 4766 + }, + { + "epoch": 0.26236997082943475, + "grad_norm": 0.7555826306343079, + "learning_rate": 9.59630410327146e-06, + "loss": 0.7243, + "step": 4767 + }, + { + "epoch": 0.2624250096317904, + "grad_norm": 0.7746273279190063, + "learning_rate": 9.596133452294421e-06, + "loss": 0.8763, + "step": 4768 + }, + { + "epoch": 0.2624800484341461, + "grad_norm": 0.7238507866859436, + "learning_rate": 9.595962766774074e-06, + "loss": 0.8302, + "step": 4769 + }, + { + "epoch": 0.2625350872365017, + "grad_norm": 0.7874132394790649, + "learning_rate": 9.595792046711699e-06, + "loss": 0.7979, + "step": 4770 + }, + { + "epoch": 0.2625901260388574, + "grad_norm": 0.8792033791542053, + "learning_rate": 9.595621292108583e-06, + "loss": 0.8555, + "step": 4771 + }, + { + "epoch": 0.26264516484121303, + "grad_norm": 0.7026945948600769, + "learning_rate": 9.595450502966006e-06, + "loss": 0.718, + "step": 4772 + }, + { + "epoch": 0.2627002036435687, + "grad_norm": 0.7747959494590759, + "learning_rate": 9.595279679285254e-06, + "loss": 0.8329, + "step": 4773 + }, + { + "epoch": 0.26275524244592435, + "grad_norm": 0.697979748249054, + "learning_rate": 9.59510882106761e-06, + "loss": 0.7456, + "step": 4774 + }, + { + "epoch": 0.26281028124828004, + "grad_norm": 0.7600447535514832, + "learning_rate": 9.594937928314359e-06, + "loss": 0.875, + "step": 4775 + }, + { + "epoch": 0.2628653200506357, + "grad_norm": 0.7591384649276733, + "learning_rate": 9.594767001026783e-06, + "loss": 0.7607, + "step": 4776 + }, + { + "epoch": 0.26292035885299136, + "grad_norm": 0.9267380833625793, + "learning_rate": 9.59459603920617e-06, + "loss": 0.8926, + "step": 4777 + }, + { + "epoch": 0.262975397655347, + "grad_norm": 0.7751328349113464, + "learning_rate": 9.594425042853802e-06, + "loss": 0.7449, + "step": 4778 + }, + { + "epoch": 0.2630304364577027, + "grad_norm": 0.7066012620925903, + "learning_rate": 9.594254011970966e-06, + "loss": 0.8374, + "step": 4779 + }, + { + "epoch": 0.2630854752600583, + "grad_norm": 0.7564317584037781, + "learning_rate": 9.594082946558945e-06, + "loss": 0.735, + "step": 4780 + }, + { + "epoch": 0.263140514062414, + "grad_norm": 0.8151416182518005, + "learning_rate": 9.593911846619027e-06, + "loss": 0.8575, + "step": 4781 + }, + { + "epoch": 0.26319555286476964, + "grad_norm": 0.719261646270752, + "learning_rate": 9.593740712152497e-06, + "loss": 0.7981, + "step": 4782 + }, + { + "epoch": 0.26325059166712533, + "grad_norm": 0.8627344369888306, + "learning_rate": 9.593569543160642e-06, + "loss": 0.895, + "step": 4783 + }, + { + "epoch": 0.26330563046948097, + "grad_norm": 1.293272614479065, + "learning_rate": 9.593398339644748e-06, + "loss": 0.7531, + "step": 4784 + }, + { + "epoch": 0.26336066927183666, + "grad_norm": 0.8475207686424255, + "learning_rate": 9.593227101606102e-06, + "loss": 0.9091, + "step": 4785 + }, + { + "epoch": 0.2634157080741923, + "grad_norm": 0.78054279088974, + "learning_rate": 9.593055829045989e-06, + "loss": 0.7692, + "step": 4786 + }, + { + "epoch": 0.263470746876548, + "grad_norm": 0.7677399516105652, + "learning_rate": 9.592884521965699e-06, + "loss": 0.6232, + "step": 4787 + }, + { + "epoch": 0.2635257856789036, + "grad_norm": 0.7232677340507507, + "learning_rate": 9.59271318036652e-06, + "loss": 0.8087, + "step": 4788 + }, + { + "epoch": 0.2635808244812593, + "grad_norm": 0.8728463649749756, + "learning_rate": 9.592541804249735e-06, + "loss": 0.7824, + "step": 4789 + }, + { + "epoch": 0.26363586328361494, + "grad_norm": 0.7569910883903503, + "learning_rate": 9.592370393616637e-06, + "loss": 0.7418, + "step": 4790 + }, + { + "epoch": 0.2636909020859706, + "grad_norm": 0.7631934285163879, + "learning_rate": 9.592198948468511e-06, + "loss": 0.7929, + "step": 4791 + }, + { + "epoch": 0.26374594088832626, + "grad_norm": 0.8021631240844727, + "learning_rate": 9.592027468806649e-06, + "loss": 0.8111, + "step": 4792 + }, + { + "epoch": 0.26380097969068195, + "grad_norm": 0.9454651474952698, + "learning_rate": 9.591855954632336e-06, + "loss": 0.8239, + "step": 4793 + }, + { + "epoch": 0.2638560184930376, + "grad_norm": 0.672924280166626, + "learning_rate": 9.591684405946863e-06, + "loss": 0.6877, + "step": 4794 + }, + { + "epoch": 0.26391105729539327, + "grad_norm": 0.7942802906036377, + "learning_rate": 9.59151282275152e-06, + "loss": 0.9002, + "step": 4795 + }, + { + "epoch": 0.2639660960977489, + "grad_norm": 0.7131155133247375, + "learning_rate": 9.591341205047596e-06, + "loss": 0.7692, + "step": 4796 + }, + { + "epoch": 0.2640211349001046, + "grad_norm": 1.0395869016647339, + "learning_rate": 9.59116955283638e-06, + "loss": 0.8352, + "step": 4797 + }, + { + "epoch": 0.2640761737024602, + "grad_norm": 0.9503256678581238, + "learning_rate": 9.590997866119163e-06, + "loss": 1.0287, + "step": 4798 + }, + { + "epoch": 0.2641312125048159, + "grad_norm": 0.7539612054824829, + "learning_rate": 9.590826144897235e-06, + "loss": 0.872, + "step": 4799 + }, + { + "epoch": 0.26418625130717155, + "grad_norm": 0.7067893743515015, + "learning_rate": 9.590654389171885e-06, + "loss": 0.7636, + "step": 4800 + }, + { + "epoch": 0.26424129010952724, + "grad_norm": 0.7355281710624695, + "learning_rate": 9.590482598944407e-06, + "loss": 0.7715, + "step": 4801 + }, + { + "epoch": 0.26429632891188287, + "grad_norm": 0.7589674592018127, + "learning_rate": 9.590310774216089e-06, + "loss": 0.7451, + "step": 4802 + }, + { + "epoch": 0.26435136771423856, + "grad_norm": 0.701386034488678, + "learning_rate": 9.590138914988226e-06, + "loss": 0.7317, + "step": 4803 + }, + { + "epoch": 0.2644064065165942, + "grad_norm": 0.7663118243217468, + "learning_rate": 9.589967021262105e-06, + "loss": 0.8227, + "step": 4804 + }, + { + "epoch": 0.2644614453189499, + "grad_norm": 0.7059655785560608, + "learning_rate": 9.589795093039023e-06, + "loss": 0.7829, + "step": 4805 + }, + { + "epoch": 0.2645164841213055, + "grad_norm": 0.7377020120620728, + "learning_rate": 9.58962313032027e-06, + "loss": 0.8308, + "step": 4806 + }, + { + "epoch": 0.2645715229236612, + "grad_norm": 0.8635388612747192, + "learning_rate": 9.589451133107134e-06, + "loss": 0.7882, + "step": 4807 + }, + { + "epoch": 0.26462656172601684, + "grad_norm": 0.8282824754714966, + "learning_rate": 9.589279101400915e-06, + "loss": 0.8055, + "step": 4808 + }, + { + "epoch": 0.26468160052837253, + "grad_norm": 0.7026814818382263, + "learning_rate": 9.589107035202903e-06, + "loss": 0.7567, + "step": 4809 + }, + { + "epoch": 0.26473663933072816, + "grad_norm": 0.7575708031654358, + "learning_rate": 9.588934934514392e-06, + "loss": 0.7456, + "step": 4810 + }, + { + "epoch": 0.2647916781330838, + "grad_norm": 0.9732069969177246, + "learning_rate": 9.588762799336671e-06, + "loss": 0.8217, + "step": 4811 + }, + { + "epoch": 0.2648467169354395, + "grad_norm": 0.786803126335144, + "learning_rate": 9.58859062967104e-06, + "loss": 0.729, + "step": 4812 + }, + { + "epoch": 0.2649017557377951, + "grad_norm": 0.8068973422050476, + "learning_rate": 9.588418425518789e-06, + "loss": 0.8204, + "step": 4813 + }, + { + "epoch": 0.2649567945401508, + "grad_norm": 0.8222702145576477, + "learning_rate": 9.588246186881213e-06, + "loss": 0.8349, + "step": 4814 + }, + { + "epoch": 0.26501183334250644, + "grad_norm": 0.7560802698135376, + "learning_rate": 9.588073913759608e-06, + "loss": 0.7601, + "step": 4815 + }, + { + "epoch": 0.26506687214486213, + "grad_norm": 0.9221365451812744, + "learning_rate": 9.587901606155266e-06, + "loss": 0.7725, + "step": 4816 + }, + { + "epoch": 0.26512191094721776, + "grad_norm": 0.8092262744903564, + "learning_rate": 9.587729264069485e-06, + "loss": 0.9074, + "step": 4817 + }, + { + "epoch": 0.26517694974957345, + "grad_norm": 0.8183920979499817, + "learning_rate": 9.587556887503557e-06, + "loss": 0.8321, + "step": 4818 + }, + { + "epoch": 0.2652319885519291, + "grad_norm": 0.7023420929908752, + "learning_rate": 9.587384476458781e-06, + "loss": 0.7842, + "step": 4819 + }, + { + "epoch": 0.2652870273542848, + "grad_norm": 1.2864880561828613, + "learning_rate": 9.58721203093645e-06, + "loss": 0.7519, + "step": 4820 + }, + { + "epoch": 0.2653420661566404, + "grad_norm": 0.8133784532546997, + "learning_rate": 9.587039550937864e-06, + "loss": 0.8208, + "step": 4821 + }, + { + "epoch": 0.2653971049589961, + "grad_norm": 0.739732027053833, + "learning_rate": 9.586867036464314e-06, + "loss": 0.8553, + "step": 4822 + }, + { + "epoch": 0.26545214376135173, + "grad_norm": 0.7539162635803223, + "learning_rate": 9.5866944875171e-06, + "loss": 0.7385, + "step": 4823 + }, + { + "epoch": 0.2655071825637074, + "grad_norm": 0.8012336492538452, + "learning_rate": 9.58652190409752e-06, + "loss": 0.8343, + "step": 4824 + }, + { + "epoch": 0.26556222136606306, + "grad_norm": 0.7972521185874939, + "learning_rate": 9.586349286206865e-06, + "loss": 0.8481, + "step": 4825 + }, + { + "epoch": 0.26561726016841875, + "grad_norm": 0.7772900462150574, + "learning_rate": 9.58617663384644e-06, + "loss": 0.7655, + "step": 4826 + }, + { + "epoch": 0.2656722989707744, + "grad_norm": 0.677916944026947, + "learning_rate": 9.586003947017537e-06, + "loss": 0.696, + "step": 4827 + }, + { + "epoch": 0.26572733777313007, + "grad_norm": 0.8254117369651794, + "learning_rate": 9.585831225721455e-06, + "loss": 0.7841, + "step": 4828 + }, + { + "epoch": 0.2657823765754857, + "grad_norm": 0.7256904244422913, + "learning_rate": 9.585658469959496e-06, + "loss": 0.8057, + "step": 4829 + }, + { + "epoch": 0.2658374153778414, + "grad_norm": 0.7651757001876831, + "learning_rate": 9.585485679732953e-06, + "loss": 0.7918, + "step": 4830 + }, + { + "epoch": 0.265892454180197, + "grad_norm": 0.7581052184104919, + "learning_rate": 9.58531285504313e-06, + "loss": 0.759, + "step": 4831 + }, + { + "epoch": 0.2659474929825527, + "grad_norm": 0.7190486192703247, + "learning_rate": 9.58513999589132e-06, + "loss": 0.7403, + "step": 4832 + }, + { + "epoch": 0.26600253178490835, + "grad_norm": 0.8603141903877258, + "learning_rate": 9.584967102278825e-06, + "loss": 0.8944, + "step": 4833 + }, + { + "epoch": 0.26605757058726404, + "grad_norm": 0.806297779083252, + "learning_rate": 9.584794174206947e-06, + "loss": 0.7039, + "step": 4834 + }, + { + "epoch": 0.26611260938961967, + "grad_norm": 0.7604451775550842, + "learning_rate": 9.584621211676981e-06, + "loss": 0.8076, + "step": 4835 + }, + { + "epoch": 0.26616764819197536, + "grad_norm": 0.7276773452758789, + "learning_rate": 9.584448214690232e-06, + "loss": 0.786, + "step": 4836 + }, + { + "epoch": 0.266222686994331, + "grad_norm": 0.8737080693244934, + "learning_rate": 9.584275183247994e-06, + "loss": 0.8071, + "step": 4837 + }, + { + "epoch": 0.2662777257966867, + "grad_norm": 0.8447219133377075, + "learning_rate": 9.584102117351574e-06, + "loss": 0.7682, + "step": 4838 + }, + { + "epoch": 0.2663327645990423, + "grad_norm": 0.7001703381538391, + "learning_rate": 9.583929017002268e-06, + "loss": 0.7077, + "step": 4839 + }, + { + "epoch": 0.266387803401398, + "grad_norm": 0.7935730218887329, + "learning_rate": 9.583755882201377e-06, + "loss": 0.8122, + "step": 4840 + }, + { + "epoch": 0.26644284220375364, + "grad_norm": 0.8763312697410583, + "learning_rate": 9.583582712950207e-06, + "loss": 0.8241, + "step": 4841 + }, + { + "epoch": 0.2664978810061093, + "grad_norm": 0.7910245656967163, + "learning_rate": 9.583409509250055e-06, + "loss": 0.7717, + "step": 4842 + }, + { + "epoch": 0.26655291980846496, + "grad_norm": 0.7975226640701294, + "learning_rate": 9.583236271102222e-06, + "loss": 0.7165, + "step": 4843 + }, + { + "epoch": 0.26660795861082065, + "grad_norm": 0.8060342073440552, + "learning_rate": 9.583062998508014e-06, + "loss": 0.7659, + "step": 4844 + }, + { + "epoch": 0.2666629974131763, + "grad_norm": 0.8779375553131104, + "learning_rate": 9.582889691468732e-06, + "loss": 0.8207, + "step": 4845 + }, + { + "epoch": 0.266718036215532, + "grad_norm": 0.7409310936927795, + "learning_rate": 9.582716349985677e-06, + "loss": 0.8439, + "step": 4846 + }, + { + "epoch": 0.2667730750178876, + "grad_norm": 0.8871899843215942, + "learning_rate": 9.582542974060152e-06, + "loss": 0.8305, + "step": 4847 + }, + { + "epoch": 0.2668281138202433, + "grad_norm": 0.9003115296363831, + "learning_rate": 9.58236956369346e-06, + "loss": 0.8334, + "step": 4848 + }, + { + "epoch": 0.26688315262259893, + "grad_norm": 1.0149577856063843, + "learning_rate": 9.582196118886909e-06, + "loss": 0.7962, + "step": 4849 + }, + { + "epoch": 0.2669381914249546, + "grad_norm": 0.785214900970459, + "learning_rate": 9.582022639641795e-06, + "loss": 0.7806, + "step": 4850 + }, + { + "epoch": 0.26699323022731025, + "grad_norm": 0.9833952188491821, + "learning_rate": 9.581849125959426e-06, + "loss": 0.7607, + "step": 4851 + }, + { + "epoch": 0.26704826902966594, + "grad_norm": 1.404751181602478, + "learning_rate": 9.581675577841104e-06, + "loss": 0.9046, + "step": 4852 + }, + { + "epoch": 0.2671033078320216, + "grad_norm": 0.791159451007843, + "learning_rate": 9.581501995288137e-06, + "loss": 0.6582, + "step": 4853 + }, + { + "epoch": 0.2671583466343772, + "grad_norm": 0.8507272005081177, + "learning_rate": 9.581328378301827e-06, + "loss": 0.8946, + "step": 4854 + }, + { + "epoch": 0.2672133854367329, + "grad_norm": 0.7372786998748779, + "learning_rate": 9.58115472688348e-06, + "loss": 0.7865, + "step": 4855 + }, + { + "epoch": 0.26726842423908853, + "grad_norm": 0.8293853998184204, + "learning_rate": 9.580981041034398e-06, + "loss": 0.9113, + "step": 4856 + }, + { + "epoch": 0.2673234630414442, + "grad_norm": 0.7212402820587158, + "learning_rate": 9.580807320755889e-06, + "loss": 0.7149, + "step": 4857 + }, + { + "epoch": 0.26737850184379985, + "grad_norm": 0.7885197401046753, + "learning_rate": 9.58063356604926e-06, + "loss": 0.8651, + "step": 4858 + }, + { + "epoch": 0.26743354064615554, + "grad_norm": 0.8444308042526245, + "learning_rate": 9.580459776915814e-06, + "loss": 0.7968, + "step": 4859 + }, + { + "epoch": 0.2674885794485112, + "grad_norm": 0.7974254488945007, + "learning_rate": 9.58028595335686e-06, + "loss": 0.8499, + "step": 4860 + }, + { + "epoch": 0.26754361825086687, + "grad_norm": 0.7491242289543152, + "learning_rate": 9.580112095373702e-06, + "loss": 0.8278, + "step": 4861 + }, + { + "epoch": 0.2675986570532225, + "grad_norm": 0.6856499314308167, + "learning_rate": 9.579938202967646e-06, + "loss": 0.7466, + "step": 4862 + }, + { + "epoch": 0.2676536958555782, + "grad_norm": 0.7347447872161865, + "learning_rate": 9.579764276140002e-06, + "loss": 0.8046, + "step": 4863 + }, + { + "epoch": 0.2677087346579338, + "grad_norm": 0.6797083020210266, + "learning_rate": 9.579590314892077e-06, + "loss": 0.7012, + "step": 4864 + }, + { + "epoch": 0.2677637734602895, + "grad_norm": 0.8219562768936157, + "learning_rate": 9.579416319225175e-06, + "loss": 0.7592, + "step": 4865 + }, + { + "epoch": 0.26781881226264515, + "grad_norm": 0.7388357520103455, + "learning_rate": 9.579242289140607e-06, + "loss": 0.8179, + "step": 4866 + }, + { + "epoch": 0.26787385106500083, + "grad_norm": 0.7394490838050842, + "learning_rate": 9.579068224639679e-06, + "loss": 0.694, + "step": 4867 + }, + { + "epoch": 0.26792888986735647, + "grad_norm": 0.7309017181396484, + "learning_rate": 9.578894125723699e-06, + "loss": 0.7882, + "step": 4868 + }, + { + "epoch": 0.26798392866971216, + "grad_norm": 0.7785035967826843, + "learning_rate": 9.578719992393978e-06, + "loss": 0.8142, + "step": 4869 + }, + { + "epoch": 0.2680389674720678, + "grad_norm": 0.8983079195022583, + "learning_rate": 9.57854582465182e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.2680940062744235, + "grad_norm": 0.7433765530586243, + "learning_rate": 9.578371622498542e-06, + "loss": 0.8937, + "step": 4871 + }, + { + "epoch": 0.2681490450767791, + "grad_norm": 0.8808667659759521, + "learning_rate": 9.578197385935446e-06, + "loss": 0.7821, + "step": 4872 + }, + { + "epoch": 0.2682040838791348, + "grad_norm": 0.825794517993927, + "learning_rate": 9.578023114963843e-06, + "loss": 0.8228, + "step": 4873 + }, + { + "epoch": 0.26825912268149044, + "grad_norm": 1.0165129899978638, + "learning_rate": 9.577848809585046e-06, + "loss": 0.7964, + "step": 4874 + }, + { + "epoch": 0.2683141614838461, + "grad_norm": 0.742028534412384, + "learning_rate": 9.577674469800362e-06, + "loss": 0.9126, + "step": 4875 + }, + { + "epoch": 0.26836920028620176, + "grad_norm": 0.7571890354156494, + "learning_rate": 9.577500095611101e-06, + "loss": 0.879, + "step": 4876 + }, + { + "epoch": 0.26842423908855745, + "grad_norm": 0.7577160596847534, + "learning_rate": 9.577325687018575e-06, + "loss": 0.8048, + "step": 4877 + }, + { + "epoch": 0.2684792778909131, + "grad_norm": 0.7704411745071411, + "learning_rate": 9.577151244024095e-06, + "loss": 0.7451, + "step": 4878 + }, + { + "epoch": 0.26853431669326877, + "grad_norm": 0.8323166966438293, + "learning_rate": 9.57697676662897e-06, + "loss": 0.7591, + "step": 4879 + }, + { + "epoch": 0.2685893554956244, + "grad_norm": 0.7257028222084045, + "learning_rate": 9.576802254834516e-06, + "loss": 0.7941, + "step": 4880 + }, + { + "epoch": 0.2686443942979801, + "grad_norm": 0.8170442581176758, + "learning_rate": 9.57662770864204e-06, + "loss": 0.8617, + "step": 4881 + }, + { + "epoch": 0.2686994331003357, + "grad_norm": 0.7435339689254761, + "learning_rate": 9.576453128052852e-06, + "loss": 0.7683, + "step": 4882 + }, + { + "epoch": 0.2687544719026914, + "grad_norm": 0.7932955026626587, + "learning_rate": 9.576278513068271e-06, + "loss": 0.7103, + "step": 4883 + }, + { + "epoch": 0.26880951070504705, + "grad_norm": 0.8008469939231873, + "learning_rate": 9.576103863689604e-06, + "loss": 0.8144, + "step": 4884 + }, + { + "epoch": 0.26886454950740274, + "grad_norm": 0.8573774695396423, + "learning_rate": 9.575929179918167e-06, + "loss": 0.8992, + "step": 4885 + }, + { + "epoch": 0.2689195883097584, + "grad_norm": 0.7326993942260742, + "learning_rate": 9.57575446175527e-06, + "loss": 0.699, + "step": 4886 + }, + { + "epoch": 0.26897462711211406, + "grad_norm": 0.8249791264533997, + "learning_rate": 9.575579709202228e-06, + "loss": 0.7445, + "step": 4887 + }, + { + "epoch": 0.2690296659144697, + "grad_norm": 0.7136644124984741, + "learning_rate": 9.575404922260351e-06, + "loss": 0.779, + "step": 4888 + }, + { + "epoch": 0.2690847047168254, + "grad_norm": 1.0130438804626465, + "learning_rate": 9.575230100930958e-06, + "loss": 0.8535, + "step": 4889 + }, + { + "epoch": 0.269139743519181, + "grad_norm": 0.6784926652908325, + "learning_rate": 9.575055245215358e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.2691947823215367, + "grad_norm": 0.7492508888244629, + "learning_rate": 9.57488035511487e-06, + "loss": 0.6748, + "step": 4891 + }, + { + "epoch": 0.26924982112389234, + "grad_norm": 0.7951217889785767, + "learning_rate": 9.574705430630807e-06, + "loss": 0.8119, + "step": 4892 + }, + { + "epoch": 0.26930485992624803, + "grad_norm": 0.9756677746772766, + "learning_rate": 9.574530471764478e-06, + "loss": 0.855, + "step": 4893 + }, + { + "epoch": 0.26935989872860366, + "grad_norm": 0.7806811928749084, + "learning_rate": 9.574355478517206e-06, + "loss": 0.8432, + "step": 4894 + }, + { + "epoch": 0.26941493753095935, + "grad_norm": 0.7814774513244629, + "learning_rate": 9.574180450890301e-06, + "loss": 0.8226, + "step": 4895 + }, + { + "epoch": 0.269469976333315, + "grad_norm": 0.7745325565338135, + "learning_rate": 9.574005388885081e-06, + "loss": 0.7722, + "step": 4896 + }, + { + "epoch": 0.2695250151356706, + "grad_norm": 0.7805666327476501, + "learning_rate": 9.573830292502862e-06, + "loss": 0.8357, + "step": 4897 + }, + { + "epoch": 0.2695800539380263, + "grad_norm": 0.8428031802177429, + "learning_rate": 9.573655161744958e-06, + "loss": 0.8056, + "step": 4898 + }, + { + "epoch": 0.26963509274038194, + "grad_norm": 0.7896600961685181, + "learning_rate": 9.573479996612684e-06, + "loss": 0.7984, + "step": 4899 + }, + { + "epoch": 0.26969013154273763, + "grad_norm": 0.7718683481216431, + "learning_rate": 9.57330479710736e-06, + "loss": 0.7527, + "step": 4900 + }, + { + "epoch": 0.26974517034509327, + "grad_norm": 0.7868129014968872, + "learning_rate": 9.573129563230302e-06, + "loss": 0.7876, + "step": 4901 + }, + { + "epoch": 0.26980020914744895, + "grad_norm": 0.8493777513504028, + "learning_rate": 9.572954294982826e-06, + "loss": 0.864, + "step": 4902 + }, + { + "epoch": 0.2698552479498046, + "grad_norm": 0.7492502331733704, + "learning_rate": 9.57277899236625e-06, + "loss": 0.8236, + "step": 4903 + }, + { + "epoch": 0.2699102867521603, + "grad_norm": 1.0534250736236572, + "learning_rate": 9.57260365538189e-06, + "loss": 0.8012, + "step": 4904 + }, + { + "epoch": 0.2699653255545159, + "grad_norm": 0.7557470202445984, + "learning_rate": 9.572428284031065e-06, + "loss": 0.9084, + "step": 4905 + }, + { + "epoch": 0.2700203643568716, + "grad_norm": 0.8055123686790466, + "learning_rate": 9.572252878315094e-06, + "loss": 0.7468, + "step": 4906 + }, + { + "epoch": 0.27007540315922723, + "grad_norm": 0.8399039506912231, + "learning_rate": 9.572077438235294e-06, + "loss": 0.9293, + "step": 4907 + }, + { + "epoch": 0.2701304419615829, + "grad_norm": 0.9800041317939758, + "learning_rate": 9.571901963792983e-06, + "loss": 0.8664, + "step": 4908 + }, + { + "epoch": 0.27018548076393856, + "grad_norm": 0.7732129096984863, + "learning_rate": 9.571726454989482e-06, + "loss": 0.7227, + "step": 4909 + }, + { + "epoch": 0.27024051956629425, + "grad_norm": 0.730754017829895, + "learning_rate": 9.571550911826109e-06, + "loss": 0.6467, + "step": 4910 + }, + { + "epoch": 0.2702955583686499, + "grad_norm": 0.8245325684547424, + "learning_rate": 9.57137533430418e-06, + "loss": 0.7847, + "step": 4911 + }, + { + "epoch": 0.27035059717100557, + "grad_norm": 0.8606786131858826, + "learning_rate": 9.57119972242502e-06, + "loss": 0.9556, + "step": 4912 + }, + { + "epoch": 0.2704056359733612, + "grad_norm": 0.7480195164680481, + "learning_rate": 9.571024076189947e-06, + "loss": 0.8504, + "step": 4913 + }, + { + "epoch": 0.2704606747757169, + "grad_norm": 0.718913197517395, + "learning_rate": 9.57084839560028e-06, + "loss": 0.7869, + "step": 4914 + }, + { + "epoch": 0.2705157135780725, + "grad_norm": 0.9778180122375488, + "learning_rate": 9.57067268065734e-06, + "loss": 0.8514, + "step": 4915 + }, + { + "epoch": 0.2705707523804282, + "grad_norm": 0.7394844889640808, + "learning_rate": 9.570496931362448e-06, + "loss": 0.7906, + "step": 4916 + }, + { + "epoch": 0.27062579118278385, + "grad_norm": 0.7648600339889526, + "learning_rate": 9.570321147716923e-06, + "loss": 0.8194, + "step": 4917 + }, + { + "epoch": 0.27068082998513954, + "grad_norm": 0.8002632260322571, + "learning_rate": 9.57014532972209e-06, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.27073586878749517, + "grad_norm": 0.8668341040611267, + "learning_rate": 9.569969477379267e-06, + "loss": 0.8954, + "step": 4919 + }, + { + "epoch": 0.27079090758985086, + "grad_norm": 0.7403327226638794, + "learning_rate": 9.569793590689775e-06, + "loss": 0.7755, + "step": 4920 + }, + { + "epoch": 0.2708459463922065, + "grad_norm": 0.7399682998657227, + "learning_rate": 9.569617669654938e-06, + "loss": 0.8203, + "step": 4921 + }, + { + "epoch": 0.2709009851945622, + "grad_norm": 0.788600504398346, + "learning_rate": 9.56944171427608e-06, + "loss": 0.7565, + "step": 4922 + }, + { + "epoch": 0.2709560239969178, + "grad_norm": 0.7044861912727356, + "learning_rate": 9.56926572455452e-06, + "loss": 0.7073, + "step": 4923 + }, + { + "epoch": 0.2710110627992735, + "grad_norm": 0.8195114135742188, + "learning_rate": 9.569089700491581e-06, + "loss": 0.8658, + "step": 4924 + }, + { + "epoch": 0.27106610160162914, + "grad_norm": 0.7792258858680725, + "learning_rate": 9.568913642088589e-06, + "loss": 0.8628, + "step": 4925 + }, + { + "epoch": 0.27112114040398483, + "grad_norm": 0.764930248260498, + "learning_rate": 9.568737549346862e-06, + "loss": 0.7761, + "step": 4926 + }, + { + "epoch": 0.27117617920634046, + "grad_norm": 0.7226328253746033, + "learning_rate": 9.56856142226773e-06, + "loss": 0.7208, + "step": 4927 + }, + { + "epoch": 0.27123121800869615, + "grad_norm": 0.8726598620414734, + "learning_rate": 9.568385260852512e-06, + "loss": 0.8599, + "step": 4928 + }, + { + "epoch": 0.2712862568110518, + "grad_norm": 1.0126571655273438, + "learning_rate": 9.568209065102533e-06, + "loss": 0.8145, + "step": 4929 + }, + { + "epoch": 0.2713412956134075, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.568032835019116e-06, + "loss": 0.6758, + "step": 4930 + }, + { + "epoch": 0.2713963344157631, + "grad_norm": 0.6955474019050598, + "learning_rate": 9.567856570603589e-06, + "loss": 0.7461, + "step": 4931 + }, + { + "epoch": 0.2714513732181188, + "grad_norm": 0.7136832475662231, + "learning_rate": 9.567680271857274e-06, + "loss": 0.7692, + "step": 4932 + }, + { + "epoch": 0.27150641202047443, + "grad_norm": 1.2288198471069336, + "learning_rate": 9.567503938781497e-06, + "loss": 0.7815, + "step": 4933 + }, + { + "epoch": 0.2715614508228301, + "grad_norm": 0.9182234406471252, + "learning_rate": 9.567327571377584e-06, + "loss": 0.8822, + "step": 4934 + }, + { + "epoch": 0.27161648962518575, + "grad_norm": 0.7684763669967651, + "learning_rate": 9.567151169646859e-06, + "loss": 0.7618, + "step": 4935 + }, + { + "epoch": 0.27167152842754144, + "grad_norm": 0.872360348701477, + "learning_rate": 9.566974733590647e-06, + "loss": 0.7975, + "step": 4936 + }, + { + "epoch": 0.2717265672298971, + "grad_norm": 0.9010463356971741, + "learning_rate": 9.566798263210277e-06, + "loss": 0.7159, + "step": 4937 + }, + { + "epoch": 0.27178160603225276, + "grad_norm": 0.7254281044006348, + "learning_rate": 9.566621758507072e-06, + "loss": 0.6724, + "step": 4938 + }, + { + "epoch": 0.2718366448346084, + "grad_norm": 0.8478212356567383, + "learning_rate": 9.566445219482363e-06, + "loss": 0.659, + "step": 4939 + }, + { + "epoch": 0.27189168363696403, + "grad_norm": 0.9038714170455933, + "learning_rate": 9.56626864613747e-06, + "loss": 0.8766, + "step": 4940 + }, + { + "epoch": 0.2719467224393197, + "grad_norm": 0.9704582691192627, + "learning_rate": 9.566092038473728e-06, + "loss": 0.8972, + "step": 4941 + }, + { + "epoch": 0.27200176124167535, + "grad_norm": 0.7069430947303772, + "learning_rate": 9.565915396492459e-06, + "loss": 0.8116, + "step": 4942 + }, + { + "epoch": 0.27205680004403104, + "grad_norm": 0.7432642579078674, + "learning_rate": 9.565738720194993e-06, + "loss": 0.847, + "step": 4943 + }, + { + "epoch": 0.2721118388463867, + "grad_norm": 0.6813814043998718, + "learning_rate": 9.565562009582655e-06, + "loss": 0.7146, + "step": 4944 + }, + { + "epoch": 0.27216687764874237, + "grad_norm": 0.7447707056999207, + "learning_rate": 9.565385264656776e-06, + "loss": 0.7696, + "step": 4945 + }, + { + "epoch": 0.272221916451098, + "grad_norm": 0.875073254108429, + "learning_rate": 9.565208485418685e-06, + "loss": 0.8714, + "step": 4946 + }, + { + "epoch": 0.2722769552534537, + "grad_norm": 0.7753880620002747, + "learning_rate": 9.565031671869707e-06, + "loss": 0.739, + "step": 4947 + }, + { + "epoch": 0.2723319940558093, + "grad_norm": 0.749264121055603, + "learning_rate": 9.564854824011172e-06, + "loss": 0.7957, + "step": 4948 + }, + { + "epoch": 0.272387032858165, + "grad_norm": 0.6733991503715515, + "learning_rate": 9.564677941844412e-06, + "loss": 0.7402, + "step": 4949 + }, + { + "epoch": 0.27244207166052065, + "grad_norm": 0.7426447868347168, + "learning_rate": 9.564501025370753e-06, + "loss": 0.7977, + "step": 4950 + }, + { + "epoch": 0.27249711046287634, + "grad_norm": 0.7930514812469482, + "learning_rate": 9.564324074591529e-06, + "loss": 0.8485, + "step": 4951 + }, + { + "epoch": 0.27255214926523197, + "grad_norm": 0.8087072968482971, + "learning_rate": 9.564147089508064e-06, + "loss": 0.9215, + "step": 4952 + }, + { + "epoch": 0.27260718806758766, + "grad_norm": 0.7560327053070068, + "learning_rate": 9.563970070121694e-06, + "loss": 0.7966, + "step": 4953 + }, + { + "epoch": 0.2726622268699433, + "grad_norm": 0.735573947429657, + "learning_rate": 9.563793016433744e-06, + "loss": 0.7737, + "step": 4954 + }, + { + "epoch": 0.272717265672299, + "grad_norm": 0.7603545784950256, + "learning_rate": 9.563615928445548e-06, + "loss": 0.7717, + "step": 4955 + }, + { + "epoch": 0.2727723044746546, + "grad_norm": 0.7185375094413757, + "learning_rate": 9.563438806158437e-06, + "loss": 0.8057, + "step": 4956 + }, + { + "epoch": 0.2728273432770103, + "grad_norm": 0.7619272470474243, + "learning_rate": 9.56326164957374e-06, + "loss": 0.8173, + "step": 4957 + }, + { + "epoch": 0.27288238207936594, + "grad_norm": 0.7868000864982605, + "learning_rate": 9.563084458692793e-06, + "loss": 0.6855, + "step": 4958 + }, + { + "epoch": 0.2729374208817216, + "grad_norm": 0.7949535846710205, + "learning_rate": 9.562907233516923e-06, + "loss": 0.7754, + "step": 4959 + }, + { + "epoch": 0.27299245968407726, + "grad_norm": 0.7037919163703918, + "learning_rate": 9.562729974047462e-06, + "loss": 0.7419, + "step": 4960 + }, + { + "epoch": 0.27304749848643295, + "grad_norm": 0.7236568927764893, + "learning_rate": 9.562552680285746e-06, + "loss": 0.7135, + "step": 4961 + }, + { + "epoch": 0.2731025372887886, + "grad_norm": 0.8410467505455017, + "learning_rate": 9.562375352233105e-06, + "loss": 0.8507, + "step": 4962 + }, + { + "epoch": 0.27315757609114427, + "grad_norm": 0.8043560981750488, + "learning_rate": 9.562197989890871e-06, + "loss": 0.8484, + "step": 4963 + }, + { + "epoch": 0.2732126148934999, + "grad_norm": 0.6926127672195435, + "learning_rate": 9.56202059326038e-06, + "loss": 0.8087, + "step": 4964 + }, + { + "epoch": 0.2732676536958556, + "grad_norm": 0.7149024605751038, + "learning_rate": 9.561843162342961e-06, + "loss": 0.7349, + "step": 4965 + }, + { + "epoch": 0.27332269249821123, + "grad_norm": 0.7165781855583191, + "learning_rate": 9.561665697139952e-06, + "loss": 0.8139, + "step": 4966 + }, + { + "epoch": 0.2733777313005669, + "grad_norm": 0.7481133341789246, + "learning_rate": 9.561488197652684e-06, + "loss": 0.7712, + "step": 4967 + }, + { + "epoch": 0.27343277010292255, + "grad_norm": 0.6928209066390991, + "learning_rate": 9.561310663882491e-06, + "loss": 0.7524, + "step": 4968 + }, + { + "epoch": 0.27348780890527824, + "grad_norm": 0.7397856116294861, + "learning_rate": 9.561133095830708e-06, + "loss": 0.718, + "step": 4969 + }, + { + "epoch": 0.2735428477076339, + "grad_norm": 0.7712383270263672, + "learning_rate": 9.560955493498672e-06, + "loss": 0.8201, + "step": 4970 + }, + { + "epoch": 0.27359788650998956, + "grad_norm": 0.96076899766922, + "learning_rate": 9.560777856887714e-06, + "loss": 0.8555, + "step": 4971 + }, + { + "epoch": 0.2736529253123452, + "grad_norm": 0.7331019639968872, + "learning_rate": 9.56060018599917e-06, + "loss": 0.8315, + "step": 4972 + }, + { + "epoch": 0.2737079641147009, + "grad_norm": 0.7157140970230103, + "learning_rate": 9.560422480834374e-06, + "loss": 0.7177, + "step": 4973 + }, + { + "epoch": 0.2737630029170565, + "grad_norm": 0.807614266872406, + "learning_rate": 9.560244741394666e-06, + "loss": 0.8413, + "step": 4974 + }, + { + "epoch": 0.2738180417194122, + "grad_norm": 0.7618574500083923, + "learning_rate": 9.560066967681378e-06, + "loss": 0.8248, + "step": 4975 + }, + { + "epoch": 0.27387308052176784, + "grad_norm": 0.7886885404586792, + "learning_rate": 9.559889159695848e-06, + "loss": 0.8793, + "step": 4976 + }, + { + "epoch": 0.27392811932412353, + "grad_norm": 1.0090755224227905, + "learning_rate": 9.559711317439411e-06, + "loss": 0.9255, + "step": 4977 + }, + { + "epoch": 0.27398315812647916, + "grad_norm": 0.7855443358421326, + "learning_rate": 9.559533440913405e-06, + "loss": 0.8001, + "step": 4978 + }, + { + "epoch": 0.27403819692883485, + "grad_norm": 0.768741250038147, + "learning_rate": 9.559355530119165e-06, + "loss": 0.8109, + "step": 4979 + }, + { + "epoch": 0.2740932357311905, + "grad_norm": 0.759589672088623, + "learning_rate": 9.55917758505803e-06, + "loss": 0.8001, + "step": 4980 + }, + { + "epoch": 0.2741482745335462, + "grad_norm": 0.7937445640563965, + "learning_rate": 9.558999605731338e-06, + "loss": 0.8924, + "step": 4981 + }, + { + "epoch": 0.2742033133359018, + "grad_norm": 0.9041592478752136, + "learning_rate": 9.558821592140423e-06, + "loss": 0.9167, + "step": 4982 + }, + { + "epoch": 0.27425835213825744, + "grad_norm": 0.6971380710601807, + "learning_rate": 9.558643544286627e-06, + "loss": 0.7589, + "step": 4983 + }, + { + "epoch": 0.27431339094061313, + "grad_norm": 0.9292929172515869, + "learning_rate": 9.558465462171287e-06, + "loss": 0.9566, + "step": 4984 + }, + { + "epoch": 0.27436842974296877, + "grad_norm": 0.8320629000663757, + "learning_rate": 9.558287345795738e-06, + "loss": 0.8854, + "step": 4985 + }, + { + "epoch": 0.27442346854532446, + "grad_norm": 0.797272801399231, + "learning_rate": 9.558109195161325e-06, + "loss": 0.7838, + "step": 4986 + }, + { + "epoch": 0.2744785073476801, + "grad_norm": 0.9702700972557068, + "learning_rate": 9.557931010269382e-06, + "loss": 0.8593, + "step": 4987 + }, + { + "epoch": 0.2745335461500358, + "grad_norm": 0.8309103846549988, + "learning_rate": 9.557752791121248e-06, + "loss": 0.8902, + "step": 4988 + }, + { + "epoch": 0.2745885849523914, + "grad_norm": 0.706667959690094, + "learning_rate": 9.557574537718265e-06, + "loss": 0.7259, + "step": 4989 + }, + { + "epoch": 0.2746436237547471, + "grad_norm": 0.770239531993866, + "learning_rate": 9.557396250061771e-06, + "loss": 0.8644, + "step": 4990 + }, + { + "epoch": 0.27469866255710274, + "grad_norm": 0.8695803880691528, + "learning_rate": 9.557217928153108e-06, + "loss": 0.895, + "step": 4991 + }, + { + "epoch": 0.2747537013594584, + "grad_norm": 0.7525948286056519, + "learning_rate": 9.557039571993614e-06, + "loss": 0.7029, + "step": 4992 + }, + { + "epoch": 0.27480874016181406, + "grad_norm": 0.7616680264472961, + "learning_rate": 9.556861181584631e-06, + "loss": 0.8025, + "step": 4993 + }, + { + "epoch": 0.27486377896416975, + "grad_norm": 0.7216167449951172, + "learning_rate": 9.5566827569275e-06, + "loss": 0.8314, + "step": 4994 + }, + { + "epoch": 0.2749188177665254, + "grad_norm": 0.7412614226341248, + "learning_rate": 9.55650429802356e-06, + "loss": 0.7877, + "step": 4995 + }, + { + "epoch": 0.27497385656888107, + "grad_norm": 0.7176525592803955, + "learning_rate": 9.556325804874154e-06, + "loss": 0.7615, + "step": 4996 + }, + { + "epoch": 0.2750288953712367, + "grad_norm": 0.7544515132904053, + "learning_rate": 9.556147277480623e-06, + "loss": 0.8352, + "step": 4997 + }, + { + "epoch": 0.2750839341735924, + "grad_norm": 0.7318205833435059, + "learning_rate": 9.555968715844309e-06, + "loss": 0.7403, + "step": 4998 + }, + { + "epoch": 0.275138972975948, + "grad_norm": 0.7495027780532837, + "learning_rate": 9.555790119966552e-06, + "loss": 0.7611, + "step": 4999 + }, + { + "epoch": 0.2751940117783037, + "grad_norm": 0.7544401288032532, + "learning_rate": 9.555611489848697e-06, + "loss": 0.8594, + "step": 5000 + }, + { + "epoch": 0.27524905058065935, + "grad_norm": 0.7698250412940979, + "learning_rate": 9.555432825492084e-06, + "loss": 0.8438, + "step": 5001 + }, + { + "epoch": 0.27530408938301504, + "grad_norm": 0.7668892741203308, + "learning_rate": 9.555254126898059e-06, + "loss": 0.8082, + "step": 5002 + }, + { + "epoch": 0.27535912818537067, + "grad_norm": 0.9170669317245483, + "learning_rate": 9.555075394067963e-06, + "loss": 0.7443, + "step": 5003 + }, + { + "epoch": 0.27541416698772636, + "grad_norm": 0.7890255451202393, + "learning_rate": 9.55489662700314e-06, + "loss": 0.8269, + "step": 5004 + }, + { + "epoch": 0.275469205790082, + "grad_norm": 0.6740512847900391, + "learning_rate": 9.554717825704932e-06, + "loss": 0.6906, + "step": 5005 + }, + { + "epoch": 0.2755242445924377, + "grad_norm": 0.8032376170158386, + "learning_rate": 9.554538990174685e-06, + "loss": 0.812, + "step": 5006 + }, + { + "epoch": 0.2755792833947933, + "grad_norm": 0.6932135224342346, + "learning_rate": 9.554360120413741e-06, + "loss": 0.7823, + "step": 5007 + }, + { + "epoch": 0.275634322197149, + "grad_norm": 0.7447643876075745, + "learning_rate": 9.554181216423447e-06, + "loss": 0.8753, + "step": 5008 + }, + { + "epoch": 0.27568936099950464, + "grad_norm": 0.8035081624984741, + "learning_rate": 9.554002278205145e-06, + "loss": 0.7135, + "step": 5009 + }, + { + "epoch": 0.27574439980186033, + "grad_norm": 0.7544171214103699, + "learning_rate": 9.553823305760182e-06, + "loss": 0.7574, + "step": 5010 + }, + { + "epoch": 0.27579943860421596, + "grad_norm": 0.6648419499397278, + "learning_rate": 9.553644299089902e-06, + "loss": 0.7566, + "step": 5011 + }, + { + "epoch": 0.27585447740657165, + "grad_norm": 0.7481752038002014, + "learning_rate": 9.55346525819565e-06, + "loss": 0.7862, + "step": 5012 + }, + { + "epoch": 0.2759095162089273, + "grad_norm": 0.7000668048858643, + "learning_rate": 9.55328618307877e-06, + "loss": 0.7767, + "step": 5013 + }, + { + "epoch": 0.275964555011283, + "grad_norm": 0.7435166239738464, + "learning_rate": 9.553107073740612e-06, + "loss": 0.6888, + "step": 5014 + }, + { + "epoch": 0.2760195938136386, + "grad_norm": 0.7593170404434204, + "learning_rate": 9.552927930182521e-06, + "loss": 0.7272, + "step": 5015 + }, + { + "epoch": 0.2760746326159943, + "grad_norm": 0.870079755783081, + "learning_rate": 9.55274875240584e-06, + "loss": 0.8692, + "step": 5016 + }, + { + "epoch": 0.27612967141834993, + "grad_norm": 0.8550307750701904, + "learning_rate": 9.55256954041192e-06, + "loss": 0.8729, + "step": 5017 + }, + { + "epoch": 0.2761847102207056, + "grad_norm": 0.888830304145813, + "learning_rate": 9.552390294202105e-06, + "loss": 0.8607, + "step": 5018 + }, + { + "epoch": 0.27623974902306125, + "grad_norm": 0.8295729160308838, + "learning_rate": 9.552211013777743e-06, + "loss": 0.8722, + "step": 5019 + }, + { + "epoch": 0.27629478782541694, + "grad_norm": 0.7732356190681458, + "learning_rate": 9.552031699140182e-06, + "loss": 0.8332, + "step": 5020 + }, + { + "epoch": 0.2763498266277726, + "grad_norm": 0.9132987856864929, + "learning_rate": 9.55185235029077e-06, + "loss": 0.769, + "step": 5021 + }, + { + "epoch": 0.27640486543012827, + "grad_norm": 0.7221076488494873, + "learning_rate": 9.551672967230851e-06, + "loss": 0.8505, + "step": 5022 + }, + { + "epoch": 0.2764599042324839, + "grad_norm": 0.8526949882507324, + "learning_rate": 9.551493549961778e-06, + "loss": 0.8002, + "step": 5023 + }, + { + "epoch": 0.2765149430348396, + "grad_norm": 0.9513188004493713, + "learning_rate": 9.551314098484901e-06, + "loss": 0.8558, + "step": 5024 + }, + { + "epoch": 0.2765699818371952, + "grad_norm": 0.7543003559112549, + "learning_rate": 9.551134612801563e-06, + "loss": 0.8292, + "step": 5025 + }, + { + "epoch": 0.27662502063955086, + "grad_norm": 0.7531017065048218, + "learning_rate": 9.550955092913115e-06, + "loss": 0.7837, + "step": 5026 + }, + { + "epoch": 0.27668005944190655, + "grad_norm": 0.8725717663764954, + "learning_rate": 9.550775538820907e-06, + "loss": 0.8362, + "step": 5027 + }, + { + "epoch": 0.2767350982442622, + "grad_norm": 0.8122721910476685, + "learning_rate": 9.550595950526288e-06, + "loss": 0.8539, + "step": 5028 + }, + { + "epoch": 0.27679013704661787, + "grad_norm": 0.7756829261779785, + "learning_rate": 9.550416328030608e-06, + "loss": 0.787, + "step": 5029 + }, + { + "epoch": 0.2768451758489735, + "grad_norm": 0.9086001515388489, + "learning_rate": 9.550236671335218e-06, + "loss": 0.7972, + "step": 5030 + }, + { + "epoch": 0.2769002146513292, + "grad_norm": 0.7857060432434082, + "learning_rate": 9.550056980441466e-06, + "loss": 0.7577, + "step": 5031 + }, + { + "epoch": 0.2769552534536848, + "grad_norm": 0.8190392851829529, + "learning_rate": 9.549877255350703e-06, + "loss": 0.81, + "step": 5032 + }, + { + "epoch": 0.2770102922560405, + "grad_norm": 0.7714588642120361, + "learning_rate": 9.549697496064283e-06, + "loss": 0.7916, + "step": 5033 + }, + { + "epoch": 0.27706533105839615, + "grad_norm": 0.7178533673286438, + "learning_rate": 9.549517702583552e-06, + "loss": 0.8001, + "step": 5034 + }, + { + "epoch": 0.27712036986075184, + "grad_norm": 0.7552955150604248, + "learning_rate": 9.549337874909865e-06, + "loss": 0.8361, + "step": 5035 + }, + { + "epoch": 0.27717540866310747, + "grad_norm": 0.7823992371559143, + "learning_rate": 9.549158013044573e-06, + "loss": 0.7033, + "step": 5036 + }, + { + "epoch": 0.27723044746546316, + "grad_norm": 0.731504499912262, + "learning_rate": 9.548978116989026e-06, + "loss": 0.73, + "step": 5037 + }, + { + "epoch": 0.2772854862678188, + "grad_norm": 0.7455994486808777, + "learning_rate": 9.548798186744578e-06, + "loss": 0.8005, + "step": 5038 + }, + { + "epoch": 0.2773405250701745, + "grad_norm": 0.7020164728164673, + "learning_rate": 9.54861822231258e-06, + "loss": 0.6707, + "step": 5039 + }, + { + "epoch": 0.2773955638725301, + "grad_norm": 0.7526360750198364, + "learning_rate": 9.548438223694385e-06, + "loss": 0.7686, + "step": 5040 + }, + { + "epoch": 0.2774506026748858, + "grad_norm": 0.7268579006195068, + "learning_rate": 9.548258190891344e-06, + "loss": 0.7039, + "step": 5041 + }, + { + "epoch": 0.27750564147724144, + "grad_norm": 0.9361631274223328, + "learning_rate": 9.548078123904815e-06, + "loss": 0.8023, + "step": 5042 + }, + { + "epoch": 0.2775606802795971, + "grad_norm": 0.7786710262298584, + "learning_rate": 9.547898022736147e-06, + "loss": 0.6866, + "step": 5043 + }, + { + "epoch": 0.27761571908195276, + "grad_norm": 0.7175624370574951, + "learning_rate": 9.547717887386695e-06, + "loss": 0.7554, + "step": 5044 + }, + { + "epoch": 0.27767075788430845, + "grad_norm": 0.9157657623291016, + "learning_rate": 9.547537717857813e-06, + "loss": 0.7936, + "step": 5045 + }, + { + "epoch": 0.2777257966866641, + "grad_norm": 0.7881377935409546, + "learning_rate": 9.547357514150854e-06, + "loss": 0.8198, + "step": 5046 + }, + { + "epoch": 0.2777808354890198, + "grad_norm": 1.0444039106369019, + "learning_rate": 9.547177276267173e-06, + "loss": 0.7954, + "step": 5047 + }, + { + "epoch": 0.2778358742913754, + "grad_norm": 0.7889506220817566, + "learning_rate": 9.546997004208124e-06, + "loss": 0.7697, + "step": 5048 + }, + { + "epoch": 0.2778909130937311, + "grad_norm": 0.7304134368896484, + "learning_rate": 9.546816697975066e-06, + "loss": 0.7034, + "step": 5049 + }, + { + "epoch": 0.27794595189608673, + "grad_norm": 0.7783082723617554, + "learning_rate": 9.546636357569347e-06, + "loss": 0.8185, + "step": 5050 + }, + { + "epoch": 0.2780009906984424, + "grad_norm": 0.750712513923645, + "learning_rate": 9.54645598299233e-06, + "loss": 0.7336, + "step": 5051 + }, + { + "epoch": 0.27805602950079805, + "grad_norm": 0.7849590182304382, + "learning_rate": 9.546275574245364e-06, + "loss": 0.8088, + "step": 5052 + }, + { + "epoch": 0.27811106830315374, + "grad_norm": 0.8490208983421326, + "learning_rate": 9.546095131329809e-06, + "loss": 0.8507, + "step": 5053 + }, + { + "epoch": 0.2781661071055094, + "grad_norm": 0.8107250928878784, + "learning_rate": 9.54591465424702e-06, + "loss": 0.7787, + "step": 5054 + }, + { + "epoch": 0.27822114590786506, + "grad_norm": 0.8278594613075256, + "learning_rate": 9.54573414299835e-06, + "loss": 0.7836, + "step": 5055 + }, + { + "epoch": 0.2782761847102207, + "grad_norm": 0.7982015013694763, + "learning_rate": 9.545553597585163e-06, + "loss": 0.7672, + "step": 5056 + }, + { + "epoch": 0.2783312235125764, + "grad_norm": 0.7311522364616394, + "learning_rate": 9.54537301800881e-06, + "loss": 0.7571, + "step": 5057 + }, + { + "epoch": 0.278386262314932, + "grad_norm": 0.8039999604225159, + "learning_rate": 9.545192404270651e-06, + "loss": 0.764, + "step": 5058 + }, + { + "epoch": 0.2784413011172877, + "grad_norm": 0.7810946702957153, + "learning_rate": 9.545011756372042e-06, + "loss": 0.9217, + "step": 5059 + }, + { + "epoch": 0.27849633991964334, + "grad_norm": 0.7092248797416687, + "learning_rate": 9.544831074314343e-06, + "loss": 0.7599, + "step": 5060 + }, + { + "epoch": 0.27855137872199903, + "grad_norm": 0.831550657749176, + "learning_rate": 9.544650358098908e-06, + "loss": 0.7278, + "step": 5061 + }, + { + "epoch": 0.27860641752435467, + "grad_norm": 0.7645474076271057, + "learning_rate": 9.544469607727098e-06, + "loss": 0.7945, + "step": 5062 + }, + { + "epoch": 0.27866145632671036, + "grad_norm": 0.6956788301467896, + "learning_rate": 9.544288823200273e-06, + "loss": 0.749, + "step": 5063 + }, + { + "epoch": 0.278716495129066, + "grad_norm": 0.7262974381446838, + "learning_rate": 9.544108004519786e-06, + "loss": 0.8074, + "step": 5064 + }, + { + "epoch": 0.2787715339314217, + "grad_norm": 0.7439202666282654, + "learning_rate": 9.543927151687001e-06, + "loss": 0.9403, + "step": 5065 + }, + { + "epoch": 0.2788265727337773, + "grad_norm": 0.8468778133392334, + "learning_rate": 9.543746264703277e-06, + "loss": 0.8182, + "step": 5066 + }, + { + "epoch": 0.278881611536133, + "grad_norm": 0.8396204113960266, + "learning_rate": 9.54356534356997e-06, + "loss": 0.8067, + "step": 5067 + }, + { + "epoch": 0.27893665033848863, + "grad_norm": 0.718758225440979, + "learning_rate": 9.543384388288445e-06, + "loss": 0.8172, + "step": 5068 + }, + { + "epoch": 0.27899168914084427, + "grad_norm": 0.7562685012817383, + "learning_rate": 9.543203398860056e-06, + "loss": 0.9053, + "step": 5069 + }, + { + "epoch": 0.27904672794319996, + "grad_norm": 0.9592792987823486, + "learning_rate": 9.543022375286169e-06, + "loss": 0.9375, + "step": 5070 + }, + { + "epoch": 0.2791017667455556, + "grad_norm": 0.7162739634513855, + "learning_rate": 9.54284131756814e-06, + "loss": 0.7297, + "step": 5071 + }, + { + "epoch": 0.2791568055479113, + "grad_norm": 0.7703517079353333, + "learning_rate": 9.542660225707335e-06, + "loss": 0.8863, + "step": 5072 + }, + { + "epoch": 0.2792118443502669, + "grad_norm": 0.7860418558120728, + "learning_rate": 9.542479099705109e-06, + "loss": 0.8335, + "step": 5073 + }, + { + "epoch": 0.2792668831526226, + "grad_norm": 0.8880825042724609, + "learning_rate": 9.542297939562825e-06, + "loss": 0.8344, + "step": 5074 + }, + { + "epoch": 0.27932192195497824, + "grad_norm": 0.7900505661964417, + "learning_rate": 9.542116745281849e-06, + "loss": 0.7613, + "step": 5075 + }, + { + "epoch": 0.2793769607573339, + "grad_norm": 0.7446081042289734, + "learning_rate": 9.541935516863536e-06, + "loss": 0.6615, + "step": 5076 + }, + { + "epoch": 0.27943199955968956, + "grad_norm": 0.7831308245658875, + "learning_rate": 9.541754254309254e-06, + "loss": 0.779, + "step": 5077 + }, + { + "epoch": 0.27948703836204525, + "grad_norm": 0.9007606506347656, + "learning_rate": 9.541572957620361e-06, + "loss": 0.8883, + "step": 5078 + }, + { + "epoch": 0.2795420771644009, + "grad_norm": 0.8033407330513, + "learning_rate": 9.541391626798222e-06, + "loss": 0.7354, + "step": 5079 + }, + { + "epoch": 0.27959711596675657, + "grad_norm": 0.9259470105171204, + "learning_rate": 9.5412102618442e-06, + "loss": 0.7602, + "step": 5080 + }, + { + "epoch": 0.2796521547691122, + "grad_norm": 0.786523163318634, + "learning_rate": 9.541028862759656e-06, + "loss": 0.7402, + "step": 5081 + }, + { + "epoch": 0.2797071935714679, + "grad_norm": 0.8053372502326965, + "learning_rate": 9.540847429545954e-06, + "loss": 0.825, + "step": 5082 + }, + { + "epoch": 0.2797622323738235, + "grad_norm": 0.8578022122383118, + "learning_rate": 9.54066596220446e-06, + "loss": 0.7866, + "step": 5083 + }, + { + "epoch": 0.2798172711761792, + "grad_norm": 0.916161835193634, + "learning_rate": 9.540484460736535e-06, + "loss": 0.5961, + "step": 5084 + }, + { + "epoch": 0.27987230997853485, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.540302925143545e-06, + "loss": 0.764, + "step": 5085 + }, + { + "epoch": 0.27992734878089054, + "grad_norm": 0.7392510771751404, + "learning_rate": 9.540121355426852e-06, + "loss": 0.8038, + "step": 5086 + }, + { + "epoch": 0.2799823875832462, + "grad_norm": 0.7406296133995056, + "learning_rate": 9.539939751587825e-06, + "loss": 0.8202, + "step": 5087 + }, + { + "epoch": 0.28003742638560186, + "grad_norm": 0.7274924516677856, + "learning_rate": 9.539758113627823e-06, + "loss": 0.7691, + "step": 5088 + }, + { + "epoch": 0.2800924651879575, + "grad_norm": 0.8563184142112732, + "learning_rate": 9.539576441548218e-06, + "loss": 0.8341, + "step": 5089 + }, + { + "epoch": 0.2801475039903132, + "grad_norm": 0.7708351016044617, + "learning_rate": 9.539394735350366e-06, + "loss": 0.7126, + "step": 5090 + }, + { + "epoch": 0.2802025427926688, + "grad_norm": 0.7314836382865906, + "learning_rate": 9.539212995035642e-06, + "loss": 0.7465, + "step": 5091 + }, + { + "epoch": 0.2802575815950245, + "grad_norm": 0.7594754695892334, + "learning_rate": 9.539031220605409e-06, + "loss": 0.7563, + "step": 5092 + }, + { + "epoch": 0.28031262039738014, + "grad_norm": 0.699414074420929, + "learning_rate": 9.53884941206103e-06, + "loss": 0.7847, + "step": 5093 + }, + { + "epoch": 0.28036765919973583, + "grad_norm": 0.8013063073158264, + "learning_rate": 9.538667569403877e-06, + "loss": 0.7769, + "step": 5094 + }, + { + "epoch": 0.28042269800209146, + "grad_norm": 0.7778805494308472, + "learning_rate": 9.538485692635312e-06, + "loss": 0.7646, + "step": 5095 + }, + { + "epoch": 0.28047773680444715, + "grad_norm": 0.785649299621582, + "learning_rate": 9.538303781756702e-06, + "loss": 0.8162, + "step": 5096 + }, + { + "epoch": 0.2805327756068028, + "grad_norm": 0.7073212265968323, + "learning_rate": 9.538121836769417e-06, + "loss": 0.7208, + "step": 5097 + }, + { + "epoch": 0.2805878144091585, + "grad_norm": 0.7545642852783203, + "learning_rate": 9.53793985767482e-06, + "loss": 0.8673, + "step": 5098 + }, + { + "epoch": 0.2806428532115141, + "grad_norm": 0.6818416118621826, + "learning_rate": 9.537757844474285e-06, + "loss": 0.7576, + "step": 5099 + }, + { + "epoch": 0.2806978920138698, + "grad_norm": 0.6718038320541382, + "learning_rate": 9.537575797169176e-06, + "loss": 0.6683, + "step": 5100 + }, + { + "epoch": 0.28075293081622543, + "grad_norm": 0.7851004600524902, + "learning_rate": 9.53739371576086e-06, + "loss": 0.8871, + "step": 5101 + }, + { + "epoch": 0.2808079696185811, + "grad_norm": 0.7565650343894958, + "learning_rate": 9.53721160025071e-06, + "loss": 0.8799, + "step": 5102 + }, + { + "epoch": 0.28086300842093676, + "grad_norm": 0.7522932887077332, + "learning_rate": 9.537029450640091e-06, + "loss": 0.838, + "step": 5103 + }, + { + "epoch": 0.28091804722329244, + "grad_norm": 0.929634690284729, + "learning_rate": 9.536847266930375e-06, + "loss": 0.7997, + "step": 5104 + }, + { + "epoch": 0.2809730860256481, + "grad_norm": 0.8050084710121155, + "learning_rate": 9.536665049122928e-06, + "loss": 0.7652, + "step": 5105 + }, + { + "epoch": 0.28102812482800377, + "grad_norm": 0.7401233315467834, + "learning_rate": 9.53648279721912e-06, + "loss": 0.7904, + "step": 5106 + }, + { + "epoch": 0.2810831636303594, + "grad_norm": 0.7125453948974609, + "learning_rate": 9.536300511220322e-06, + "loss": 0.7349, + "step": 5107 + }, + { + "epoch": 0.2811382024327151, + "grad_norm": 0.7165758609771729, + "learning_rate": 9.536118191127905e-06, + "loss": 0.7314, + "step": 5108 + }, + { + "epoch": 0.2811932412350707, + "grad_norm": 0.7507439851760864, + "learning_rate": 9.535935836943237e-06, + "loss": 0.7603, + "step": 5109 + }, + { + "epoch": 0.2812482800374264, + "grad_norm": 0.7832109332084656, + "learning_rate": 9.535753448667688e-06, + "loss": 0.7279, + "step": 5110 + }, + { + "epoch": 0.28130331883978205, + "grad_norm": 0.7346609234809875, + "learning_rate": 9.535571026302633e-06, + "loss": 0.6882, + "step": 5111 + }, + { + "epoch": 0.2813583576421377, + "grad_norm": 0.7569608688354492, + "learning_rate": 9.535388569849437e-06, + "loss": 0.8451, + "step": 5112 + }, + { + "epoch": 0.28141339644449337, + "grad_norm": 0.7319865822792053, + "learning_rate": 9.535206079309478e-06, + "loss": 0.8161, + "step": 5113 + }, + { + "epoch": 0.281468435246849, + "grad_norm": 0.7744631171226501, + "learning_rate": 9.535023554684122e-06, + "loss": 0.8025, + "step": 5114 + }, + { + "epoch": 0.2815234740492047, + "grad_norm": 0.6867525577545166, + "learning_rate": 9.534840995974743e-06, + "loss": 0.7693, + "step": 5115 + }, + { + "epoch": 0.2815785128515603, + "grad_norm": 0.7625848054885864, + "learning_rate": 9.534658403182715e-06, + "loss": 0.8034, + "step": 5116 + }, + { + "epoch": 0.281633551653916, + "grad_norm": 0.7369832992553711, + "learning_rate": 9.534475776309406e-06, + "loss": 0.873, + "step": 5117 + }, + { + "epoch": 0.28168859045627165, + "grad_norm": 0.7267127633094788, + "learning_rate": 9.534293115356191e-06, + "loss": 0.7954, + "step": 5118 + }, + { + "epoch": 0.28174362925862734, + "grad_norm": 0.7244247794151306, + "learning_rate": 9.534110420324443e-06, + "loss": 0.7784, + "step": 5119 + }, + { + "epoch": 0.28179866806098297, + "grad_norm": 0.8207812905311584, + "learning_rate": 9.533927691215534e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.28185370686333866, + "grad_norm": 0.8669891357421875, + "learning_rate": 9.53374492803084e-06, + "loss": 0.8203, + "step": 5121 + }, + { + "epoch": 0.2819087456656943, + "grad_norm": 0.7650816440582275, + "learning_rate": 9.533562130771732e-06, + "loss": 0.77, + "step": 5122 + }, + { + "epoch": 0.28196378446805, + "grad_norm": 0.7664972543716431, + "learning_rate": 9.533379299439584e-06, + "loss": 0.7187, + "step": 5123 + }, + { + "epoch": 0.2820188232704056, + "grad_norm": 0.7921896576881409, + "learning_rate": 9.533196434035772e-06, + "loss": 0.8669, + "step": 5124 + }, + { + "epoch": 0.2820738620727613, + "grad_norm": 0.7714456915855408, + "learning_rate": 9.533013534561669e-06, + "loss": 0.8783, + "step": 5125 + }, + { + "epoch": 0.28212890087511694, + "grad_norm": 0.7222065329551697, + "learning_rate": 9.532830601018648e-06, + "loss": 0.7449, + "step": 5126 + }, + { + "epoch": 0.28218393967747263, + "grad_norm": 0.718142569065094, + "learning_rate": 9.532647633408085e-06, + "loss": 0.8226, + "step": 5127 + }, + { + "epoch": 0.28223897847982826, + "grad_norm": 0.730592668056488, + "learning_rate": 9.532464631731357e-06, + "loss": 0.7878, + "step": 5128 + }, + { + "epoch": 0.28229401728218395, + "grad_norm": 0.7841802835464478, + "learning_rate": 9.532281595989839e-06, + "loss": 0.8262, + "step": 5129 + }, + { + "epoch": 0.2823490560845396, + "grad_norm": 0.8617212772369385, + "learning_rate": 9.532098526184904e-06, + "loss": 0.8368, + "step": 5130 + }, + { + "epoch": 0.2824040948868953, + "grad_norm": 0.6968556642532349, + "learning_rate": 9.53191542231793e-06, + "loss": 0.6848, + "step": 5131 + }, + { + "epoch": 0.2824591336892509, + "grad_norm": 0.7872157096862793, + "learning_rate": 9.531732284390294e-06, + "loss": 0.7898, + "step": 5132 + }, + { + "epoch": 0.2825141724916066, + "grad_norm": 0.7727276086807251, + "learning_rate": 9.53154911240337e-06, + "loss": 0.8506, + "step": 5133 + }, + { + "epoch": 0.28256921129396223, + "grad_norm": 0.7279896140098572, + "learning_rate": 9.531365906358536e-06, + "loss": 0.7415, + "step": 5134 + }, + { + "epoch": 0.2826242500963179, + "grad_norm": 0.7457457780838013, + "learning_rate": 9.53118266625717e-06, + "loss": 0.7652, + "step": 5135 + }, + { + "epoch": 0.28267928889867355, + "grad_norm": 0.8989270329475403, + "learning_rate": 9.530999392100646e-06, + "loss": 0.9085, + "step": 5136 + }, + { + "epoch": 0.28273432770102924, + "grad_norm": 0.9622626304626465, + "learning_rate": 9.530816083890347e-06, + "loss": 0.8726, + "step": 5137 + }, + { + "epoch": 0.2827893665033849, + "grad_norm": 0.7712846994400024, + "learning_rate": 9.530632741627643e-06, + "loss": 0.765, + "step": 5138 + }, + { + "epoch": 0.28284440530574056, + "grad_norm": 0.8320727348327637, + "learning_rate": 9.530449365313918e-06, + "loss": 0.7828, + "step": 5139 + }, + { + "epoch": 0.2828994441080962, + "grad_norm": 0.9310963153839111, + "learning_rate": 9.530265954950549e-06, + "loss": 0.8482, + "step": 5140 + }, + { + "epoch": 0.2829544829104519, + "grad_norm": 0.9984502792358398, + "learning_rate": 9.530082510538914e-06, + "loss": 0.8673, + "step": 5141 + }, + { + "epoch": 0.2830095217128075, + "grad_norm": 0.8300992250442505, + "learning_rate": 9.52989903208039e-06, + "loss": 0.8232, + "step": 5142 + }, + { + "epoch": 0.2830645605151632, + "grad_norm": 0.930052638053894, + "learning_rate": 9.529715519576356e-06, + "loss": 0.7766, + "step": 5143 + }, + { + "epoch": 0.28311959931751884, + "grad_norm": 0.8038359880447388, + "learning_rate": 9.529531973028194e-06, + "loss": 0.712, + "step": 5144 + }, + { + "epoch": 0.28317463811987453, + "grad_norm": 0.856250524520874, + "learning_rate": 9.529348392437283e-06, + "loss": 0.8578, + "step": 5145 + }, + { + "epoch": 0.28322967692223017, + "grad_norm": 0.7602483630180359, + "learning_rate": 9.529164777805002e-06, + "loss": 0.749, + "step": 5146 + }, + { + "epoch": 0.28328471572458586, + "grad_norm": 0.8946549892425537, + "learning_rate": 9.52898112913273e-06, + "loss": 0.8101, + "step": 5147 + }, + { + "epoch": 0.2833397545269415, + "grad_norm": 0.8015615344047546, + "learning_rate": 9.52879744642185e-06, + "loss": 0.8203, + "step": 5148 + }, + { + "epoch": 0.2833947933292972, + "grad_norm": 0.7767183780670166, + "learning_rate": 9.528613729673738e-06, + "loss": 0.8409, + "step": 5149 + }, + { + "epoch": 0.2834498321316528, + "grad_norm": 0.7604000568389893, + "learning_rate": 9.52842997888978e-06, + "loss": 0.8853, + "step": 5150 + }, + { + "epoch": 0.2835048709340085, + "grad_norm": 0.7079401016235352, + "learning_rate": 9.528246194071353e-06, + "loss": 0.6855, + "step": 5151 + }, + { + "epoch": 0.28355990973636414, + "grad_norm": 0.7616782188415527, + "learning_rate": 9.52806237521984e-06, + "loss": 0.785, + "step": 5152 + }, + { + "epoch": 0.2836149485387198, + "grad_norm": 0.7408583760261536, + "learning_rate": 9.527878522336622e-06, + "loss": 0.7105, + "step": 5153 + }, + { + "epoch": 0.28366998734107546, + "grad_norm": 0.694821834564209, + "learning_rate": 9.52769463542308e-06, + "loss": 0.6552, + "step": 5154 + }, + { + "epoch": 0.2837250261434311, + "grad_norm": 0.796925961971283, + "learning_rate": 9.5275107144806e-06, + "loss": 0.7122, + "step": 5155 + }, + { + "epoch": 0.2837800649457868, + "grad_norm": 0.8001971244812012, + "learning_rate": 9.527326759510558e-06, + "loss": 0.8528, + "step": 5156 + }, + { + "epoch": 0.2838351037481424, + "grad_norm": 0.8605831265449524, + "learning_rate": 9.527142770514341e-06, + "loss": 0.7948, + "step": 5157 + }, + { + "epoch": 0.2838901425504981, + "grad_norm": 0.8380078077316284, + "learning_rate": 9.526958747493334e-06, + "loss": 0.8184, + "step": 5158 + }, + { + "epoch": 0.28394518135285374, + "grad_norm": 0.8758485317230225, + "learning_rate": 9.526774690448913e-06, + "loss": 0.7625, + "step": 5159 + }, + { + "epoch": 0.2840002201552094, + "grad_norm": 0.7078989744186401, + "learning_rate": 9.526590599382466e-06, + "loss": 0.8179, + "step": 5160 + }, + { + "epoch": 0.28405525895756506, + "grad_norm": 0.6668990850448608, + "learning_rate": 9.526406474295376e-06, + "loss": 0.7169, + "step": 5161 + }, + { + "epoch": 0.28411029775992075, + "grad_norm": 0.7666084170341492, + "learning_rate": 9.526222315189026e-06, + "loss": 0.8511, + "step": 5162 + }, + { + "epoch": 0.2841653365622764, + "grad_norm": 0.7390545606613159, + "learning_rate": 9.526038122064802e-06, + "loss": 0.7926, + "step": 5163 + }, + { + "epoch": 0.28422037536463207, + "grad_norm": 0.7972092032432556, + "learning_rate": 9.525853894924086e-06, + "loss": 0.9166, + "step": 5164 + }, + { + "epoch": 0.2842754141669877, + "grad_norm": 0.8988455533981323, + "learning_rate": 9.525669633768265e-06, + "loss": 0.9497, + "step": 5165 + }, + { + "epoch": 0.2843304529693434, + "grad_norm": 0.7092710137367249, + "learning_rate": 9.525485338598722e-06, + "loss": 0.7241, + "step": 5166 + }, + { + "epoch": 0.28438549177169903, + "grad_norm": 0.8630063533782959, + "learning_rate": 9.525301009416843e-06, + "loss": 0.8318, + "step": 5167 + }, + { + "epoch": 0.2844405305740547, + "grad_norm": 0.7336890697479248, + "learning_rate": 9.52511664622401e-06, + "loss": 0.7077, + "step": 5168 + }, + { + "epoch": 0.28449556937641035, + "grad_norm": 0.8156722784042358, + "learning_rate": 9.524932249021615e-06, + "loss": 0.8573, + "step": 5169 + }, + { + "epoch": 0.28455060817876604, + "grad_norm": 0.7061388492584229, + "learning_rate": 9.524747817811038e-06, + "loss": 0.7432, + "step": 5170 + }, + { + "epoch": 0.2846056469811217, + "grad_norm": 0.7948413491249084, + "learning_rate": 9.52456335259367e-06, + "loss": 0.8082, + "step": 5171 + }, + { + "epoch": 0.28466068578347736, + "grad_norm": 0.7208091020584106, + "learning_rate": 9.524378853370893e-06, + "loss": 0.7027, + "step": 5172 + }, + { + "epoch": 0.284715724585833, + "grad_norm": 0.8377540111541748, + "learning_rate": 9.524194320144096e-06, + "loss": 0.7093, + "step": 5173 + }, + { + "epoch": 0.2847707633881887, + "grad_norm": 0.8734563589096069, + "learning_rate": 9.524009752914666e-06, + "loss": 0.8422, + "step": 5174 + }, + { + "epoch": 0.2848258021905443, + "grad_norm": 0.7303940653800964, + "learning_rate": 9.523825151683989e-06, + "loss": 0.811, + "step": 5175 + }, + { + "epoch": 0.2848808409929, + "grad_norm": 0.7653842568397522, + "learning_rate": 9.523640516453455e-06, + "loss": 0.8595, + "step": 5176 + }, + { + "epoch": 0.28493587979525564, + "grad_norm": 0.7366930246353149, + "learning_rate": 9.523455847224448e-06, + "loss": 0.7832, + "step": 5177 + }, + { + "epoch": 0.28499091859761133, + "grad_norm": 0.7908505797386169, + "learning_rate": 9.523271143998357e-06, + "loss": 0.8115, + "step": 5178 + }, + { + "epoch": 0.28504595739996696, + "grad_norm": 0.8176048398017883, + "learning_rate": 9.523086406776572e-06, + "loss": 0.8377, + "step": 5179 + }, + { + "epoch": 0.28510099620232265, + "grad_norm": 0.724086344242096, + "learning_rate": 9.52290163556048e-06, + "loss": 0.7804, + "step": 5180 + }, + { + "epoch": 0.2851560350046783, + "grad_norm": 0.6461299657821655, + "learning_rate": 9.52271683035147e-06, + "loss": 0.5727, + "step": 5181 + }, + { + "epoch": 0.285211073807034, + "grad_norm": 0.7275353074073792, + "learning_rate": 9.522531991150932e-06, + "loss": 0.8345, + "step": 5182 + }, + { + "epoch": 0.2852661126093896, + "grad_norm": 0.7321951985359192, + "learning_rate": 9.522347117960253e-06, + "loss": 0.8832, + "step": 5183 + }, + { + "epoch": 0.2853211514117453, + "grad_norm": 0.7526552677154541, + "learning_rate": 9.522162210780825e-06, + "loss": 0.831, + "step": 5184 + }, + { + "epoch": 0.28537619021410093, + "grad_norm": 0.7592381238937378, + "learning_rate": 9.521977269614036e-06, + "loss": 0.7293, + "step": 5185 + }, + { + "epoch": 0.2854312290164566, + "grad_norm": 0.8060448169708252, + "learning_rate": 9.521792294461274e-06, + "loss": 0.819, + "step": 5186 + }, + { + "epoch": 0.28548626781881226, + "grad_norm": 0.7178553342819214, + "learning_rate": 9.521607285323932e-06, + "loss": 0.7526, + "step": 5187 + }, + { + "epoch": 0.28554130662116795, + "grad_norm": 0.8186969757080078, + "learning_rate": 9.521422242203401e-06, + "loss": 0.8526, + "step": 5188 + }, + { + "epoch": 0.2855963454235236, + "grad_norm": 0.8480883240699768, + "learning_rate": 9.521237165101071e-06, + "loss": 0.8088, + "step": 5189 + }, + { + "epoch": 0.28565138422587927, + "grad_norm": 0.8053719401359558, + "learning_rate": 9.521052054018333e-06, + "loss": 0.928, + "step": 5190 + }, + { + "epoch": 0.2857064230282349, + "grad_norm": 0.6937163472175598, + "learning_rate": 9.52086690895658e-06, + "loss": 0.7418, + "step": 5191 + }, + { + "epoch": 0.2857614618305906, + "grad_norm": 1.0616179704666138, + "learning_rate": 9.520681729917196e-06, + "loss": 0.8726, + "step": 5192 + }, + { + "epoch": 0.2858165006329462, + "grad_norm": 0.7504106163978577, + "learning_rate": 9.520496516901582e-06, + "loss": 0.844, + "step": 5193 + }, + { + "epoch": 0.2858715394353019, + "grad_norm": 0.7634509205818176, + "learning_rate": 9.520311269911127e-06, + "loss": 0.7595, + "step": 5194 + }, + { + "epoch": 0.28592657823765755, + "grad_norm": 0.7069799900054932, + "learning_rate": 9.52012598894722e-06, + "loss": 0.7566, + "step": 5195 + }, + { + "epoch": 0.28598161704001324, + "grad_norm": 0.695737361907959, + "learning_rate": 9.519940674011256e-06, + "loss": 0.7534, + "step": 5196 + }, + { + "epoch": 0.28603665584236887, + "grad_norm": 0.7212124466896057, + "learning_rate": 9.51975532510463e-06, + "loss": 0.8237, + "step": 5197 + }, + { + "epoch": 0.2860916946447245, + "grad_norm": 0.7274062633514404, + "learning_rate": 9.519569942228732e-06, + "loss": 0.756, + "step": 5198 + }, + { + "epoch": 0.2861467334470802, + "grad_norm": 0.7038697600364685, + "learning_rate": 9.519384525384956e-06, + "loss": 0.7308, + "step": 5199 + }, + { + "epoch": 0.2862017722494358, + "grad_norm": 0.6897109150886536, + "learning_rate": 9.519199074574694e-06, + "loss": 0.7858, + "step": 5200 + }, + { + "epoch": 0.2862568110517915, + "grad_norm": 0.8471527099609375, + "learning_rate": 9.519013589799343e-06, + "loss": 0.8198, + "step": 5201 + }, + { + "epoch": 0.28631184985414715, + "grad_norm": 0.6828129291534424, + "learning_rate": 9.518828071060295e-06, + "loss": 0.7734, + "step": 5202 + }, + { + "epoch": 0.28636688865650284, + "grad_norm": 0.7437755465507507, + "learning_rate": 9.518642518358946e-06, + "loss": 0.7669, + "step": 5203 + }, + { + "epoch": 0.28642192745885847, + "grad_norm": 0.8841923475265503, + "learning_rate": 9.518456931696689e-06, + "loss": 0.8201, + "step": 5204 + }, + { + "epoch": 0.28647696626121416, + "grad_norm": 0.9514154195785522, + "learning_rate": 9.518271311074917e-06, + "loss": 0.7864, + "step": 5205 + }, + { + "epoch": 0.2865320050635698, + "grad_norm": 0.830795407295227, + "learning_rate": 9.51808565649503e-06, + "loss": 0.8024, + "step": 5206 + }, + { + "epoch": 0.2865870438659255, + "grad_norm": 0.7274934649467468, + "learning_rate": 9.51789996795842e-06, + "loss": 0.7631, + "step": 5207 + }, + { + "epoch": 0.2866420826682811, + "grad_norm": 0.7004290223121643, + "learning_rate": 9.517714245466482e-06, + "loss": 0.7344, + "step": 5208 + }, + { + "epoch": 0.2866971214706368, + "grad_norm": 0.8559010624885559, + "learning_rate": 9.517528489020614e-06, + "loss": 0.7502, + "step": 5209 + }, + { + "epoch": 0.28675216027299244, + "grad_norm": 0.8913494348526001, + "learning_rate": 9.517342698622212e-06, + "loss": 0.8908, + "step": 5210 + }, + { + "epoch": 0.28680719907534813, + "grad_norm": 0.8375207781791687, + "learning_rate": 9.51715687427267e-06, + "loss": 0.7701, + "step": 5211 + }, + { + "epoch": 0.28686223787770376, + "grad_norm": 1.1804776191711426, + "learning_rate": 9.516971015973386e-06, + "loss": 0.8449, + "step": 5212 + }, + { + "epoch": 0.28691727668005945, + "grad_norm": 0.7260473370552063, + "learning_rate": 9.516785123725758e-06, + "loss": 0.7978, + "step": 5213 + }, + { + "epoch": 0.2869723154824151, + "grad_norm": 0.8159041404724121, + "learning_rate": 9.516599197531182e-06, + "loss": 0.7454, + "step": 5214 + }, + { + "epoch": 0.2870273542847708, + "grad_norm": 0.7850227952003479, + "learning_rate": 9.516413237391056e-06, + "loss": 0.8082, + "step": 5215 + }, + { + "epoch": 0.2870823930871264, + "grad_norm": 0.7596960067749023, + "learning_rate": 9.516227243306774e-06, + "loss": 0.7286, + "step": 5216 + }, + { + "epoch": 0.2871374318894821, + "grad_norm": 0.8763321042060852, + "learning_rate": 9.516041215279741e-06, + "loss": 0.8685, + "step": 5217 + }, + { + "epoch": 0.28719247069183773, + "grad_norm": 1.2130110263824463, + "learning_rate": 9.515855153311349e-06, + "loss": 0.8374, + "step": 5218 + }, + { + "epoch": 0.2872475094941934, + "grad_norm": 0.7578628063201904, + "learning_rate": 9.515669057402999e-06, + "loss": 0.793, + "step": 5219 + }, + { + "epoch": 0.28730254829654905, + "grad_norm": 0.9085225462913513, + "learning_rate": 9.515482927556088e-06, + "loss": 0.8366, + "step": 5220 + }, + { + "epoch": 0.28735758709890474, + "grad_norm": 0.7107900977134705, + "learning_rate": 9.515296763772017e-06, + "loss": 0.6571, + "step": 5221 + }, + { + "epoch": 0.2874126259012604, + "grad_norm": 0.7742018699645996, + "learning_rate": 9.515110566052183e-06, + "loss": 0.8387, + "step": 5222 + }, + { + "epoch": 0.28746766470361607, + "grad_norm": 0.8934319615364075, + "learning_rate": 9.514924334397987e-06, + "loss": 0.8546, + "step": 5223 + }, + { + "epoch": 0.2875227035059717, + "grad_norm": 0.720245897769928, + "learning_rate": 9.51473806881083e-06, + "loss": 0.7459, + "step": 5224 + }, + { + "epoch": 0.2875777423083274, + "grad_norm": 0.7074370384216309, + "learning_rate": 9.514551769292109e-06, + "loss": 0.8598, + "step": 5225 + }, + { + "epoch": 0.287632781110683, + "grad_norm": 0.7608621120452881, + "learning_rate": 9.514365435843226e-06, + "loss": 0.7263, + "step": 5226 + }, + { + "epoch": 0.2876878199130387, + "grad_norm": 0.7581011652946472, + "learning_rate": 9.51417906846558e-06, + "loss": 0.7498, + "step": 5227 + }, + { + "epoch": 0.28774285871539435, + "grad_norm": 0.8184412121772766, + "learning_rate": 9.513992667160572e-06, + "loss": 0.6889, + "step": 5228 + }, + { + "epoch": 0.28779789751775003, + "grad_norm": 0.6835145354270935, + "learning_rate": 9.513806231929605e-06, + "loss": 0.7399, + "step": 5229 + }, + { + "epoch": 0.28785293632010567, + "grad_norm": 0.7601536512374878, + "learning_rate": 9.513619762774077e-06, + "loss": 0.846, + "step": 5230 + }, + { + "epoch": 0.28790797512246136, + "grad_norm": 0.781491219997406, + "learning_rate": 9.513433259695392e-06, + "loss": 0.8326, + "step": 5231 + }, + { + "epoch": 0.287963013924817, + "grad_norm": 0.7978106141090393, + "learning_rate": 9.513246722694951e-06, + "loss": 0.7917, + "step": 5232 + }, + { + "epoch": 0.2880180527271727, + "grad_norm": 0.8071381449699402, + "learning_rate": 9.513060151774156e-06, + "loss": 0.8054, + "step": 5233 + }, + { + "epoch": 0.2880730915295283, + "grad_norm": 0.815567135810852, + "learning_rate": 9.512873546934406e-06, + "loss": 0.8647, + "step": 5234 + }, + { + "epoch": 0.288128130331884, + "grad_norm": 0.8255048990249634, + "learning_rate": 9.512686908177111e-06, + "loss": 0.9011, + "step": 5235 + }, + { + "epoch": 0.28818316913423964, + "grad_norm": 0.8392062187194824, + "learning_rate": 9.512500235503666e-06, + "loss": 0.8778, + "step": 5236 + }, + { + "epoch": 0.2882382079365953, + "grad_norm": 0.7256191372871399, + "learning_rate": 9.512313528915478e-06, + "loss": 0.7231, + "step": 5237 + }, + { + "epoch": 0.28829324673895096, + "grad_norm": 0.9041032195091248, + "learning_rate": 9.51212678841395e-06, + "loss": 0.8469, + "step": 5238 + }, + { + "epoch": 0.28834828554130665, + "grad_norm": 0.7857525944709778, + "learning_rate": 9.511940014000485e-06, + "loss": 0.7447, + "step": 5239 + }, + { + "epoch": 0.2884033243436623, + "grad_norm": 0.6925225257873535, + "learning_rate": 9.511753205676485e-06, + "loss": 0.8302, + "step": 5240 + }, + { + "epoch": 0.2884583631460179, + "grad_norm": 0.7253623008728027, + "learning_rate": 9.511566363443356e-06, + "loss": 0.8373, + "step": 5241 + }, + { + "epoch": 0.2885134019483736, + "grad_norm": 0.7198607921600342, + "learning_rate": 9.511379487302504e-06, + "loss": 0.79, + "step": 5242 + }, + { + "epoch": 0.28856844075072924, + "grad_norm": 0.7966421246528625, + "learning_rate": 9.511192577255328e-06, + "loss": 0.7933, + "step": 5243 + }, + { + "epoch": 0.2886234795530849, + "grad_norm": 0.9159359931945801, + "learning_rate": 9.511005633303239e-06, + "loss": 0.7254, + "step": 5244 + }, + { + "epoch": 0.28867851835544056, + "grad_norm": 0.9514481425285339, + "learning_rate": 9.510818655447638e-06, + "loss": 0.8916, + "step": 5245 + }, + { + "epoch": 0.28873355715779625, + "grad_norm": 0.7505099773406982, + "learning_rate": 9.510631643689932e-06, + "loss": 0.765, + "step": 5246 + }, + { + "epoch": 0.2887885959601519, + "grad_norm": 0.7824658751487732, + "learning_rate": 9.510444598031526e-06, + "loss": 0.6972, + "step": 5247 + }, + { + "epoch": 0.2888436347625076, + "grad_norm": 0.7778681516647339, + "learning_rate": 9.510257518473824e-06, + "loss": 0.8705, + "step": 5248 + }, + { + "epoch": 0.2888986735648632, + "grad_norm": 0.6785199642181396, + "learning_rate": 9.510070405018235e-06, + "loss": 0.6889, + "step": 5249 + }, + { + "epoch": 0.2889537123672189, + "grad_norm": 0.7045316100120544, + "learning_rate": 9.509883257666164e-06, + "loss": 0.7979, + "step": 5250 + }, + { + "epoch": 0.28900875116957453, + "grad_norm": 1.3174562454223633, + "learning_rate": 9.509696076419018e-06, + "loss": 0.8802, + "step": 5251 + }, + { + "epoch": 0.2890637899719302, + "grad_norm": 1.1800767183303833, + "learning_rate": 9.509508861278205e-06, + "loss": 0.9246, + "step": 5252 + }, + { + "epoch": 0.28911882877428585, + "grad_norm": 0.7057580947875977, + "learning_rate": 9.509321612245128e-06, + "loss": 0.7565, + "step": 5253 + }, + { + "epoch": 0.28917386757664154, + "grad_norm": 0.7681905031204224, + "learning_rate": 9.509134329321197e-06, + "loss": 0.8678, + "step": 5254 + }, + { + "epoch": 0.2892289063789972, + "grad_norm": 0.96025550365448, + "learning_rate": 9.50894701250782e-06, + "loss": 0.9108, + "step": 5255 + }, + { + "epoch": 0.28928394518135286, + "grad_norm": 0.7786841988563538, + "learning_rate": 9.508759661806405e-06, + "loss": 0.7747, + "step": 5256 + }, + { + "epoch": 0.2893389839837085, + "grad_norm": 0.7073540091514587, + "learning_rate": 9.508572277218358e-06, + "loss": 0.7573, + "step": 5257 + }, + { + "epoch": 0.2893940227860642, + "grad_norm": 0.6648856401443481, + "learning_rate": 9.50838485874509e-06, + "loss": 0.7294, + "step": 5258 + }, + { + "epoch": 0.2894490615884198, + "grad_norm": 0.6794270873069763, + "learning_rate": 9.508197406388007e-06, + "loss": 0.7001, + "step": 5259 + }, + { + "epoch": 0.2895041003907755, + "grad_norm": 0.6819350123405457, + "learning_rate": 9.50800992014852e-06, + "loss": 0.7114, + "step": 5260 + }, + { + "epoch": 0.28955913919313114, + "grad_norm": 0.6616997122764587, + "learning_rate": 9.507822400028036e-06, + "loss": 0.7108, + "step": 5261 + }, + { + "epoch": 0.28961417799548683, + "grad_norm": 0.7447230219841003, + "learning_rate": 9.507634846027966e-06, + "loss": 0.7865, + "step": 5262 + }, + { + "epoch": 0.28966921679784247, + "grad_norm": 0.7826278209686279, + "learning_rate": 9.50744725814972e-06, + "loss": 0.7922, + "step": 5263 + }, + { + "epoch": 0.28972425560019816, + "grad_norm": 0.8054459095001221, + "learning_rate": 9.507259636394706e-06, + "loss": 0.795, + "step": 5264 + }, + { + "epoch": 0.2897792944025538, + "grad_norm": 0.9539191722869873, + "learning_rate": 9.507071980764335e-06, + "loss": 0.9495, + "step": 5265 + }, + { + "epoch": 0.2898343332049095, + "grad_norm": 0.8877993226051331, + "learning_rate": 9.506884291260017e-06, + "loss": 0.8418, + "step": 5266 + }, + { + "epoch": 0.2898893720072651, + "grad_norm": 0.6620327234268188, + "learning_rate": 9.506696567883164e-06, + "loss": 0.6285, + "step": 5267 + }, + { + "epoch": 0.2899444108096208, + "grad_norm": 0.7604434490203857, + "learning_rate": 9.506508810635187e-06, + "loss": 0.8562, + "step": 5268 + }, + { + "epoch": 0.28999944961197643, + "grad_norm": 0.8181812763214111, + "learning_rate": 9.506321019517494e-06, + "loss": 0.905, + "step": 5269 + }, + { + "epoch": 0.2900544884143321, + "grad_norm": 0.7776391506195068, + "learning_rate": 9.5061331945315e-06, + "loss": 0.8871, + "step": 5270 + }, + { + "epoch": 0.29010952721668776, + "grad_norm": 0.8125039339065552, + "learning_rate": 9.505945335678613e-06, + "loss": 0.7254, + "step": 5271 + }, + { + "epoch": 0.29016456601904345, + "grad_norm": 0.7229846715927124, + "learning_rate": 9.50575744296025e-06, + "loss": 0.8192, + "step": 5272 + }, + { + "epoch": 0.2902196048213991, + "grad_norm": 0.72443026304245, + "learning_rate": 9.505569516377817e-06, + "loss": 0.7813, + "step": 5273 + }, + { + "epoch": 0.29027464362375477, + "grad_norm": 0.6798073053359985, + "learning_rate": 9.505381555932731e-06, + "loss": 0.7655, + "step": 5274 + }, + { + "epoch": 0.2903296824261104, + "grad_norm": 1.0805624723434448, + "learning_rate": 9.505193561626404e-06, + "loss": 0.9035, + "step": 5275 + }, + { + "epoch": 0.2903847212284661, + "grad_norm": 0.7579694986343384, + "learning_rate": 9.505005533460247e-06, + "loss": 0.8612, + "step": 5276 + }, + { + "epoch": 0.2904397600308217, + "grad_norm": 1.2496099472045898, + "learning_rate": 9.504817471435676e-06, + "loss": 0.813, + "step": 5277 + }, + { + "epoch": 0.2904947988331774, + "grad_norm": 0.6915673017501831, + "learning_rate": 9.504629375554102e-06, + "loss": 0.6891, + "step": 5278 + }, + { + "epoch": 0.29054983763553305, + "grad_norm": 0.8581767082214355, + "learning_rate": 9.504441245816937e-06, + "loss": 0.7137, + "step": 5279 + }, + { + "epoch": 0.29060487643788874, + "grad_norm": 0.7469545006752014, + "learning_rate": 9.504253082225601e-06, + "loss": 0.7621, + "step": 5280 + }, + { + "epoch": 0.29065991524024437, + "grad_norm": 0.7725615501403809, + "learning_rate": 9.504064884781503e-06, + "loss": 0.7988, + "step": 5281 + }, + { + "epoch": 0.29071495404260006, + "grad_norm": 1.0187722444534302, + "learning_rate": 9.503876653486058e-06, + "loss": 0.7772, + "step": 5282 + }, + { + "epoch": 0.2907699928449557, + "grad_norm": 0.675574779510498, + "learning_rate": 9.503688388340683e-06, + "loss": 0.7096, + "step": 5283 + }, + { + "epoch": 0.2908250316473113, + "grad_norm": 0.7980207800865173, + "learning_rate": 9.503500089346792e-06, + "loss": 0.8291, + "step": 5284 + }, + { + "epoch": 0.290880070449667, + "grad_norm": 0.6891655325889587, + "learning_rate": 9.503311756505797e-06, + "loss": 0.7186, + "step": 5285 + }, + { + "epoch": 0.29093510925202265, + "grad_norm": 0.7273408770561218, + "learning_rate": 9.50312338981912e-06, + "loss": 0.7483, + "step": 5286 + }, + { + "epoch": 0.29099014805437834, + "grad_norm": 0.7346869111061096, + "learning_rate": 9.50293498928817e-06, + "loss": 0.766, + "step": 5287 + }, + { + "epoch": 0.291045186856734, + "grad_norm": 0.7627394795417786, + "learning_rate": 9.502746554914368e-06, + "loss": 0.867, + "step": 5288 + }, + { + "epoch": 0.29110022565908966, + "grad_norm": 0.8477200865745544, + "learning_rate": 9.502558086699128e-06, + "loss": 0.8317, + "step": 5289 + }, + { + "epoch": 0.2911552644614453, + "grad_norm": 0.7696006894111633, + "learning_rate": 9.502369584643867e-06, + "loss": 0.7814, + "step": 5290 + }, + { + "epoch": 0.291210303263801, + "grad_norm": 0.7614455819129944, + "learning_rate": 9.502181048749999e-06, + "loss": 0.7398, + "step": 5291 + }, + { + "epoch": 0.2912653420661566, + "grad_norm": 0.7877628207206726, + "learning_rate": 9.501992479018946e-06, + "loss": 0.8731, + "step": 5292 + }, + { + "epoch": 0.2913203808685123, + "grad_norm": 0.7455846667289734, + "learning_rate": 9.50180387545212e-06, + "loss": 0.7059, + "step": 5293 + }, + { + "epoch": 0.29137541967086794, + "grad_norm": 1.145520567893982, + "learning_rate": 9.501615238050944e-06, + "loss": 0.6968, + "step": 5294 + }, + { + "epoch": 0.29143045847322363, + "grad_norm": 0.8100234866142273, + "learning_rate": 9.501426566816831e-06, + "loss": 0.8122, + "step": 5295 + }, + { + "epoch": 0.29148549727557926, + "grad_norm": 0.6813066005706787, + "learning_rate": 9.501237861751203e-06, + "loss": 0.6718, + "step": 5296 + }, + { + "epoch": 0.29154053607793495, + "grad_norm": 0.7400195002555847, + "learning_rate": 9.501049122855473e-06, + "loss": 0.802, + "step": 5297 + }, + { + "epoch": 0.2915955748802906, + "grad_norm": 0.7948681712150574, + "learning_rate": 9.500860350131065e-06, + "loss": 0.8237, + "step": 5298 + }, + { + "epoch": 0.2916506136826463, + "grad_norm": 0.772093653678894, + "learning_rate": 9.500671543579394e-06, + "loss": 0.7687, + "step": 5299 + }, + { + "epoch": 0.2917056524850019, + "grad_norm": 0.7468486428260803, + "learning_rate": 9.500482703201881e-06, + "loss": 0.7827, + "step": 5300 + }, + { + "epoch": 0.2917606912873576, + "grad_norm": 0.7284440398216248, + "learning_rate": 9.500293828999945e-06, + "loss": 0.8086, + "step": 5301 + }, + { + "epoch": 0.29181573008971323, + "grad_norm": 0.8014211654663086, + "learning_rate": 9.500104920975005e-06, + "loss": 0.8409, + "step": 5302 + }, + { + "epoch": 0.2918707688920689, + "grad_norm": 0.7588346004486084, + "learning_rate": 9.49991597912848e-06, + "loss": 0.7149, + "step": 5303 + }, + { + "epoch": 0.29192580769442456, + "grad_norm": 0.8098518252372742, + "learning_rate": 9.499727003461794e-06, + "loss": 0.8375, + "step": 5304 + }, + { + "epoch": 0.29198084649678024, + "grad_norm": 0.8502426743507385, + "learning_rate": 9.499537993976363e-06, + "loss": 0.8177, + "step": 5305 + }, + { + "epoch": 0.2920358852991359, + "grad_norm": 0.8010903596878052, + "learning_rate": 9.499348950673607e-06, + "loss": 0.8457, + "step": 5306 + }, + { + "epoch": 0.29209092410149157, + "grad_norm": 0.6628156304359436, + "learning_rate": 9.49915987355495e-06, + "loss": 0.7327, + "step": 5307 + }, + { + "epoch": 0.2921459629038472, + "grad_norm": 0.7414939999580383, + "learning_rate": 9.49897076262181e-06, + "loss": 0.8271, + "step": 5308 + }, + { + "epoch": 0.2922010017062029, + "grad_norm": 0.7490847706794739, + "learning_rate": 9.498781617875613e-06, + "loss": 0.7689, + "step": 5309 + }, + { + "epoch": 0.2922560405085585, + "grad_norm": 0.7913424968719482, + "learning_rate": 9.498592439317777e-06, + "loss": 0.8571, + "step": 5310 + }, + { + "epoch": 0.2923110793109142, + "grad_norm": 0.6903867125511169, + "learning_rate": 9.498403226949724e-06, + "loss": 0.7325, + "step": 5311 + }, + { + "epoch": 0.29236611811326985, + "grad_norm": 0.8087130188941956, + "learning_rate": 9.498213980772875e-06, + "loss": 0.8167, + "step": 5312 + }, + { + "epoch": 0.29242115691562554, + "grad_norm": 1.1316752433776855, + "learning_rate": 9.498024700788655e-06, + "loss": 0.912, + "step": 5313 + }, + { + "epoch": 0.29247619571798117, + "grad_norm": 0.8701719045639038, + "learning_rate": 9.497835386998486e-06, + "loss": 0.8728, + "step": 5314 + }, + { + "epoch": 0.29253123452033686, + "grad_norm": 0.6688953638076782, + "learning_rate": 9.49764603940379e-06, + "loss": 0.6561, + "step": 5315 + }, + { + "epoch": 0.2925862733226925, + "grad_norm": 0.8067505359649658, + "learning_rate": 9.49745665800599e-06, + "loss": 0.8419, + "step": 5316 + }, + { + "epoch": 0.2926413121250482, + "grad_norm": 0.7157390117645264, + "learning_rate": 9.49726724280651e-06, + "loss": 0.7964, + "step": 5317 + }, + { + "epoch": 0.2926963509274038, + "grad_norm": 0.7038627862930298, + "learning_rate": 9.497077793806772e-06, + "loss": 0.7343, + "step": 5318 + }, + { + "epoch": 0.2927513897297595, + "grad_norm": 0.7674478888511658, + "learning_rate": 9.4968883110082e-06, + "loss": 0.7624, + "step": 5319 + }, + { + "epoch": 0.29280642853211514, + "grad_norm": 0.6708847284317017, + "learning_rate": 9.496698794412223e-06, + "loss": 0.6554, + "step": 5320 + }, + { + "epoch": 0.2928614673344708, + "grad_norm": 0.8332329392433167, + "learning_rate": 9.49650924402026e-06, + "loss": 0.9357, + "step": 5321 + }, + { + "epoch": 0.29291650613682646, + "grad_norm": 0.7601341605186462, + "learning_rate": 9.496319659833737e-06, + "loss": 0.8208, + "step": 5322 + }, + { + "epoch": 0.29297154493918215, + "grad_norm": 0.8320396542549133, + "learning_rate": 9.496130041854077e-06, + "loss": 0.8423, + "step": 5323 + }, + { + "epoch": 0.2930265837415378, + "grad_norm": 0.8242839574813843, + "learning_rate": 9.49594039008271e-06, + "loss": 0.9101, + "step": 5324 + }, + { + "epoch": 0.29308162254389347, + "grad_norm": 0.8906320333480835, + "learning_rate": 9.495750704521058e-06, + "loss": 0.7343, + "step": 5325 + }, + { + "epoch": 0.2931366613462491, + "grad_norm": 0.7964318990707397, + "learning_rate": 9.495560985170546e-06, + "loss": 0.7789, + "step": 5326 + }, + { + "epoch": 0.29319170014860474, + "grad_norm": 0.8267771601676941, + "learning_rate": 9.495371232032602e-06, + "loss": 0.7447, + "step": 5327 + }, + { + "epoch": 0.29324673895096043, + "grad_norm": 0.8120046257972717, + "learning_rate": 9.49518144510865e-06, + "loss": 0.7803, + "step": 5328 + }, + { + "epoch": 0.29330177775331606, + "grad_norm": 0.7314801812171936, + "learning_rate": 9.494991624400119e-06, + "loss": 0.6758, + "step": 5329 + }, + { + "epoch": 0.29335681655567175, + "grad_norm": 0.6989930272102356, + "learning_rate": 9.494801769908433e-06, + "loss": 0.7945, + "step": 5330 + }, + { + "epoch": 0.2934118553580274, + "grad_norm": 0.7804785966873169, + "learning_rate": 9.494611881635021e-06, + "loss": 0.7977, + "step": 5331 + }, + { + "epoch": 0.2934668941603831, + "grad_norm": 0.8377045392990112, + "learning_rate": 9.494421959581308e-06, + "loss": 0.8077, + "step": 5332 + }, + { + "epoch": 0.2935219329627387, + "grad_norm": 0.7463418245315552, + "learning_rate": 9.494232003748724e-06, + "loss": 0.783, + "step": 5333 + }, + { + "epoch": 0.2935769717650944, + "grad_norm": 0.7598912715911865, + "learning_rate": 9.494042014138695e-06, + "loss": 0.7869, + "step": 5334 + }, + { + "epoch": 0.29363201056745003, + "grad_norm": 0.7634113430976868, + "learning_rate": 9.493851990752648e-06, + "loss": 0.8108, + "step": 5335 + }, + { + "epoch": 0.2936870493698057, + "grad_norm": 0.8056474328041077, + "learning_rate": 9.493661933592013e-06, + "loss": 0.7921, + "step": 5336 + }, + { + "epoch": 0.29374208817216135, + "grad_norm": 0.8699371218681335, + "learning_rate": 9.493471842658219e-06, + "loss": 0.8833, + "step": 5337 + }, + { + "epoch": 0.29379712697451704, + "grad_norm": 0.8803261518478394, + "learning_rate": 9.493281717952691e-06, + "loss": 0.7848, + "step": 5338 + }, + { + "epoch": 0.2938521657768727, + "grad_norm": 0.7678453922271729, + "learning_rate": 9.493091559476864e-06, + "loss": 0.836, + "step": 5339 + }, + { + "epoch": 0.29390720457922836, + "grad_norm": 0.7653701305389404, + "learning_rate": 9.49290136723216e-06, + "loss": 0.8215, + "step": 5340 + }, + { + "epoch": 0.293962243381584, + "grad_norm": 0.768120527267456, + "learning_rate": 9.492711141220013e-06, + "loss": 0.7498, + "step": 5341 + }, + { + "epoch": 0.2940172821839397, + "grad_norm": 0.7665749788284302, + "learning_rate": 9.492520881441854e-06, + "loss": 0.7883, + "step": 5342 + }, + { + "epoch": 0.2940723209862953, + "grad_norm": 0.7405015230178833, + "learning_rate": 9.492330587899108e-06, + "loss": 0.8112, + "step": 5343 + }, + { + "epoch": 0.294127359788651, + "grad_norm": 0.7183459997177124, + "learning_rate": 9.492140260593208e-06, + "loss": 0.8227, + "step": 5344 + }, + { + "epoch": 0.29418239859100664, + "grad_norm": 0.7453572154045105, + "learning_rate": 9.491949899525585e-06, + "loss": 0.8148, + "step": 5345 + }, + { + "epoch": 0.29423743739336233, + "grad_norm": 0.8963750600814819, + "learning_rate": 9.491759504697669e-06, + "loss": 0.9261, + "step": 5346 + }, + { + "epoch": 0.29429247619571797, + "grad_norm": 0.7631667256355286, + "learning_rate": 9.49156907611089e-06, + "loss": 0.7708, + "step": 5347 + }, + { + "epoch": 0.29434751499807366, + "grad_norm": 0.6324381232261658, + "learning_rate": 9.49137861376668e-06, + "loss": 0.6688, + "step": 5348 + }, + { + "epoch": 0.2944025538004293, + "grad_norm": 0.6969807147979736, + "learning_rate": 9.491188117666472e-06, + "loss": 0.7516, + "step": 5349 + }, + { + "epoch": 0.294457592602785, + "grad_norm": 1.633340835571289, + "learning_rate": 9.490997587811697e-06, + "loss": 0.8111, + "step": 5350 + }, + { + "epoch": 0.2945126314051406, + "grad_norm": 0.7084371447563171, + "learning_rate": 9.490807024203785e-06, + "loss": 0.8375, + "step": 5351 + }, + { + "epoch": 0.2945676702074963, + "grad_norm": 0.7335958480834961, + "learning_rate": 9.490616426844169e-06, + "loss": 0.7884, + "step": 5352 + }, + { + "epoch": 0.29462270900985194, + "grad_norm": 0.7560276985168457, + "learning_rate": 9.490425795734282e-06, + "loss": 0.8918, + "step": 5353 + }, + { + "epoch": 0.2946777478122076, + "grad_norm": 0.9185894727706909, + "learning_rate": 9.490235130875557e-06, + "loss": 0.7976, + "step": 5354 + }, + { + "epoch": 0.29473278661456326, + "grad_norm": 0.7871553897857666, + "learning_rate": 9.490044432269427e-06, + "loss": 0.8564, + "step": 5355 + }, + { + "epoch": 0.29478782541691895, + "grad_norm": 0.8736812472343445, + "learning_rate": 9.489853699917326e-06, + "loss": 0.8114, + "step": 5356 + }, + { + "epoch": 0.2948428642192746, + "grad_norm": 0.8068968653678894, + "learning_rate": 9.489662933820684e-06, + "loss": 0.9198, + "step": 5357 + }, + { + "epoch": 0.29489790302163027, + "grad_norm": 0.7816325426101685, + "learning_rate": 9.489472133980939e-06, + "loss": 0.8012, + "step": 5358 + }, + { + "epoch": 0.2949529418239859, + "grad_norm": 0.7248200178146362, + "learning_rate": 9.489281300399522e-06, + "loss": 0.8099, + "step": 5359 + }, + { + "epoch": 0.2950079806263416, + "grad_norm": 0.7887724041938782, + "learning_rate": 9.48909043307787e-06, + "loss": 0.884, + "step": 5360 + }, + { + "epoch": 0.2950630194286972, + "grad_norm": 0.765163004398346, + "learning_rate": 9.488899532017415e-06, + "loss": 0.8563, + "step": 5361 + }, + { + "epoch": 0.2951180582310529, + "grad_norm": 0.7658557295799255, + "learning_rate": 9.488708597219592e-06, + "loss": 0.8897, + "step": 5362 + }, + { + "epoch": 0.29517309703340855, + "grad_norm": 0.6653227806091309, + "learning_rate": 9.488517628685838e-06, + "loss": 0.7107, + "step": 5363 + }, + { + "epoch": 0.29522813583576424, + "grad_norm": 0.787739098072052, + "learning_rate": 9.488326626417586e-06, + "loss": 0.8181, + "step": 5364 + }, + { + "epoch": 0.29528317463811987, + "grad_norm": 0.7822532057762146, + "learning_rate": 9.488135590416275e-06, + "loss": 0.8238, + "step": 5365 + }, + { + "epoch": 0.29533821344047556, + "grad_norm": 0.7797419428825378, + "learning_rate": 9.487944520683334e-06, + "loss": 0.8484, + "step": 5366 + }, + { + "epoch": 0.2953932522428312, + "grad_norm": 0.7230222225189209, + "learning_rate": 9.487753417220207e-06, + "loss": 0.8193, + "step": 5367 + }, + { + "epoch": 0.2954482910451869, + "grad_norm": 0.8256810307502747, + "learning_rate": 9.487562280028325e-06, + "loss": 0.7691, + "step": 5368 + }, + { + "epoch": 0.2955033298475425, + "grad_norm": 0.7704648375511169, + "learning_rate": 9.487371109109127e-06, + "loss": 0.8235, + "step": 5369 + }, + { + "epoch": 0.29555836864989815, + "grad_norm": 0.7580391764640808, + "learning_rate": 9.487179904464048e-06, + "loss": 0.7911, + "step": 5370 + }, + { + "epoch": 0.29561340745225384, + "grad_norm": 0.7211806774139404, + "learning_rate": 9.486988666094526e-06, + "loss": 0.7188, + "step": 5371 + }, + { + "epoch": 0.2956684462546095, + "grad_norm": 0.8375828862190247, + "learning_rate": 9.486797394001999e-06, + "loss": 0.881, + "step": 5372 + }, + { + "epoch": 0.29572348505696516, + "grad_norm": 0.8500093221664429, + "learning_rate": 9.486606088187903e-06, + "loss": 0.8632, + "step": 5373 + }, + { + "epoch": 0.2957785238593208, + "grad_norm": 0.7754727005958557, + "learning_rate": 9.486414748653677e-06, + "loss": 0.8124, + "step": 5374 + }, + { + "epoch": 0.2958335626616765, + "grad_norm": 0.9395208954811096, + "learning_rate": 9.486223375400759e-06, + "loss": 0.8046, + "step": 5375 + }, + { + "epoch": 0.2958886014640321, + "grad_norm": 0.7587517499923706, + "learning_rate": 9.486031968430587e-06, + "loss": 0.7852, + "step": 5376 + }, + { + "epoch": 0.2959436402663878, + "grad_norm": 0.6921781301498413, + "learning_rate": 9.485840527744599e-06, + "loss": 0.7392, + "step": 5377 + }, + { + "epoch": 0.29599867906874344, + "grad_norm": 0.8768522143363953, + "learning_rate": 9.485649053344233e-06, + "loss": 0.7819, + "step": 5378 + }, + { + "epoch": 0.29605371787109913, + "grad_norm": 0.7565680146217346, + "learning_rate": 9.485457545230932e-06, + "loss": 0.7489, + "step": 5379 + }, + { + "epoch": 0.29610875667345476, + "grad_norm": 0.7760992050170898, + "learning_rate": 9.485266003406132e-06, + "loss": 0.8129, + "step": 5380 + }, + { + "epoch": 0.29616379547581045, + "grad_norm": 0.7726097106933594, + "learning_rate": 9.485074427871272e-06, + "loss": 0.725, + "step": 5381 + }, + { + "epoch": 0.2962188342781661, + "grad_norm": 0.6885473728179932, + "learning_rate": 9.484882818627796e-06, + "loss": 0.685, + "step": 5382 + }, + { + "epoch": 0.2962738730805218, + "grad_norm": 0.776509702205658, + "learning_rate": 9.484691175677138e-06, + "loss": 0.8077, + "step": 5383 + }, + { + "epoch": 0.2963289118828774, + "grad_norm": 0.7436297535896301, + "learning_rate": 9.484499499020744e-06, + "loss": 0.8161, + "step": 5384 + }, + { + "epoch": 0.2963839506852331, + "grad_norm": 0.7604314088821411, + "learning_rate": 9.484307788660052e-06, + "loss": 0.825, + "step": 5385 + }, + { + "epoch": 0.29643898948758873, + "grad_norm": 0.7230789065361023, + "learning_rate": 9.484116044596501e-06, + "loss": 0.8005, + "step": 5386 + }, + { + "epoch": 0.2964940282899444, + "grad_norm": 0.820442259311676, + "learning_rate": 9.483924266831536e-06, + "loss": 0.789, + "step": 5387 + }, + { + "epoch": 0.29654906709230006, + "grad_norm": 0.7514582276344299, + "learning_rate": 9.483732455366596e-06, + "loss": 0.8531, + "step": 5388 + }, + { + "epoch": 0.29660410589465575, + "grad_norm": 0.6671503782272339, + "learning_rate": 9.483540610203124e-06, + "loss": 0.7627, + "step": 5389 + }, + { + "epoch": 0.2966591446970114, + "grad_norm": 0.6955942511558533, + "learning_rate": 9.483348731342559e-06, + "loss": 0.726, + "step": 5390 + }, + { + "epoch": 0.29671418349936707, + "grad_norm": 0.769781768321991, + "learning_rate": 9.483156818786347e-06, + "loss": 0.8064, + "step": 5391 + }, + { + "epoch": 0.2967692223017227, + "grad_norm": 1.0764707326889038, + "learning_rate": 9.482964872535927e-06, + "loss": 0.8249, + "step": 5392 + }, + { + "epoch": 0.2968242611040784, + "grad_norm": 1.0508921146392822, + "learning_rate": 9.482772892592744e-06, + "loss": 0.706, + "step": 5393 + }, + { + "epoch": 0.296879299906434, + "grad_norm": 0.6442564129829407, + "learning_rate": 9.482580878958239e-06, + "loss": 0.6025, + "step": 5394 + }, + { + "epoch": 0.2969343387087897, + "grad_norm": 0.7622735500335693, + "learning_rate": 9.482388831633856e-06, + "loss": 0.7639, + "step": 5395 + }, + { + "epoch": 0.29698937751114535, + "grad_norm": 0.8179057240486145, + "learning_rate": 9.482196750621038e-06, + "loss": 0.7641, + "step": 5396 + }, + { + "epoch": 0.29704441631350104, + "grad_norm": 0.7955192923545837, + "learning_rate": 9.48200463592123e-06, + "loss": 0.8407, + "step": 5397 + }, + { + "epoch": 0.29709945511585667, + "grad_norm": 0.7909773588180542, + "learning_rate": 9.481812487535875e-06, + "loss": 0.7833, + "step": 5398 + }, + { + "epoch": 0.29715449391821236, + "grad_norm": 0.8409042954444885, + "learning_rate": 9.481620305466417e-06, + "loss": 0.7788, + "step": 5399 + }, + { + "epoch": 0.297209532720568, + "grad_norm": 0.7521414160728455, + "learning_rate": 9.4814280897143e-06, + "loss": 0.7192, + "step": 5400 + }, + { + "epoch": 0.2972645715229237, + "grad_norm": 0.7016280889511108, + "learning_rate": 9.481235840280969e-06, + "loss": 0.7181, + "step": 5401 + }, + { + "epoch": 0.2973196103252793, + "grad_norm": 0.7257362604141235, + "learning_rate": 9.48104355716787e-06, + "loss": 0.7845, + "step": 5402 + }, + { + "epoch": 0.297374649127635, + "grad_norm": 0.8048765659332275, + "learning_rate": 9.480851240376445e-06, + "loss": 0.7921, + "step": 5403 + }, + { + "epoch": 0.29742968792999064, + "grad_norm": 0.8715546131134033, + "learning_rate": 9.480658889908143e-06, + "loss": 0.856, + "step": 5404 + }, + { + "epoch": 0.2974847267323463, + "grad_norm": 0.7211160063743591, + "learning_rate": 9.480466505764408e-06, + "loss": 0.7687, + "step": 5405 + }, + { + "epoch": 0.29753976553470196, + "grad_norm": 0.8749645352363586, + "learning_rate": 9.480274087946686e-06, + "loss": 0.8419, + "step": 5406 + }, + { + "epoch": 0.29759480433705765, + "grad_norm": 0.7986398935317993, + "learning_rate": 9.480081636456424e-06, + "loss": 0.8309, + "step": 5407 + }, + { + "epoch": 0.2976498431394133, + "grad_norm": 0.8435508012771606, + "learning_rate": 9.479889151295067e-06, + "loss": 0.7457, + "step": 5408 + }, + { + "epoch": 0.297704881941769, + "grad_norm": 0.8725010752677917, + "learning_rate": 9.479696632464063e-06, + "loss": 0.8069, + "step": 5409 + }, + { + "epoch": 0.2977599207441246, + "grad_norm": 0.7364320158958435, + "learning_rate": 9.479504079964856e-06, + "loss": 0.8316, + "step": 5410 + }, + { + "epoch": 0.2978149595464803, + "grad_norm": 0.7967824935913086, + "learning_rate": 9.479311493798898e-06, + "loss": 0.7689, + "step": 5411 + }, + { + "epoch": 0.29786999834883593, + "grad_norm": 0.8415414094924927, + "learning_rate": 9.479118873967632e-06, + "loss": 0.8288, + "step": 5412 + }, + { + "epoch": 0.29792503715119156, + "grad_norm": 0.9723265767097473, + "learning_rate": 9.478926220472508e-06, + "loss": 0.7422, + "step": 5413 + }, + { + "epoch": 0.29798007595354725, + "grad_norm": 0.7203155159950256, + "learning_rate": 9.478733533314974e-06, + "loss": 0.707, + "step": 5414 + }, + { + "epoch": 0.2980351147559029, + "grad_norm": 0.7643926739692688, + "learning_rate": 9.478540812496478e-06, + "loss": 0.7793, + "step": 5415 + }, + { + "epoch": 0.2980901535582586, + "grad_norm": 0.9177087545394897, + "learning_rate": 9.478348058018467e-06, + "loss": 0.865, + "step": 5416 + }, + { + "epoch": 0.2981451923606142, + "grad_norm": 0.678931713104248, + "learning_rate": 9.478155269882392e-06, + "loss": 0.7716, + "step": 5417 + }, + { + "epoch": 0.2982002311629699, + "grad_norm": 0.8440513610839844, + "learning_rate": 9.4779624480897e-06, + "loss": 0.8904, + "step": 5418 + }, + { + "epoch": 0.29825526996532553, + "grad_norm": 0.8508756756782532, + "learning_rate": 9.47776959264184e-06, + "loss": 0.7994, + "step": 5419 + }, + { + "epoch": 0.2983103087676812, + "grad_norm": 0.8736951947212219, + "learning_rate": 9.477576703540265e-06, + "loss": 0.8374, + "step": 5420 + }, + { + "epoch": 0.29836534757003685, + "grad_norm": 0.8063240051269531, + "learning_rate": 9.47738378078642e-06, + "loss": 0.7217, + "step": 5421 + }, + { + "epoch": 0.29842038637239254, + "grad_norm": 1.1495088338851929, + "learning_rate": 9.477190824381757e-06, + "loss": 0.8902, + "step": 5422 + }, + { + "epoch": 0.2984754251747482, + "grad_norm": 1.0241554975509644, + "learning_rate": 9.476997834327725e-06, + "loss": 0.9354, + "step": 5423 + }, + { + "epoch": 0.29853046397710387, + "grad_norm": 0.939950168132782, + "learning_rate": 9.476804810625779e-06, + "loss": 0.8714, + "step": 5424 + }, + { + "epoch": 0.2985855027794595, + "grad_norm": 0.7592660188674927, + "learning_rate": 9.476611753277364e-06, + "loss": 0.7513, + "step": 5425 + }, + { + "epoch": 0.2986405415818152, + "grad_norm": 0.776153028011322, + "learning_rate": 9.476418662283935e-06, + "loss": 0.7828, + "step": 5426 + }, + { + "epoch": 0.2986955803841708, + "grad_norm": 0.9317814707756042, + "learning_rate": 9.47622553764694e-06, + "loss": 0.865, + "step": 5427 + }, + { + "epoch": 0.2987506191865265, + "grad_norm": 0.7770501971244812, + "learning_rate": 9.476032379367832e-06, + "loss": 0.7281, + "step": 5428 + }, + { + "epoch": 0.29880565798888215, + "grad_norm": 0.7815201282501221, + "learning_rate": 9.475839187448064e-06, + "loss": 0.7565, + "step": 5429 + }, + { + "epoch": 0.29886069679123783, + "grad_norm": 0.7992607951164246, + "learning_rate": 9.475645961889086e-06, + "loss": 0.8109, + "step": 5430 + }, + { + "epoch": 0.29891573559359347, + "grad_norm": 0.7780614495277405, + "learning_rate": 9.475452702692351e-06, + "loss": 0.7814, + "step": 5431 + }, + { + "epoch": 0.29897077439594916, + "grad_norm": 0.7409062385559082, + "learning_rate": 9.475259409859313e-06, + "loss": 0.7712, + "step": 5432 + }, + { + "epoch": 0.2990258131983048, + "grad_norm": 0.7935584187507629, + "learning_rate": 9.47506608339142e-06, + "loss": 0.8301, + "step": 5433 + }, + { + "epoch": 0.2990808520006605, + "grad_norm": 0.6931030750274658, + "learning_rate": 9.474872723290132e-06, + "loss": 0.7471, + "step": 5434 + }, + { + "epoch": 0.2991358908030161, + "grad_norm": 0.7622918486595154, + "learning_rate": 9.474679329556894e-06, + "loss": 0.7727, + "step": 5435 + }, + { + "epoch": 0.2991909296053718, + "grad_norm": 0.7957701086997986, + "learning_rate": 9.474485902193169e-06, + "loss": 0.7663, + "step": 5436 + }, + { + "epoch": 0.29924596840772744, + "grad_norm": 1.0600612163543701, + "learning_rate": 9.474292441200404e-06, + "loss": 0.7861, + "step": 5437 + }, + { + "epoch": 0.2993010072100831, + "grad_norm": 0.7343600392341614, + "learning_rate": 9.474098946580053e-06, + "loss": 0.8609, + "step": 5438 + }, + { + "epoch": 0.29935604601243876, + "grad_norm": 0.7477726340293884, + "learning_rate": 9.473905418333573e-06, + "loss": 0.7683, + "step": 5439 + }, + { + "epoch": 0.29941108481479445, + "grad_norm": 0.7955546379089355, + "learning_rate": 9.473711856462417e-06, + "loss": 0.8406, + "step": 5440 + }, + { + "epoch": 0.2994661236171501, + "grad_norm": 0.8291183114051819, + "learning_rate": 9.47351826096804e-06, + "loss": 0.6919, + "step": 5441 + }, + { + "epoch": 0.29952116241950577, + "grad_norm": 0.8899849057197571, + "learning_rate": 9.473324631851898e-06, + "loss": 0.9403, + "step": 5442 + }, + { + "epoch": 0.2995762012218614, + "grad_norm": 0.837066650390625, + "learning_rate": 9.473130969115445e-06, + "loss": 0.8676, + "step": 5443 + }, + { + "epoch": 0.2996312400242171, + "grad_norm": 0.8385708928108215, + "learning_rate": 9.472937272760138e-06, + "loss": 0.7588, + "step": 5444 + }, + { + "epoch": 0.2996862788265727, + "grad_norm": 0.6990595459938049, + "learning_rate": 9.472743542787431e-06, + "loss": 0.6769, + "step": 5445 + }, + { + "epoch": 0.2997413176289284, + "grad_norm": 0.789165735244751, + "learning_rate": 9.472549779198781e-06, + "loss": 0.8084, + "step": 5446 + }, + { + "epoch": 0.29979635643128405, + "grad_norm": 0.8820298314094543, + "learning_rate": 9.472355981995643e-06, + "loss": 0.8262, + "step": 5447 + }, + { + "epoch": 0.29985139523363974, + "grad_norm": 0.8928382992744446, + "learning_rate": 9.472162151179475e-06, + "loss": 0.8123, + "step": 5448 + }, + { + "epoch": 0.2999064340359954, + "grad_norm": 0.7688086032867432, + "learning_rate": 9.471968286751735e-06, + "loss": 0.6846, + "step": 5449 + }, + { + "epoch": 0.29996147283835106, + "grad_norm": 0.6962918043136597, + "learning_rate": 9.471774388713877e-06, + "loss": 0.7872, + "step": 5450 + }, + { + "epoch": 0.3000165116407067, + "grad_norm": 0.7467569708824158, + "learning_rate": 9.47158045706736e-06, + "loss": 0.8201, + "step": 5451 + }, + { + "epoch": 0.3000715504430624, + "grad_norm": 0.7651814222335815, + "learning_rate": 9.471386491813642e-06, + "loss": 0.7734, + "step": 5452 + }, + { + "epoch": 0.300126589245418, + "grad_norm": 0.8001144528388977, + "learning_rate": 9.47119249295418e-06, + "loss": 0.8266, + "step": 5453 + }, + { + "epoch": 0.3001816280477737, + "grad_norm": 0.7937704920768738, + "learning_rate": 9.47099846049043e-06, + "loss": 0.8025, + "step": 5454 + } + ], + "logging_steps": 1, + "max_steps": 36338, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 909, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6095088284731965e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5454/training_args.bin b/checkpoint-5454/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..006183302820d451c9ca29db41c5d8a020225b2a --- /dev/null +++ b/checkpoint-5454/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc35e326b781438eda71a1f881999ddc9f323429e8f60e362a744617b4ee255 +size 7928 diff --git a/checkpoint-5454/zero_to_fp32.py b/checkpoint-5454/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-5454/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters)