diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..cddabaeb8ca303ec826bce9551051b3ea039e8dd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +saves/chess/generate_strategy/checkpoint-19208/tokenizer.json filter=lfs diff=lfs merge=lfs -text +saves/chess/generate_strategy/tokenizer.json filter=lfs diff=lfs merge=lfs -text +saves/chess/no_explain/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +saves/chess/no_explain/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/saves/chess/generate_strategy/README.md b/saves/chess/generate_strategy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..aaaf741ac88f423958af8fc7b22f7b2aa2400948 --- /dev/null +++ b/saves/chess/generate_strategy/README.md @@ -0,0 +1,75 @@ +--- +library_name: transformers +license: other +base_model: meta-llama/Meta-Llama-3-8B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: generate_strategy + results: [] +--- + + + +# generate_strategy + +This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the chess_generate_strategy_00, the chess_generate_strategy_01, the chess_generate_strategy_02, the chess_generate_strategy_03, the chess_generate_strategy_04, the chess_generate_strategy_05, the chess_generate_strategy_06, the chess_generate_strategy_07, the chess_generate_strategy_08, the chess_generate_strategy_09, the chess_generate_strategy_10, the chess_generate_strategy_11, the chess_generate_strategy_12, the chess_generate_strategy_13 and the chess_generate_strategy_14 datasets. +It achieves the following results on the evaluation set: +- Loss: 0.1802 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 64 +- eval_batch_size: 64 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 1024 +- total_eval_batch_size: 512 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 10.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:-----:|:---------------:| +| 0.1808 | 1.0 | 2401 | 0.1811 | +| 0.1796 | 2.0 | 4802 | 0.1797 | +| 0.1794 | 3.0 | 7203 | 0.1795 | +| 0.1792 | 4.0 | 9604 | 0.1792 | +| 0.1791 | 5.0 | 12005 | 0.1791 | +| 0.179 | 6.0 | 14406 | 0.1791 | +| 0.1787 | 7.0 | 16807 | 0.1789 | +| 0.1786 | 8.0 | 19208 | 0.1789 | +| 0.1781 | 9.0 | 21609 | 0.1791 | +| 0.1766 | 9.9960 | 24000 | 0.1802 | + + +### Framework versions + +- Transformers 4.48.2 +- Pytorch 2.6.0+cu124 +- Datasets 2.21.0 +- Tokenizers 0.21.0 diff --git a/saves/chess/generate_strategy/all_results.json b/saves/chess/generate_strategy/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b4dbfa355b25f2fe1c86be95afbbd232a920065c --- /dev/null +++ b/saves/chess/generate_strategy/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 9.996042491147678, + "eval_loss": 0.18023133277893066, + "eval_runtime": 195.9606, + "eval_samples_per_second": 1393.637, + "eval_steps_per_second": 2.725, + "total_flos": 5485114750402560.0, + "train_loss": 0.19645737719535827, + "train_runtime": 70712.6152, + "train_samples_per_second": 347.587, + "train_steps_per_second": 0.339 +} \ No newline at end of file diff --git a/saves/chess/generate_strategy/checkpoint-19208/config.json b/saves/chess/generate_strategy/checkpoint-19208/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9ce0e7d2a8ad9d74229897630ae54102a0a1a3 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/generation_config.json b/saves/chess/generate_strategy/checkpoint-19208/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb70ec09806f7ce366dd58e8239ad0ca2d5babf1 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.2" +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/latest b/saves/chess/generate_strategy/checkpoint-19208/latest new file mode 100644 index 0000000000000000000000000000000000000000..6ccd66a2151500380927f3754aedd0bfc9be23ee --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/latest @@ -0,0 +1 @@ +global_step19204 \ No newline at end of file diff --git a/saves/chess/generate_strategy/checkpoint-19208/model-00001-of-00004.safetensors b/saves/chess/generate_strategy/checkpoint-19208/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6779b0b4c6cdf5945b4f462103c5220b6ac2cfb --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679dfe090c52caf92b5838be67f7183765351855fac89b7c43861328dbb24e80 +size 4976698672 diff --git a/saves/chess/generate_strategy/checkpoint-19208/model-00002-of-00004.safetensors b/saves/chess/generate_strategy/checkpoint-19208/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..167403a3580aa4c4039bc3ef5f10347d215e7d2c --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc5434cdd2428cb25c44c86b9f6b0a603b7b8253f35f965276d5af380c6b81b +size 4999802720 diff --git a/saves/chess/generate_strategy/checkpoint-19208/model-00003-of-00004.safetensors b/saves/chess/generate_strategy/checkpoint-19208/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e89483a008b8f804c04379cb7729349759043a80 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51e43bcda95abe59d86336aa9bfd3fbdd9e118d8930bd6a9664953c053c9a9bc +size 4915916176 diff --git a/saves/chess/generate_strategy/checkpoint-19208/model-00004-of-00004.safetensors b/saves/chess/generate_strategy/checkpoint-19208/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10a29f0101a61aac22de12c8945c0ea036c66bab --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f9a9755629abc894bba719c6852dfbf3311d8bf2cbabc726d3117682ba2844 +size 1168138808 diff --git a/saves/chess/generate_strategy/checkpoint-19208/model.safetensors.index.json b/saves/chess/generate_strategy/checkpoint-19208/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_0.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..331a542ef30cc221562b6a988bba872aca28732e --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb7c3bc1248de8b4739437317b988d953fd64a5de9736606d74f9c8277f1b485 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_1.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d445f1a845bda18b54837a3234302870193ebea4 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e571d57a85eb2cdabf3f46c86e446bdb7d26aba8b1467b5e4b5bbe29ad42a7 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_2.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3a1a5fda176cefd8a1f05e423f2c82ed9f2333bf --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:489e5542988617525a395c45dc83ec6bf25b473812e139122f0a3f3d92f031d0 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_3.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7495a1bc89c5532615f548b4a177c4b6de82a0a --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd77682efb711872c5be25e87e87a2726a2e7105422cddd00f04da7be35ca20 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_4.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..a0dd539c338038495aec8fdc04c5e6d165086b28 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44d9e7d535f5fbcd7cfef16ba22d32d5f445aacceba782a05df1f97d47a608a +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_5.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd7cb309d087786d365a3ca391edef06504b3bb4 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a107290a0d9898930bc6abe369ee246ef7322541985fc2a5320e7775f5ea5c88 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_6.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c760c81b8bffb4ba6cb4dcda4460911ef5e78df --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ab49d56ee4079c2a208376064f825918f070addc8f0c58c5c594265f9e8a78 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/rng_state_7.pth b/saves/chess/generate_strategy/checkpoint-19208/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..62523a33304462480531f2f10d91dcdd14562719 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d15033d06420b17d80db45c89544170faa67833d5a0d9c30a51a38a1102b073 +size 15984 diff --git a/saves/chess/generate_strategy/checkpoint-19208/scheduler.pt b/saves/chess/generate_strategy/checkpoint-19208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..200e917e590f65c4d828c2dc613bcd34c26dcdfd --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c563ec26d3e6a26a391a7a14609370350d4e8af185e7a822f099a4fa127f834f +size 1064 diff --git a/saves/chess/generate_strategy/checkpoint-19208/special_tokens_map.json b/saves/chess/generate_strategy/checkpoint-19208/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..df5c3a478b842fa66e6a8c10265478284c1d4f41 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/special_tokens_map.json @@ -0,0 +1,33 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/tokenizer.json b/saves/chess/generate_strategy/checkpoint-19208/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..172311123ab62378f1f6d90f3068a676b7d939ed --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 +size 17210148 diff --git a/saves/chess/generate_strategy/checkpoint-19208/tokenizer_config.json b/saves/chess/generate_strategy/checkpoint-19208/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e2afd45c14285320f15500548266d7adba98d07a --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/tokenizer_config.json @@ -0,0 +1,2078 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eot_id|>", + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 2048, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/trainer_state.json b/saves/chess/generate_strategy/checkpoint-19208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3bb999169caa55aee8772e53d80dafa837fe80f7 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/trainer_state.json @@ -0,0 +1,1441 @@ +{ + "best_metric": 0.17886345088481903, + "best_model_checkpoint": "saves/chess/generate_strategy/checkpoint-19208", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 19208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0416579879191835, + "grad_norm": 8.262849587594042, + "learning_rate": 2.0833333333333333e-07, + "loss": 3.9539, + "step": 100 + }, + { + "epoch": 0.083315975838367, + "grad_norm": 2.1815007336055197, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.4086, + "step": 200 + }, + { + "epoch": 0.12497396375755052, + "grad_norm": 1.094766614987478, + "learning_rate": 6.25e-07, + "loss": 0.2144, + "step": 300 + }, + { + "epoch": 0.166631951676734, + "grad_norm": 1.015902700288932, + "learning_rate": 8.333333333333333e-07, + "loss": 0.2103, + "step": 400 + }, + { + "epoch": 0.20828993959591752, + "grad_norm": 1.083927107302103, + "learning_rate": 1.0416666666666667e-06, + "loss": 0.2075, + "step": 500 + }, + { + "epoch": 0.24994792751510103, + "grad_norm": 0.8787980351861964, + "learning_rate": 1.25e-06, + "loss": 0.2049, + "step": 600 + }, + { + "epoch": 0.29160591543428455, + "grad_norm": 0.5454433660253264, + "learning_rate": 1.4583333333333335e-06, + "loss": 0.2001, + "step": 700 + }, + { + "epoch": 0.333263903353468, + "grad_norm": 0.6745519185509095, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1916, + "step": 800 + }, + { + "epoch": 0.3749218912726515, + "grad_norm": 0.3263511819812891, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.1849, + "step": 900 + }, + { + "epoch": 0.41657987919183503, + "grad_norm": 3.199309878765134, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.1847, + "step": 1000 + }, + { + "epoch": 0.45823786711101855, + "grad_norm": 0.4060106618321982, + "learning_rate": 2.2916666666666666e-06, + "loss": 0.1845, + "step": 1100 + }, + { + "epoch": 0.49989585503020206, + "grad_norm": 0.36591848729629267, + "learning_rate": 2.5e-06, + "loss": 0.1818, + "step": 1200 + }, + { + "epoch": 0.5415538429493856, + "grad_norm": 0.35361804320631923, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.1807, + "step": 1300 + }, + { + "epoch": 0.5832118308685691, + "grad_norm": 0.35892337648275896, + "learning_rate": 2.916666666666667e-06, + "loss": 0.1806, + "step": 1400 + }, + { + "epoch": 0.6248698187877526, + "grad_norm": 0.2820867931414937, + "learning_rate": 3.125e-06, + "loss": 0.1806, + "step": 1500 + }, + { + "epoch": 0.666527806706936, + "grad_norm": 0.3098924570604735, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1808, + "step": 1600 + }, + { + "epoch": 0.7081857946261195, + "grad_norm": 0.29714949257038253, + "learning_rate": 3.5416666666666673e-06, + "loss": 0.1803, + "step": 1700 + }, + { + "epoch": 0.749843782545303, + "grad_norm": 0.302226244442205, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1805, + "step": 1800 + }, + { + "epoch": 0.7915017704644866, + "grad_norm": 0.3329180855942572, + "learning_rate": 3.958333333333333e-06, + "loss": 0.1833, + "step": 1900 + }, + { + "epoch": 0.8331597583836701, + "grad_norm": 0.28770265809452183, + "learning_rate": 4.166666666666667e-06, + "loss": 0.1807, + "step": 2000 + }, + { + "epoch": 0.8748177463028536, + "grad_norm": 0.3308819875323557, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.1804, + "step": 2100 + }, + { + "epoch": 0.9164757342220371, + "grad_norm": 0.3163212399640271, + "learning_rate": 4.583333333333333e-06, + "loss": 0.1805, + "step": 2200 + }, + { + "epoch": 0.9581337221412206, + "grad_norm": 0.3898310274135571, + "learning_rate": 4.791666666666668e-06, + "loss": 0.1803, + "step": 2300 + }, + { + "epoch": 0.9997917100604041, + "grad_norm": 0.27784332983216586, + "learning_rate": 5e-06, + "loss": 0.1808, + "step": 2400 + }, + { + "epoch": 1.0, + "eval_loss": 0.18106774985790253, + "eval_runtime": 196.6682, + "eval_samples_per_second": 1388.623, + "eval_steps_per_second": 2.715, + "step": 2401 + }, + { + "epoch": 1.0412414080399917, + "grad_norm": 0.2936543487056633, + "learning_rate": 4.999735579817769e-06, + "loss": 0.1807, + "step": 2500 + }, + { + "epoch": 1.082899395959175, + "grad_norm": 0.2809875255295402, + "learning_rate": 4.998942375205502e-06, + "loss": 0.1801, + "step": 2600 + }, + { + "epoch": 1.1245573838783587, + "grad_norm": 0.2760622198201079, + "learning_rate": 4.997620553954645e-06, + "loss": 0.1801, + "step": 2700 + }, + { + "epoch": 1.166215371797542, + "grad_norm": 0.2710350326429577, + "learning_rate": 4.995770395678171e-06, + "loss": 0.1803, + "step": 2800 + }, + { + "epoch": 1.2078733597167257, + "grad_norm": 0.20931696168572392, + "learning_rate": 4.993392291751431e-06, + "loss": 0.1803, + "step": 2900 + }, + { + "epoch": 1.2495313476359091, + "grad_norm": 0.24323887106839603, + "learning_rate": 4.990486745229364e-06, + "loss": 0.1799, + "step": 3000 + }, + { + "epoch": 1.2911893355550927, + "grad_norm": 0.2815796357302052, + "learning_rate": 4.9870543707400835e-06, + "loss": 0.1798, + "step": 3100 + }, + { + "epoch": 1.3328473234742761, + "grad_norm": 0.23664820561946712, + "learning_rate": 4.983095894354858e-06, + "loss": 0.1801, + "step": 3200 + }, + { + "epoch": 1.3745053113934598, + "grad_norm": 0.3083911955290968, + "learning_rate": 4.978612153434527e-06, + "loss": 0.1801, + "step": 3300 + }, + { + "epoch": 1.4161632993126432, + "grad_norm": 0.24337206279187154, + "learning_rate": 4.973604096452361e-06, + "loss": 0.1799, + "step": 3400 + }, + { + "epoch": 1.4578212872318268, + "grad_norm": 0.2691338598173961, + "learning_rate": 4.968072782793436e-06, + "loss": 0.1798, + "step": 3500 + }, + { + "epoch": 1.4994792751510102, + "grad_norm": 0.1859964729302664, + "learning_rate": 4.962019382530521e-06, + "loss": 0.18, + "step": 3600 + }, + { + "epoch": 1.5411372630701936, + "grad_norm": 0.29588302582709847, + "learning_rate": 4.955445176176577e-06, + "loss": 0.18, + "step": 3700 + }, + { + "epoch": 1.5827952509893772, + "grad_norm": 0.24224751463035848, + "learning_rate": 4.948351554413879e-06, + "loss": 0.1993, + "step": 3800 + }, + { + "epoch": 1.6244532389085609, + "grad_norm": 0.24926986804364754, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.1799, + "step": 3900 + }, + { + "epoch": 1.6661112268277443, + "grad_norm": 0.26907499271712193, + "learning_rate": 4.93261217644956e-06, + "loss": 0.1796, + "step": 4000 + }, + { + "epoch": 1.7077692147469277, + "grad_norm": 0.24652167596434857, + "learning_rate": 4.9239697496952904e-06, + "loss": 0.1797, + "step": 4100 + }, + { + "epoch": 1.7494272026661113, + "grad_norm": 0.26360641338937, + "learning_rate": 4.914814565722671e-06, + "loss": 0.1797, + "step": 4200 + }, + { + "epoch": 1.7910851905852947, + "grad_norm": 0.21211424396568565, + "learning_rate": 4.905148561184033e-06, + "loss": 0.1798, + "step": 4300 + }, + { + "epoch": 1.832743178504478, + "grad_norm": 0.23174306094818595, + "learning_rate": 4.894973780788722e-06, + "loss": 0.1798, + "step": 4400 + }, + { + "epoch": 1.8744011664236617, + "grad_norm": 0.20239856810705756, + "learning_rate": 4.884292376870567e-06, + "loss": 0.1797, + "step": 4500 + }, + { + "epoch": 1.9160591543428453, + "grad_norm": 0.20895880362963307, + "learning_rate": 4.873106608932585e-06, + "loss": 0.1796, + "step": 4600 + }, + { + "epoch": 1.9577171422620288, + "grad_norm": 0.2341875351736524, + "learning_rate": 4.861418843169012e-06, + "loss": 0.1797, + "step": 4700 + }, + { + "epoch": 1.9993751301812122, + "grad_norm": 0.20045835157915606, + "learning_rate": 4.849231551964771e-06, + "loss": 0.1796, + "step": 4800 + }, + { + "epoch": 2.0, + "eval_loss": 0.17972978949546814, + "eval_runtime": 196.3636, + "eval_samples_per_second": 1390.777, + "eval_steps_per_second": 2.719, + "step": 4802 + }, + { + "epoch": 2.0408248281607997, + "grad_norm": 0.21309941078379252, + "learning_rate": 4.836547313372472e-06, + "loss": 0.1795, + "step": 4900 + }, + { + "epoch": 2.0824828160799833, + "grad_norm": 0.19717578427183138, + "learning_rate": 4.823368810567056e-06, + "loss": 0.1794, + "step": 5000 + }, + { + "epoch": 2.124140803999167, + "grad_norm": 0.23023011075724995, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1802, + "step": 5100 + }, + { + "epoch": 2.16579879191835, + "grad_norm": 0.21578484379978355, + "learning_rate": 4.7955402672006855e-06, + "loss": 0.18, + "step": 5200 + }, + { + "epoch": 2.2074567798375337, + "grad_norm": 0.21410225528440446, + "learning_rate": 4.780896113382536e-06, + "loss": 0.1798, + "step": 5300 + }, + { + "epoch": 2.2491147677567174, + "grad_norm": 0.24923656549560563, + "learning_rate": 4.765769467591626e-06, + "loss": 0.1796, + "step": 5400 + }, + { + "epoch": 2.290772755675901, + "grad_norm": 0.27043973727195314, + "learning_rate": 4.750163529660303e-06, + "loss": 0.1799, + "step": 5500 + }, + { + "epoch": 2.332430743595084, + "grad_norm": 0.20084508849747548, + "learning_rate": 4.734081600808531e-06, + "loss": 0.1796, + "step": 5600 + }, + { + "epoch": 2.374088731514268, + "grad_norm": 0.17037675166345598, + "learning_rate": 4.717527082945555e-06, + "loss": 0.1797, + "step": 5700 + }, + { + "epoch": 2.4157467194334514, + "grad_norm": 0.20792174660657012, + "learning_rate": 4.700503477950278e-06, + "loss": 0.1797, + "step": 5800 + }, + { + "epoch": 2.457404707352635, + "grad_norm": 0.20444912332175158, + "learning_rate": 4.6830143869304904e-06, + "loss": 0.1799, + "step": 5900 + }, + { + "epoch": 2.4990626952718182, + "grad_norm": 0.2160441899332462, + "learning_rate": 4.665063509461098e-06, + "loss": 0.1797, + "step": 6000 + }, + { + "epoch": 2.540720683191002, + "grad_norm": 0.25556787549882387, + "learning_rate": 4.646654642801533e-06, + "loss": 0.1794, + "step": 6100 + }, + { + "epoch": 2.5823786711101855, + "grad_norm": 0.22198410769602075, + "learning_rate": 4.627791681092499e-06, + "loss": 0.1794, + "step": 6200 + }, + { + "epoch": 2.624036659029369, + "grad_norm": 0.19549701905963526, + "learning_rate": 4.608478614532215e-06, + "loss": 0.1795, + "step": 6300 + }, + { + "epoch": 2.6656946469485523, + "grad_norm": 0.24454736703986502, + "learning_rate": 4.588719528532342e-06, + "loss": 0.1797, + "step": 6400 + }, + { + "epoch": 2.707352634867736, + "grad_norm": 0.20111965276500102, + "learning_rate": 4.568518602853776e-06, + "loss": 0.1797, + "step": 6500 + }, + { + "epoch": 2.7490106227869195, + "grad_norm": 0.2155615827433472, + "learning_rate": 4.54788011072248e-06, + "loss": 0.1796, + "step": 6600 + }, + { + "epoch": 2.7906686107061027, + "grad_norm": 0.23518049751986453, + "learning_rate": 4.526808417925531e-06, + "loss": 0.1796, + "step": 6700 + }, + { + "epoch": 2.8323265986252864, + "grad_norm": 0.2088881277827675, + "learning_rate": 4.50530798188761e-06, + "loss": 0.1795, + "step": 6800 + }, + { + "epoch": 2.87398458654447, + "grad_norm": 0.22027451607755855, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.1794, + "step": 6900 + }, + { + "epoch": 2.9156425744636536, + "grad_norm": 0.20366425013850817, + "learning_rate": 4.46103916229894e-06, + "loss": 0.1793, + "step": 7000 + }, + { + "epoch": 2.957300562382837, + "grad_norm": 0.2718663681076218, + "learning_rate": 4.438280143203665e-06, + "loss": 0.1796, + "step": 7100 + }, + { + "epoch": 2.9989585503020204, + "grad_norm": 0.19182709064421555, + "learning_rate": 4.415111107797445e-06, + "loss": 0.1794, + "step": 7200 + }, + { + "epoch": 3.0, + "eval_loss": 0.1794959157705307, + "eval_runtime": 196.4289, + "eval_samples_per_second": 1390.315, + "eval_steps_per_second": 2.719, + "step": 7203 + }, + { + "epoch": 3.040408248281608, + "grad_norm": 0.195058367609666, + "learning_rate": 4.391536957168733e-06, + "loss": 0.1798, + "step": 7300 + }, + { + "epoch": 3.0820662362007916, + "grad_norm": 0.2256357073328012, + "learning_rate": 4.367562678102491e-06, + "loss": 0.1795, + "step": 7400 + }, + { + "epoch": 3.123724224119975, + "grad_norm": 0.2129481809880029, + "learning_rate": 4.34319334202531e-06, + "loss": 0.1795, + "step": 7500 + }, + { + "epoch": 3.1653822120391584, + "grad_norm": 0.1689665633552094, + "learning_rate": 4.318434103932622e-06, + "loss": 0.1795, + "step": 7600 + }, + { + "epoch": 3.207040199958342, + "grad_norm": 0.18434140023135, + "learning_rate": 4.293290201298224e-06, + "loss": 0.1796, + "step": 7700 + }, + { + "epoch": 3.2486981878775256, + "grad_norm": 0.2103528683280332, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1793, + "step": 7800 + }, + { + "epoch": 3.290356175796709, + "grad_norm": 0.16087446181904855, + "learning_rate": 4.241869758026638e-06, + "loss": 0.1794, + "step": 7900 + }, + { + "epoch": 3.3320141637158924, + "grad_norm": 0.22569144057534085, + "learning_rate": 4.215604094671835e-06, + "loss": 0.1792, + "step": 8000 + }, + { + "epoch": 3.373672151635076, + "grad_norm": 0.19990473196998446, + "learning_rate": 4.188975519039151e-06, + "loss": 0.1794, + "step": 8100 + }, + { + "epoch": 3.4153301395542597, + "grad_norm": 0.1902243355455867, + "learning_rate": 4.161989664034844e-06, + "loss": 0.1794, + "step": 8200 + }, + { + "epoch": 3.456988127473443, + "grad_norm": 0.18824118604006632, + "learning_rate": 4.134652238142674e-06, + "loss": 0.1794, + "step": 8300 + }, + { + "epoch": 3.4986461153926265, + "grad_norm": 0.19597204875441573, + "learning_rate": 4.106969024216348e-06, + "loss": 0.1794, + "step": 8400 + }, + { + "epoch": 3.54030410331181, + "grad_norm": 0.17674897479656335, + "learning_rate": 4.078945878256244e-06, + "loss": 0.1793, + "step": 8500 + }, + { + "epoch": 3.5819620912309933, + "grad_norm": 0.19658906636767987, + "learning_rate": 4.0505887281706505e-06, + "loss": 0.1794, + "step": 8600 + }, + { + "epoch": 3.623620079150177, + "grad_norm": 0.1607909455989355, + "learning_rate": 4.021903572521802e-06, + "loss": 0.1794, + "step": 8700 + }, + { + "epoch": 3.6652780670693605, + "grad_norm": 0.18982136425367155, + "learning_rate": 3.992896479256966e-06, + "loss": 0.1793, + "step": 8800 + }, + { + "epoch": 3.706936054988544, + "grad_norm": 0.18212426964310202, + "learning_rate": 3.963573584424852e-06, + "loss": 0.1794, + "step": 8900 + }, + { + "epoch": 3.748594042907728, + "grad_norm": 0.18731109638030716, + "learning_rate": 3.933941090877615e-06, + "loss": 0.1799, + "step": 9000 + }, + { + "epoch": 3.790252030826911, + "grad_norm": 0.2243920924541318, + "learning_rate": 3.9040052669587325e-06, + "loss": 0.1863, + "step": 9100 + }, + { + "epoch": 3.8319100187460946, + "grad_norm": 0.19665494095424324, + "learning_rate": 3.8737724451770155e-06, + "loss": 0.1793, + "step": 9200 + }, + { + "epoch": 3.8735680066652782, + "grad_norm": 0.1709097835399287, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.1792, + "step": 9300 + }, + { + "epoch": 3.9152259945844614, + "grad_norm": 0.1519558310026607, + "learning_rate": 3.8124414508364005e-06, + "loss": 0.1792, + "step": 9400 + }, + { + "epoch": 3.956883982503645, + "grad_norm": 0.18615584510557248, + "learning_rate": 3.7813562519996633e-06, + "loss": 0.1791, + "step": 9500 + }, + { + "epoch": 3.9985419704228287, + "grad_norm": 0.14216906700933155, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1792, + "step": 9600 + }, + { + "epoch": 4.0, + "eval_loss": 0.17919312417507172, + "eval_runtime": 196.5199, + "eval_samples_per_second": 1389.671, + "eval_steps_per_second": 2.717, + "step": 9604 + }, + { + "epoch": 4.039991668402416, + "grad_norm": 0.1981373334933009, + "learning_rate": 3.7183793278181063e-06, + "loss": 0.1793, + "step": 9700 + }, + { + "epoch": 4.081649656321599, + "grad_norm": 0.1796707844873524, + "learning_rate": 3.6865009243691015e-06, + "loss": 0.1791, + "step": 9800 + }, + { + "epoch": 4.123307644240783, + "grad_norm": 0.21582792834146144, + "learning_rate": 3.654371533087586e-06, + "loss": 0.1792, + "step": 9900 + }, + { + "epoch": 4.164965632159967, + "grad_norm": 0.22285894509633086, + "learning_rate": 3.621997950501156e-06, + "loss": 0.179, + "step": 10000 + }, + { + "epoch": 4.20662362007915, + "grad_norm": 0.1947839176316504, + "learning_rate": 3.5893870247926986e-06, + "loss": 0.1792, + "step": 10100 + }, + { + "epoch": 4.248281607998334, + "grad_norm": 0.18044045004936568, + "learning_rate": 3.556545654351749e-06, + "loss": 0.1791, + "step": 10200 + }, + { + "epoch": 4.2899395959175175, + "grad_norm": 0.21629122720481903, + "learning_rate": 3.5234807863152316e-06, + "loss": 0.1793, + "step": 10300 + }, + { + "epoch": 4.3315975838367, + "grad_norm": 0.15404290423986947, + "learning_rate": 3.4901994150978926e-06, + "loss": 0.1791, + "step": 10400 + }, + { + "epoch": 4.373255571755884, + "grad_norm": 0.16032922618842949, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1791, + "step": 10500 + }, + { + "epoch": 4.4149135596750675, + "grad_norm": 0.1495191719599753, + "learning_rate": 3.4230153682817112e-06, + "loss": 0.1791, + "step": 10600 + }, + { + "epoch": 4.456571547594251, + "grad_norm": 0.19697439856186114, + "learning_rate": 3.389126904537192e-06, + "loss": 0.1791, + "step": 10700 + }, + { + "epoch": 4.498229535513435, + "grad_norm": 0.17156322418134476, + "learning_rate": 3.3550503583141726e-06, + "loss": 0.1791, + "step": 10800 + }, + { + "epoch": 4.539887523432618, + "grad_norm": 0.1561878142062692, + "learning_rate": 3.3207929380339034e-06, + "loss": 0.1792, + "step": 10900 + }, + { + "epoch": 4.581545511351802, + "grad_norm": 0.1828679685381653, + "learning_rate": 3.2863618903790346e-06, + "loss": 0.1791, + "step": 11000 + }, + { + "epoch": 4.623203499270986, + "grad_norm": 0.1802733896031037, + "learning_rate": 3.2517644987606827e-06, + "loss": 0.1792, + "step": 11100 + }, + { + "epoch": 4.664861487190168, + "grad_norm": 0.15579534435978112, + "learning_rate": 3.217008081777726e-06, + "loss": 0.1791, + "step": 11200 + }, + { + "epoch": 4.706519475109352, + "grad_norm": 0.16638908065693153, + "learning_rate": 3.182099991668653e-06, + "loss": 0.1791, + "step": 11300 + }, + { + "epoch": 4.748177463028536, + "grad_norm": 0.18397163828033228, + "learning_rate": 3.147047612756302e-06, + "loss": 0.1792, + "step": 11400 + }, + { + "epoch": 4.789835450947719, + "grad_norm": 0.17751483450519995, + "learning_rate": 3.1118583598858097e-06, + "loss": 0.179, + "step": 11500 + }, + { + "epoch": 4.831493438866903, + "grad_norm": 0.1808778224251496, + "learning_rate": 3.0765396768561005e-06, + "loss": 0.179, + "step": 11600 + }, + { + "epoch": 4.8731514267860865, + "grad_norm": 0.17593346330767928, + "learning_rate": 3.0410990348452572e-06, + "loss": 0.1793, + "step": 11700 + }, + { + "epoch": 4.91480941470527, + "grad_norm": 0.15824861181745342, + "learning_rate": 3.0055439308300954e-06, + "loss": 0.1791, + "step": 11800 + }, + { + "epoch": 4.956467402624453, + "grad_norm": 0.21055777806239853, + "learning_rate": 2.96988188600028e-06, + "loss": 0.1792, + "step": 11900 + }, + { + "epoch": 4.9981253905436365, + "grad_norm": 0.15352806003656314, + "learning_rate": 2.9341204441673267e-06, + "loss": 0.1791, + "step": 12000 + }, + { + "epoch": 5.0, + "eval_loss": 0.17911389470100403, + "eval_runtime": 196.4564, + "eval_samples_per_second": 1390.12, + "eval_steps_per_second": 2.718, + "step": 12005 + }, + { + "epoch": 5.0395750885232244, + "grad_norm": 0.1891820592041876, + "learning_rate": 2.898267170168807e-06, + "loss": 0.1791, + "step": 12100 + }, + { + "epoch": 5.081233076442408, + "grad_norm": 0.14302405130068518, + "learning_rate": 2.862329648268117e-06, + "loss": 0.1789, + "step": 12200 + }, + { + "epoch": 5.122891064361592, + "grad_norm": 0.2215960599158716, + "learning_rate": 2.82631548055013e-06, + "loss": 0.1792, + "step": 12300 + }, + { + "epoch": 5.164549052280774, + "grad_norm": 0.1566593937408507, + "learning_rate": 2.7902322853130758e-06, + "loss": 0.179, + "step": 12400 + }, + { + "epoch": 5.206207040199958, + "grad_norm": 0.15513379693358573, + "learning_rate": 2.754087695457005e-06, + "loss": 0.1791, + "step": 12500 + }, + { + "epoch": 5.247865028119142, + "grad_norm": 0.14968722299942713, + "learning_rate": 2.717889356869146e-06, + "loss": 0.179, + "step": 12600 + }, + { + "epoch": 5.289523016038325, + "grad_norm": 0.2097123380235341, + "learning_rate": 2.681644926806527e-06, + "loss": 0.179, + "step": 12700 + }, + { + "epoch": 5.331181003957509, + "grad_norm": 0.19315969222642626, + "learning_rate": 2.6453620722761897e-06, + "loss": 0.179, + "step": 12800 + }, + { + "epoch": 5.372838991876693, + "grad_norm": 0.2209634744371871, + "learning_rate": 2.6090484684133406e-06, + "loss": 0.1791, + "step": 12900 + }, + { + "epoch": 5.414496979795876, + "grad_norm": 0.20430693758591473, + "learning_rate": 2.572711796857779e-06, + "loss": 0.179, + "step": 13000 + }, + { + "epoch": 5.45615496771506, + "grad_norm": 0.18903967369853375, + "learning_rate": 2.5363597441289574e-06, + "loss": 0.179, + "step": 13100 + }, + { + "epoch": 5.4978129556342425, + "grad_norm": 0.15616083753477006, + "learning_rate": 2.5e-06, + "loss": 0.179, + "step": 13200 + }, + { + "epoch": 5.539470943553426, + "grad_norm": 0.1507559008561688, + "learning_rate": 2.4636402558710434e-06, + "loss": 0.1791, + "step": 13300 + }, + { + "epoch": 5.58112893147261, + "grad_norm": 0.16640062646644058, + "learning_rate": 2.4272882031422216e-06, + "loss": 0.179, + "step": 13400 + }, + { + "epoch": 5.622786919391793, + "grad_norm": 0.1824434916593794, + "learning_rate": 2.3909515315866606e-06, + "loss": 0.1791, + "step": 13500 + }, + { + "epoch": 5.664444907310977, + "grad_norm": 0.2004975100759413, + "learning_rate": 2.3546379277238107e-06, + "loss": 0.179, + "step": 13600 + }, + { + "epoch": 5.706102895230161, + "grad_norm": 0.17154522514366766, + "learning_rate": 2.318355073193474e-06, + "loss": 0.1791, + "step": 13700 + }, + { + "epoch": 5.747760883149343, + "grad_norm": 0.13248550006328844, + "learning_rate": 2.2821106431308546e-06, + "loss": 0.179, + "step": 13800 + }, + { + "epoch": 5.789418871068527, + "grad_norm": 0.1915171020600886, + "learning_rate": 2.2459123045429953e-06, + "loss": 0.1792, + "step": 13900 + }, + { + "epoch": 5.831076858987711, + "grad_norm": 0.16235356856597902, + "learning_rate": 2.2097677146869242e-06, + "loss": 0.1791, + "step": 14000 + }, + { + "epoch": 5.872734846906894, + "grad_norm": 0.1627140490119954, + "learning_rate": 2.173684519449872e-06, + "loss": 0.1789, + "step": 14100 + }, + { + "epoch": 5.914392834826078, + "grad_norm": 0.16466884224746445, + "learning_rate": 2.1376703517318835e-06, + "loss": 0.179, + "step": 14200 + }, + { + "epoch": 5.9560508227452615, + "grad_norm": 0.20611687756993843, + "learning_rate": 2.101732829831194e-06, + "loss": 0.179, + "step": 14300 + }, + { + "epoch": 5.997708810664445, + "grad_norm": 0.16559158144998481, + "learning_rate": 2.0658795558326745e-06, + "loss": 0.179, + "step": 14400 + }, + { + "epoch": 6.0, + "eval_loss": 0.17907121777534485, + "eval_runtime": 196.4273, + "eval_samples_per_second": 1390.326, + "eval_steps_per_second": 2.719, + "step": 14406 + }, + { + "epoch": 6.039158508644032, + "grad_norm": 0.16927649861039284, + "learning_rate": 2.0301181139997206e-06, + "loss": 0.1789, + "step": 14500 + }, + { + "epoch": 6.080816496563216, + "grad_norm": 0.1752142512252337, + "learning_rate": 1.994456069169906e-06, + "loss": 0.179, + "step": 14600 + }, + { + "epoch": 6.1224744844823995, + "grad_norm": 0.21170178196900302, + "learning_rate": 1.958900965154743e-06, + "loss": 0.1789, + "step": 14700 + }, + { + "epoch": 6.164132472401583, + "grad_norm": 0.21884267966966597, + "learning_rate": 1.9234603231439e-06, + "loss": 0.1788, + "step": 14800 + }, + { + "epoch": 6.205790460320767, + "grad_norm": 0.17106948371146288, + "learning_rate": 1.8881416401141905e-06, + "loss": 0.1788, + "step": 14900 + }, + { + "epoch": 6.24744844823995, + "grad_norm": 0.174097273230219, + "learning_rate": 1.852952387243698e-06, + "loss": 0.1788, + "step": 15000 + }, + { + "epoch": 6.289106436159133, + "grad_norm": 0.20862365699110258, + "learning_rate": 1.8179000083313483e-06, + "loss": 0.1788, + "step": 15100 + }, + { + "epoch": 6.330764424078317, + "grad_norm": 0.17885797151549512, + "learning_rate": 1.7829919182222752e-06, + "loss": 0.1788, + "step": 15200 + }, + { + "epoch": 6.3724224119975, + "grad_norm": 0.19498914359958716, + "learning_rate": 1.7482355012393177e-06, + "loss": 0.1789, + "step": 15300 + }, + { + "epoch": 6.414080399916684, + "grad_norm": 0.1389966716220221, + "learning_rate": 1.7136381096209665e-06, + "loss": 0.179, + "step": 15400 + }, + { + "epoch": 6.455738387835868, + "grad_norm": 0.1786092324697337, + "learning_rate": 1.6792070619660977e-06, + "loss": 0.179, + "step": 15500 + }, + { + "epoch": 6.497396375755051, + "grad_norm": 0.19161758807721282, + "learning_rate": 1.6449496416858285e-06, + "loss": 0.1788, + "step": 15600 + }, + { + "epoch": 6.539054363674235, + "grad_norm": 0.19197303954060144, + "learning_rate": 1.6108730954628093e-06, + "loss": 0.1788, + "step": 15700 + }, + { + "epoch": 6.580712351593418, + "grad_norm": 0.16743828588501417, + "learning_rate": 1.5769846317182894e-06, + "loss": 0.1787, + "step": 15800 + }, + { + "epoch": 6.622370339512601, + "grad_norm": 0.16492318029574304, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1788, + "step": 15900 + }, + { + "epoch": 6.664028327431785, + "grad_norm": 0.15440438163304784, + "learning_rate": 1.509800584902108e-06, + "loss": 0.1789, + "step": 16000 + }, + { + "epoch": 6.7056863153509685, + "grad_norm": 0.17667275704806315, + "learning_rate": 1.4765192136847686e-06, + "loss": 0.1789, + "step": 16100 + }, + { + "epoch": 6.747344303270152, + "grad_norm": 0.17904015323124156, + "learning_rate": 1.443454345648252e-06, + "loss": 0.1789, + "step": 16200 + }, + { + "epoch": 6.789002291189336, + "grad_norm": 0.16736730033822061, + "learning_rate": 1.4106129752073023e-06, + "loss": 0.179, + "step": 16300 + }, + { + "epoch": 6.830660279108519, + "grad_norm": 0.16038102753372047, + "learning_rate": 1.3780020494988447e-06, + "loss": 0.179, + "step": 16400 + }, + { + "epoch": 6.872318267027703, + "grad_norm": 0.15315299560909978, + "learning_rate": 1.3456284669124159e-06, + "loss": 0.1786, + "step": 16500 + }, + { + "epoch": 6.913976254946886, + "grad_norm": 0.1430660492396621, + "learning_rate": 1.313499075630899e-06, + "loss": 0.179, + "step": 16600 + }, + { + "epoch": 6.955634242866069, + "grad_norm": 0.17326024703322063, + "learning_rate": 1.2816206721818944e-06, + "loss": 0.1789, + "step": 16700 + }, + { + "epoch": 6.997292230785253, + "grad_norm": 0.14987232796770428, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.1787, + "step": 16800 + }, + { + "epoch": 7.0, + "eval_loss": 0.17893224954605103, + "eval_runtime": 196.4121, + "eval_samples_per_second": 1390.434, + "eval_steps_per_second": 2.719, + "step": 16807 + }, + { + "epoch": 7.038741928764841, + "grad_norm": 0.1439804790666206, + "learning_rate": 1.218643748000337e-06, + "loss": 0.1787, + "step": 16900 + }, + { + "epoch": 7.080399916684025, + "grad_norm": 0.1820620837643405, + "learning_rate": 1.1875585491636e-06, + "loss": 0.1788, + "step": 17000 + }, + { + "epoch": 7.122057904603207, + "grad_norm": 0.1619570282327302, + "learning_rate": 1.1567509791329402e-06, + "loss": 0.1786, + "step": 17100 + }, + { + "epoch": 7.163715892522391, + "grad_norm": 0.2470491812569796, + "learning_rate": 1.1262275548229852e-06, + "loss": 0.1791, + "step": 17200 + }, + { + "epoch": 7.205373880441575, + "grad_norm": 0.18058952670407366, + "learning_rate": 1.0959947330412681e-06, + "loss": 0.1789, + "step": 17300 + }, + { + "epoch": 7.247031868360758, + "grad_norm": 0.20589528394837478, + "learning_rate": 1.0660589091223854e-06, + "loss": 0.1786, + "step": 17400 + }, + { + "epoch": 7.288689856279942, + "grad_norm": 0.13562633767825757, + "learning_rate": 1.0364264155751489e-06, + "loss": 0.1786, + "step": 17500 + }, + { + "epoch": 7.330347844199125, + "grad_norm": 0.194696644563295, + "learning_rate": 1.0071035207430352e-06, + "loss": 0.1787, + "step": 17600 + }, + { + "epoch": 7.372005832118309, + "grad_norm": 0.19213496981753242, + "learning_rate": 9.780964274781984e-07, + "loss": 0.1786, + "step": 17700 + }, + { + "epoch": 7.413663820037492, + "grad_norm": 0.19876379595232896, + "learning_rate": 9.494112718293503e-07, + "loss": 0.1787, + "step": 17800 + }, + { + "epoch": 7.455321807956675, + "grad_norm": 0.1684329683430977, + "learning_rate": 9.210541217437566e-07, + "loss": 0.1787, + "step": 17900 + }, + { + "epoch": 7.496979795875859, + "grad_norm": 0.1823625942631362, + "learning_rate": 8.930309757836517e-07, + "loss": 0.1785, + "step": 18000 + }, + { + "epoch": 7.538637783795043, + "grad_norm": 0.18725762365246973, + "learning_rate": 8.653477618573261e-07, + "loss": 0.1786, + "step": 18100 + }, + { + "epoch": 7.580295771714226, + "grad_norm": 0.1507247392992477, + "learning_rate": 8.380103359651554e-07, + "loss": 0.1787, + "step": 18200 + }, + { + "epoch": 7.62195375963341, + "grad_norm": 0.18505299719524845, + "learning_rate": 8.110244809608494e-07, + "loss": 0.1786, + "step": 18300 + }, + { + "epoch": 7.663611747552594, + "grad_norm": 0.12101506184025812, + "learning_rate": 7.843959053281663e-07, + "loss": 0.1786, + "step": 18400 + }, + { + "epoch": 7.705269735471777, + "grad_norm": 0.16939344528667466, + "learning_rate": 7.581302419733633e-07, + "loss": 0.1785, + "step": 18500 + }, + { + "epoch": 7.74692772339096, + "grad_norm": 0.13840737012325652, + "learning_rate": 7.322330470336314e-07, + "loss": 0.1785, + "step": 18600 + }, + { + "epoch": 7.7885857113101435, + "grad_norm": 0.16859264286478876, + "learning_rate": 7.067097987017762e-07, + "loss": 0.1787, + "step": 18700 + }, + { + "epoch": 7.830243699229327, + "grad_norm": 0.1897535110592711, + "learning_rate": 6.815658960673782e-07, + "loss": 0.1785, + "step": 18800 + }, + { + "epoch": 7.871901687148511, + "grad_norm": 0.18368265058091485, + "learning_rate": 6.568066579746901e-07, + "loss": 0.1785, + "step": 18900 + }, + { + "epoch": 7.913559675067694, + "grad_norm": 0.13696515467419504, + "learning_rate": 6.324373218975105e-07, + "loss": 0.1786, + "step": 19000 + }, + { + "epoch": 7.955217662986878, + "grad_norm": 0.14354515830035847, + "learning_rate": 6.084630428312679e-07, + "loss": 0.1785, + "step": 19100 + }, + { + "epoch": 7.996875650906061, + "grad_norm": 0.15165778139105265, + "learning_rate": 5.848888922025553e-07, + "loss": 0.1786, + "step": 19200 + }, + { + "epoch": 8.0, + "eval_loss": 0.17886345088481903, + "eval_runtime": 196.5554, + "eval_samples_per_second": 1389.42, + "eval_steps_per_second": 2.717, + "step": 19208 + } + ], + "logging_steps": 100, + "max_steps": 24000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4389740174376960.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/saves/chess/generate_strategy/checkpoint-19208/training_args.bin b/saves/chess/generate_strategy/checkpoint-19208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..773ce6240443c97c5f4f17d0c292e93b3f620d6d --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4854ece4d5ef51aa764407c1d839019391947a59f720f2fc5ec761b53b0838 +size 7416 diff --git a/saves/chess/generate_strategy/checkpoint-19208/zero_to_fp32.py b/saves/chess/generate_strategy/checkpoint-19208/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/saves/chess/generate_strategy/checkpoint-19208/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/saves/chess/generate_strategy/config.json b/saves/chess/generate_strategy/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9ce0e7d2a8ad9d74229897630ae54102a0a1a3 --- /dev/null +++ b/saves/chess/generate_strategy/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/saves/chess/generate_strategy/eval_results.json b/saves/chess/generate_strategy/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..63987d71b3a9acabf588029181d2b2656fe81e3a --- /dev/null +++ b/saves/chess/generate_strategy/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 9.996042491147678, + "eval_loss": 0.18023133277893066, + "eval_runtime": 195.9606, + "eval_samples_per_second": 1393.637, + "eval_steps_per_second": 2.725 +} \ No newline at end of file diff --git a/saves/chess/generate_strategy/generation_config.json b/saves/chess/generate_strategy/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb70ec09806f7ce366dd58e8239ad0ca2d5babf1 --- /dev/null +++ b/saves/chess/generate_strategy/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.2" +} diff --git a/saves/chess/generate_strategy/model-00001-of-00004.safetensors b/saves/chess/generate_strategy/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30894e47e82b04774e304f08f0bd864b9d2e3b43 --- /dev/null +++ b/saves/chess/generate_strategy/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8459550a76fcbd4c8a28db32d81f6e98b917d26cc3cb332e80b0bfb11bf4c3a3 +size 4976698672 diff --git a/saves/chess/generate_strategy/model-00002-of-00004.safetensors b/saves/chess/generate_strategy/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1773072bd5a0e16479502c010e5342de2dc492bc --- /dev/null +++ b/saves/chess/generate_strategy/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0bc047981c5b16638d826026d2dd570fe3ce00adca38c9c1008ae351e704b24 +size 4999802720 diff --git a/saves/chess/generate_strategy/model-00003-of-00004.safetensors b/saves/chess/generate_strategy/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22853259bc849375ae6dbaa0726f8559b83ab0c4 --- /dev/null +++ b/saves/chess/generate_strategy/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b0fd5f19a1b0ac8d9d5e9d42a3c5c58ef7e9a5dc625d8ebcb2b35dfb5c8703 +size 4915916176 diff --git a/saves/chess/generate_strategy/model-00004-of-00004.safetensors b/saves/chess/generate_strategy/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94d36e83a83880f2242750556c6ccc437295a629 --- /dev/null +++ b/saves/chess/generate_strategy/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a511dd2b6a97d740fc31080292a2b0f1e9a6da038cd4a663933db96052e5d4d3 +size 1168138808 diff --git a/saves/chess/generate_strategy/model.safetensors.index.json b/saves/chess/generate_strategy/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/saves/chess/generate_strategy/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/saves/chess/generate_strategy/runs/Feb03_11-07-31_g42-h100-instance-114/events.out.tfevents.1738581013.g42-h100-instance-114.1547419.0 b/saves/chess/generate_strategy/runs/Feb03_11-07-31_g42-h100-instance-114/events.out.tfevents.1738581013.g42-h100-instance-114.1547419.0 new file mode 100644 index 0000000000000000000000000000000000000000..61361c54fe89b4dde08df428fe87e5535f667f1f --- /dev/null +++ b/saves/chess/generate_strategy/runs/Feb03_11-07-31_g42-h100-instance-114/events.out.tfevents.1738581013.g42-h100-instance-114.1547419.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6ae8da351ce1e11b8d02e6e1f2234e9cf89cac0a54c039efa7e2d32284fbaf +size 8905 diff --git a/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738589001.g42-h100-instance-114.1591864.0 b/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738589001.g42-h100-instance-114.1591864.0 new file mode 100644 index 0000000000000000000000000000000000000000..90896472d79719b92e189773e1c140146dc86a5d --- /dev/null +++ b/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738589001.g42-h100-instance-114.1591864.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f112e79d3577677fcfe7431e22358b9206e1d391faf713b7551efdba7f9c18f0 +size 59567 diff --git a/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738659945.g42-h100-instance-114.1591864.1 b/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738659945.g42-h100-instance-114.1591864.1 new file mode 100644 index 0000000000000000000000000000000000000000..a39b05c1a4f266eef60b384613a24c70f0796039 --- /dev/null +++ b/saves/chess/generate_strategy/runs/Feb03_13-21-12_g42-h100-instance-114/events.out.tfevents.1738659945.g42-h100-instance-114.1591864.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd6185ec1f2d043fa76a8b66c0243c73dcb9b2f9468705acea4183813f94257 +size 364 diff --git a/saves/chess/generate_strategy/special_tokens_map.json b/saves/chess/generate_strategy/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..df5c3a478b842fa66e6a8c10265478284c1d4f41 --- /dev/null +++ b/saves/chess/generate_strategy/special_tokens_map.json @@ -0,0 +1,33 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/saves/chess/generate_strategy/tokenizer.json b/saves/chess/generate_strategy/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..172311123ab62378f1f6d90f3068a676b7d939ed --- /dev/null +++ b/saves/chess/generate_strategy/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 +size 17210148 diff --git a/saves/chess/generate_strategy/tokenizer_config.json b/saves/chess/generate_strategy/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e2afd45c14285320f15500548266d7adba98d07a --- /dev/null +++ b/saves/chess/generate_strategy/tokenizer_config.json @@ -0,0 +1,2078 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eot_id|>", + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 2048, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/saves/chess/generate_strategy/train_results.json b/saves/chess/generate_strategy/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3baad9d6ca3e851792e2a5fd3c9819b48b864b4b --- /dev/null +++ b/saves/chess/generate_strategy/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 9.996042491147678, + "total_flos": 5485114750402560.0, + "train_loss": 0.19645737719535827, + "train_runtime": 70712.6152, + "train_samples_per_second": 347.587, + "train_steps_per_second": 0.339 +} \ No newline at end of file diff --git a/saves/chess/generate_strategy/trainer_log.jsonl b/saves/chess/generate_strategy/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e366afa486273e2cb7f09228efa7ffb503512adc --- /dev/null +++ b/saves/chess/generate_strategy/trainer_log.jsonl @@ -0,0 +1,251 @@ +{"current_steps": 100, "total_steps": 24000, "loss": 3.9539, "lr": 2.0833333333333333e-07, "epoch": 0.0416579879191835, "percentage": 0.42, "elapsed_time": "0:04:56", "remaining_time": "19:41:07"} +{"current_steps": 200, "total_steps": 24000, "loss": 0.4086, "lr": 4.1666666666666667e-07, "epoch": 0.083315975838367, "percentage": 0.83, "elapsed_time": "0:09:40", "remaining_time": "19:10:49"} +{"current_steps": 300, "total_steps": 24000, "loss": 0.2144, "lr": 6.25e-07, "epoch": 0.12497396375755052, "percentage": 1.25, "elapsed_time": "0:14:24", "remaining_time": "18:57:40"} +{"current_steps": 400, "total_steps": 24000, "loss": 0.2103, "lr": 8.333333333333333e-07, "epoch": 0.166631951676734, "percentage": 1.67, "elapsed_time": "0:19:07", "remaining_time": "18:48:34"} +{"current_steps": 500, "total_steps": 24000, "loss": 0.2075, "lr": 1.0416666666666667e-06, "epoch": 0.20828993959591752, "percentage": 2.08, "elapsed_time": "0:23:51", "remaining_time": "18:40:58"} +{"current_steps": 600, "total_steps": 24000, "loss": 0.2049, "lr": 1.25e-06, "epoch": 0.24994792751510103, "percentage": 2.5, "elapsed_time": "0:28:34", "remaining_time": "18:34:44"} +{"current_steps": 700, "total_steps": 24000, "loss": 0.2001, "lr": 1.4583333333333335e-06, "epoch": 0.29160591543428455, "percentage": 2.92, "elapsed_time": "0:33:19", "remaining_time": "18:29:01"} +{"current_steps": 800, "total_steps": 24000, "loss": 0.1916, "lr": 1.6666666666666667e-06, "epoch": 0.333263903353468, "percentage": 3.33, "elapsed_time": "0:38:02", "remaining_time": "18:23:19"} +{"current_steps": 900, "total_steps": 24000, "loss": 0.1849, "lr": 1.8750000000000003e-06, "epoch": 0.3749218912726515, "percentage": 3.75, "elapsed_time": "0:42:46", "remaining_time": "18:17:55"} +{"current_steps": 1000, "total_steps": 24000, "loss": 0.1847, "lr": 2.0833333333333334e-06, "epoch": 0.41657987919183503, "percentage": 4.17, "elapsed_time": "0:47:29", "remaining_time": "18:12:27"} +{"current_steps": 1100, "total_steps": 24000, "loss": 0.1845, "lr": 2.2916666666666666e-06, "epoch": 0.45823786711101855, "percentage": 4.58, "elapsed_time": "0:52:13", "remaining_time": "18:07:08"} +{"current_steps": 1200, "total_steps": 24000, "loss": 0.1818, "lr": 2.5e-06, "epoch": 0.49989585503020206, "percentage": 5.0, "elapsed_time": "0:56:56", "remaining_time": "18:01:53"} +{"current_steps": 1300, "total_steps": 24000, "loss": 0.1807, "lr": 2.7083333333333334e-06, "epoch": 0.5415538429493856, "percentage": 5.42, "elapsed_time": "1:01:40", "remaining_time": "17:57:00"} +{"current_steps": 1400, "total_steps": 24000, "loss": 0.1806, "lr": 2.916666666666667e-06, "epoch": 0.5832118308685691, "percentage": 5.83, "elapsed_time": "1:06:23", "remaining_time": "17:51:49"} +{"current_steps": 1500, "total_steps": 24000, "loss": 0.1806, "lr": 3.125e-06, "epoch": 0.6248698187877526, "percentage": 6.25, "elapsed_time": "1:11:07", "remaining_time": "17:46:48"} +{"current_steps": 1600, "total_steps": 24000, "loss": 0.1808, "lr": 3.3333333333333333e-06, "epoch": 0.666527806706936, "percentage": 6.67, "elapsed_time": "1:15:50", "remaining_time": "17:41:48"} +{"current_steps": 1700, "total_steps": 24000, "loss": 0.1803, "lr": 3.5416666666666673e-06, "epoch": 0.7081857946261195, "percentage": 7.08, "elapsed_time": "1:20:33", "remaining_time": "17:36:50"} +{"current_steps": 1800, "total_steps": 24000, "loss": 0.1805, "lr": 3.7500000000000005e-06, "epoch": 0.749843782545303, "percentage": 7.5, "elapsed_time": "1:25:17", "remaining_time": "17:31:51"} +{"current_steps": 1900, "total_steps": 24000, "loss": 0.1833, "lr": 3.958333333333333e-06, "epoch": 0.7915017704644866, "percentage": 7.92, "elapsed_time": "1:29:59", "remaining_time": "17:26:48"} +{"current_steps": 2000, "total_steps": 24000, "loss": 0.1807, "lr": 4.166666666666667e-06, "epoch": 0.8331597583836701, "percentage": 8.33, "elapsed_time": "1:34:42", "remaining_time": "17:21:52"} +{"current_steps": 2100, "total_steps": 24000, "loss": 0.1804, "lr": 4.3750000000000005e-06, "epoch": 0.8748177463028536, "percentage": 8.75, "elapsed_time": "1:39:26", "remaining_time": "17:17:03"} +{"current_steps": 2200, "total_steps": 24000, "loss": 0.1805, "lr": 4.583333333333333e-06, "epoch": 0.9164757342220371, "percentage": 9.17, "elapsed_time": "1:44:09", "remaining_time": "17:12:11"} +{"current_steps": 2300, "total_steps": 24000, "loss": 0.1803, "lr": 4.791666666666668e-06, "epoch": 0.9581337221412206, "percentage": 9.58, "elapsed_time": "1:48:53", "remaining_time": "17:07:20"} +{"current_steps": 2400, "total_steps": 24000, "loss": 0.1808, "lr": 5e-06, "epoch": 0.9997917100604041, "percentage": 10.0, "elapsed_time": "1:53:36", "remaining_time": "17:02:29"} +{"current_steps": 2401, "total_steps": 24000, "eval_loss": 0.18106774985790253, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "1:56:54", "remaining_time": "17:31:43"} +{"current_steps": 2500, "total_steps": 24000, "loss": 0.1807, "lr": 4.999735579817769e-06, "epoch": 1.0412414080399917, "percentage": 10.42, "elapsed_time": "2:02:41", "remaining_time": "17:35:08"} +{"current_steps": 2600, "total_steps": 24000, "loss": 0.1801, "lr": 4.998942375205502e-06, "epoch": 1.082899395959175, "percentage": 10.83, "elapsed_time": "2:07:24", "remaining_time": "17:28:40"} +{"current_steps": 2700, "total_steps": 24000, "loss": 0.1801, "lr": 4.997620553954645e-06, "epoch": 1.1245573838783587, "percentage": 11.25, "elapsed_time": "2:12:07", "remaining_time": "17:22:18"} +{"current_steps": 2800, "total_steps": 24000, "loss": 0.1803, "lr": 4.995770395678171e-06, "epoch": 1.166215371797542, "percentage": 11.67, "elapsed_time": "2:16:50", "remaining_time": "17:16:04"} +{"current_steps": 2900, "total_steps": 24000, "loss": 0.1803, "lr": 4.993392291751431e-06, "epoch": 1.2078733597167257, "percentage": 12.08, "elapsed_time": "2:21:33", "remaining_time": "17:09:57"} +{"current_steps": 3000, "total_steps": 24000, "loss": 0.1799, "lr": 4.990486745229364e-06, "epoch": 1.2495313476359091, "percentage": 12.5, "elapsed_time": "2:26:16", "remaining_time": "17:03:57"} +{"current_steps": 3100, "total_steps": 24000, "loss": 0.1798, "lr": 4.9870543707400835e-06, "epoch": 1.2911893355550927, "percentage": 12.92, "elapsed_time": "2:30:59", "remaining_time": "16:58:00"} +{"current_steps": 3200, "total_steps": 24000, "loss": 0.1801, "lr": 4.983095894354858e-06, "epoch": 1.3328473234742761, "percentage": 13.33, "elapsed_time": "2:35:42", "remaining_time": "16:52:07"} +{"current_steps": 3300, "total_steps": 24000, "loss": 0.1801, "lr": 4.978612153434527e-06, "epoch": 1.3745053113934598, "percentage": 13.75, "elapsed_time": "2:40:25", "remaining_time": "16:46:16"} +{"current_steps": 3400, "total_steps": 24000, "loss": 0.1799, "lr": 4.973604096452361e-06, "epoch": 1.4161632993126432, "percentage": 14.17, "elapsed_time": "2:45:07", "remaining_time": "16:40:30"} +{"current_steps": 3500, "total_steps": 24000, "loss": 0.1798, "lr": 4.968072782793436e-06, "epoch": 1.4578212872318268, "percentage": 14.58, "elapsed_time": "2:49:50", "remaining_time": "16:34:48"} +{"current_steps": 3600, "total_steps": 24000, "loss": 0.18, "lr": 4.962019382530521e-06, "epoch": 1.4994792751510102, "percentage": 15.0, "elapsed_time": "2:54:33", "remaining_time": "16:29:10"} +{"current_steps": 3700, "total_steps": 24000, "loss": 0.18, "lr": 4.955445176176577e-06, "epoch": 1.5411372630701936, "percentage": 15.42, "elapsed_time": "2:59:16", "remaining_time": "16:23:35"} +{"current_steps": 3800, "total_steps": 24000, "loss": 0.1993, "lr": 4.948351554413879e-06, "epoch": 1.5827952509893772, "percentage": 15.83, "elapsed_time": "3:03:59", "remaining_time": "16:18:02"} +{"current_steps": 3900, "total_steps": 24000, "loss": 0.1799, "lr": 4.9407400177998335e-06, "epoch": 1.6244532389085609, "percentage": 16.25, "elapsed_time": "3:08:42", "remaining_time": "16:12:32"} +{"current_steps": 4000, "total_steps": 24000, "loss": 0.1796, "lr": 4.93261217644956e-06, "epoch": 1.6661112268277443, "percentage": 16.67, "elapsed_time": "3:13:25", "remaining_time": "16:07:06"} +{"current_steps": 4100, "total_steps": 24000, "loss": 0.1797, "lr": 4.9239697496952904e-06, "epoch": 1.7077692147469277, "percentage": 17.08, "elapsed_time": "3:18:08", "remaining_time": "16:01:41"} +{"current_steps": 4200, "total_steps": 24000, "loss": 0.1797, "lr": 4.914814565722671e-06, "epoch": 1.7494272026661113, "percentage": 17.5, "elapsed_time": "3:22:50", "remaining_time": "15:56:16"} +{"current_steps": 4300, "total_steps": 24000, "loss": 0.1798, "lr": 4.905148561184033e-06, "epoch": 1.7910851905852947, "percentage": 17.92, "elapsed_time": "3:27:33", "remaining_time": "15:50:54"} +{"current_steps": 4400, "total_steps": 24000, "loss": 0.1798, "lr": 4.894973780788722e-06, "epoch": 1.832743178504478, "percentage": 18.33, "elapsed_time": "3:32:16", "remaining_time": "15:45:35"} +{"current_steps": 4500, "total_steps": 24000, "loss": 0.1797, "lr": 4.884292376870567e-06, "epoch": 1.8744011664236617, "percentage": 18.75, "elapsed_time": "3:36:59", "remaining_time": "15:40:18"} +{"current_steps": 4600, "total_steps": 24000, "loss": 0.1796, "lr": 4.873106608932585e-06, "epoch": 1.9160591543428453, "percentage": 19.17, "elapsed_time": "3:41:42", "remaining_time": "15:35:03"} +{"current_steps": 4700, "total_steps": 24000, "loss": 0.1797, "lr": 4.861418843169012e-06, "epoch": 1.9577171422620288, "percentage": 19.58, "elapsed_time": "3:46:26", "remaining_time": "15:29:49"} +{"current_steps": 4800, "total_steps": 24000, "loss": 0.1796, "lr": 4.849231551964771e-06, "epoch": 1.9993751301812122, "percentage": 20.0, "elapsed_time": "3:51:09", "remaining_time": "15:24:37"} +{"current_steps": 4802, "total_steps": 24000, "eval_loss": 0.17972978949546814, "epoch": 2.0, "percentage": 20.01, "elapsed_time": "3:54:30", "remaining_time": "15:37:31"} +{"current_steps": 4900, "total_steps": 24000, "loss": 0.1795, "lr": 4.836547313372472e-06, "epoch": 2.0408248281607997, "percentage": 20.42, "elapsed_time": "4:00:20", "remaining_time": "15:36:48"} +{"current_steps": 5000, "total_steps": 24000, "loss": 0.1794, "lr": 4.823368810567056e-06, "epoch": 2.0824828160799833, "percentage": 20.83, "elapsed_time": "4:05:03", "remaining_time": "15:31:11"} +{"current_steps": 5100, "total_steps": 24000, "loss": 0.1802, "lr": 4.809698831278217e-06, "epoch": 2.124140803999167, "percentage": 21.25, "elapsed_time": "4:09:46", "remaining_time": "15:25:38"} +{"current_steps": 5200, "total_steps": 24000, "loss": 0.18, "lr": 4.7955402672006855e-06, "epoch": 2.16579879191835, "percentage": 21.67, "elapsed_time": "4:14:29", "remaining_time": "15:20:05"} +{"current_steps": 5300, "total_steps": 24000, "loss": 0.1798, "lr": 4.780896113382536e-06, "epoch": 2.2074567798375337, "percentage": 22.08, "elapsed_time": "4:19:12", "remaining_time": "15:14:34"} +{"current_steps": 5400, "total_steps": 24000, "loss": 0.1796, "lr": 4.765769467591626e-06, "epoch": 2.2491147677567174, "percentage": 22.5, "elapsed_time": "4:23:55", "remaining_time": "15:09:05"} +{"current_steps": 5500, "total_steps": 24000, "loss": 0.1799, "lr": 4.750163529660303e-06, "epoch": 2.290772755675901, "percentage": 22.92, "elapsed_time": "4:28:38", "remaining_time": "15:03:38"} +{"current_steps": 5600, "total_steps": 24000, "loss": 0.1796, "lr": 4.734081600808531e-06, "epoch": 2.332430743595084, "percentage": 23.33, "elapsed_time": "4:33:22", "remaining_time": "14:58:12"} +{"current_steps": 5700, "total_steps": 24000, "loss": 0.1797, "lr": 4.717527082945555e-06, "epoch": 2.374088731514268, "percentage": 23.75, "elapsed_time": "4:38:04", "remaining_time": "14:52:47"} +{"current_steps": 5800, "total_steps": 24000, "loss": 0.1797, "lr": 4.700503477950278e-06, "epoch": 2.4157467194334514, "percentage": 24.17, "elapsed_time": "4:42:48", "remaining_time": "14:47:25"} +{"current_steps": 5900, "total_steps": 24000, "loss": 0.1799, "lr": 4.6830143869304904e-06, "epoch": 2.457404707352635, "percentage": 24.58, "elapsed_time": "4:47:31", "remaining_time": "14:42:05"} +{"current_steps": 6000, "total_steps": 24000, "loss": 0.1797, "lr": 4.665063509461098e-06, "epoch": 2.4990626952718182, "percentage": 25.0, "elapsed_time": "4:52:14", "remaining_time": "14:36:44"} +{"current_steps": 6100, "total_steps": 24000, "loss": 0.1794, "lr": 4.646654642801533e-06, "epoch": 2.540720683191002, "percentage": 25.42, "elapsed_time": "4:56:57", "remaining_time": "14:31:25"} +{"current_steps": 6200, "total_steps": 24000, "loss": 0.1794, "lr": 4.627791681092499e-06, "epoch": 2.5823786711101855, "percentage": 25.83, "elapsed_time": "5:01:41", "remaining_time": "14:26:08"} +{"current_steps": 6300, "total_steps": 24000, "loss": 0.1795, "lr": 4.608478614532215e-06, "epoch": 2.624036659029369, "percentage": 26.25, "elapsed_time": "5:06:24", "remaining_time": "14:20:51"} +{"current_steps": 6400, "total_steps": 24000, "loss": 0.1797, "lr": 4.588719528532342e-06, "epoch": 2.6656946469485523, "percentage": 26.67, "elapsed_time": "5:11:07", "remaining_time": "14:15:35"} +{"current_steps": 6500, "total_steps": 24000, "loss": 0.1797, "lr": 4.568518602853776e-06, "epoch": 2.707352634867736, "percentage": 27.08, "elapsed_time": "5:15:51", "remaining_time": "14:10:21"} +{"current_steps": 6600, "total_steps": 24000, "loss": 0.1796, "lr": 4.54788011072248e-06, "epoch": 2.7490106227869195, "percentage": 27.5, "elapsed_time": "5:20:33", "remaining_time": "14:05:07"} +{"current_steps": 6700, "total_steps": 24000, "loss": 0.1796, "lr": 4.526808417925531e-06, "epoch": 2.7906686107061027, "percentage": 27.92, "elapsed_time": "5:25:16", "remaining_time": "13:59:54"} +{"current_steps": 6800, "total_steps": 24000, "loss": 0.1795, "lr": 4.50530798188761e-06, "epoch": 2.8323265986252864, "percentage": 28.33, "elapsed_time": "5:30:00", "remaining_time": "13:54:42"} +{"current_steps": 6900, "total_steps": 24000, "loss": 0.1794, "lr": 4.4833833507280884e-06, "epoch": 2.87398458654447, "percentage": 28.75, "elapsed_time": "5:34:42", "remaining_time": "13:49:30"} +{"current_steps": 7000, "total_steps": 24000, "loss": 0.1793, "lr": 4.46103916229894e-06, "epoch": 2.9156425744636536, "percentage": 29.17, "elapsed_time": "5:39:25", "remaining_time": "13:44:19"} +{"current_steps": 7100, "total_steps": 24000, "loss": 0.1796, "lr": 4.438280143203665e-06, "epoch": 2.957300562382837, "percentage": 29.58, "elapsed_time": "5:44:08", "remaining_time": "13:39:09"} +{"current_steps": 7200, "total_steps": 24000, "loss": 0.1794, "lr": 4.415111107797445e-06, "epoch": 2.9989585503020204, "percentage": 30.0, "elapsed_time": "5:48:51", "remaining_time": "13:34:00"} +{"current_steps": 7203, "total_steps": 24000, "eval_loss": 0.1794959157705307, "epoch": 3.0, "percentage": 30.01, "elapsed_time": "5:52:15", "remaining_time": "13:41:26"} +{"current_steps": 7300, "total_steps": 24000, "loss": 0.1798, "lr": 4.391536957168733e-06, "epoch": 3.040408248281608, "percentage": 30.42, "elapsed_time": "5:58:01", "remaining_time": "13:39:02"} +{"current_steps": 7400, "total_steps": 24000, "loss": 0.1795, "lr": 4.367562678102491e-06, "epoch": 3.0820662362007916, "percentage": 30.83, "elapsed_time": "6:02:44", "remaining_time": "13:33:42"} +{"current_steps": 7500, "total_steps": 24000, "loss": 0.1795, "lr": 4.34319334202531e-06, "epoch": 3.123724224119975, "percentage": 31.25, "elapsed_time": "6:07:27", "remaining_time": "13:28:24"} +{"current_steps": 7600, "total_steps": 24000, "loss": 0.1795, "lr": 4.318434103932622e-06, "epoch": 3.1653822120391584, "percentage": 31.67, "elapsed_time": "6:12:10", "remaining_time": "13:23:06"} +{"current_steps": 7700, "total_steps": 24000, "loss": 0.1796, "lr": 4.293290201298224e-06, "epoch": 3.207040199958342, "percentage": 32.08, "elapsed_time": "6:16:53", "remaining_time": "13:17:49"} +{"current_steps": 7800, "total_steps": 24000, "loss": 0.1793, "lr": 4.267766952966369e-06, "epoch": 3.2486981878775256, "percentage": 32.5, "elapsed_time": "6:21:35", "remaining_time": "13:12:33"} +{"current_steps": 7900, "total_steps": 24000, "loss": 0.1794, "lr": 4.241869758026638e-06, "epoch": 3.290356175796709, "percentage": 32.92, "elapsed_time": "6:26:18", "remaining_time": "13:07:17"} +{"current_steps": 8000, "total_steps": 24000, "loss": 0.1792, "lr": 4.215604094671835e-06, "epoch": 3.3320141637158924, "percentage": 33.33, "elapsed_time": "6:31:01", "remaining_time": "13:02:03"} +{"current_steps": 8100, "total_steps": 24000, "loss": 0.1794, "lr": 4.188975519039151e-06, "epoch": 3.373672151635076, "percentage": 33.75, "elapsed_time": "6:35:44", "remaining_time": "12:56:49"} +{"current_steps": 8200, "total_steps": 24000, "loss": 0.1794, "lr": 4.161989664034844e-06, "epoch": 3.4153301395542597, "percentage": 34.17, "elapsed_time": "6:40:26", "remaining_time": "12:51:35"} +{"current_steps": 8300, "total_steps": 24000, "loss": 0.1794, "lr": 4.134652238142674e-06, "epoch": 3.456988127473443, "percentage": 34.58, "elapsed_time": "6:45:09", "remaining_time": "12:46:23"} +{"current_steps": 8400, "total_steps": 24000, "loss": 0.1794, "lr": 4.106969024216348e-06, "epoch": 3.4986461153926265, "percentage": 35.0, "elapsed_time": "6:49:52", "remaining_time": "12:41:12"} +{"current_steps": 8500, "total_steps": 24000, "loss": 0.1793, "lr": 4.078945878256244e-06, "epoch": 3.54030410331181, "percentage": 35.42, "elapsed_time": "6:54:35", "remaining_time": "12:36:01"} +{"current_steps": 8600, "total_steps": 24000, "loss": 0.1794, "lr": 4.0505887281706505e-06, "epoch": 3.5819620912309933, "percentage": 35.83, "elapsed_time": "6:59:18", "remaining_time": "12:30:51"} +{"current_steps": 8700, "total_steps": 24000, "loss": 0.1794, "lr": 4.021903572521802e-06, "epoch": 3.623620079150177, "percentage": 36.25, "elapsed_time": "7:04:01", "remaining_time": "12:25:41"} +{"current_steps": 8800, "total_steps": 24000, "loss": 0.1793, "lr": 3.992896479256966e-06, "epoch": 3.6652780670693605, "percentage": 36.67, "elapsed_time": "7:08:44", "remaining_time": "12:20:32"} +{"current_steps": 8900, "total_steps": 24000, "loss": 0.1794, "lr": 3.963573584424852e-06, "epoch": 3.706936054988544, "percentage": 37.08, "elapsed_time": "7:13:26", "remaining_time": "12:15:24"} +{"current_steps": 9000, "total_steps": 24000, "loss": 0.1799, "lr": 3.933941090877615e-06, "epoch": 3.748594042907728, "percentage": 37.5, "elapsed_time": "7:18:09", "remaining_time": "12:10:15"} +{"current_steps": 9100, "total_steps": 24000, "loss": 0.1863, "lr": 3.9040052669587325e-06, "epoch": 3.790252030826911, "percentage": 37.92, "elapsed_time": "7:22:51", "remaining_time": "12:05:07"} +{"current_steps": 9200, "total_steps": 24000, "loss": 0.1793, "lr": 3.8737724451770155e-06, "epoch": 3.8319100187460946, "percentage": 38.33, "elapsed_time": "7:27:34", "remaining_time": "12:00:01"} +{"current_steps": 9300, "total_steps": 24000, "loss": 0.1792, "lr": 3.8432490208670605e-06, "epoch": 3.8735680066652782, "percentage": 38.75, "elapsed_time": "7:32:17", "remaining_time": "11:54:54"} +{"current_steps": 9400, "total_steps": 24000, "loss": 0.1792, "lr": 3.8124414508364005e-06, "epoch": 3.9152259945844614, "percentage": 39.17, "elapsed_time": "7:37:00", "remaining_time": "11:49:48"} +{"current_steps": 9500, "total_steps": 24000, "loss": 0.1791, "lr": 3.7813562519996633e-06, "epoch": 3.956883982503645, "percentage": 39.58, "elapsed_time": "7:41:43", "remaining_time": "11:44:44"} +{"current_steps": 9600, "total_steps": 24000, "loss": 0.1792, "lr": 3.7500000000000005e-06, "epoch": 3.9985419704228287, "percentage": 40.0, "elapsed_time": "7:46:26", "remaining_time": "11:39:40"} +{"current_steps": 9604, "total_steps": 24000, "eval_loss": 0.17919312417507172, "epoch": 4.0, "percentage": 40.02, "elapsed_time": "7:49:53", "remaining_time": "11:44:21"} +{"current_steps": 9700, "total_steps": 24000, "loss": 0.1793, "lr": 3.7183793278181063e-06, "epoch": 4.039991668402416, "percentage": 40.42, "elapsed_time": "7:56:02", "remaining_time": "11:41:48"} +{"current_steps": 9800, "total_steps": 24000, "loss": 0.1791, "lr": 3.6865009243691015e-06, "epoch": 4.081649656321599, "percentage": 40.83, "elapsed_time": "8:00:46", "remaining_time": "11:36:37"} +{"current_steps": 9900, "total_steps": 24000, "loss": 0.1792, "lr": 3.654371533087586e-06, "epoch": 4.123307644240783, "percentage": 41.25, "elapsed_time": "8:05:29", "remaining_time": "11:31:27"} +{"current_steps": 10000, "total_steps": 24000, "loss": 0.179, "lr": 3.621997950501156e-06, "epoch": 4.164965632159967, "percentage": 41.67, "elapsed_time": "8:10:12", "remaining_time": "11:26:17"} +{"current_steps": 10100, "total_steps": 24000, "loss": 0.1792, "lr": 3.5893870247926986e-06, "epoch": 4.20662362007915, "percentage": 42.08, "elapsed_time": "8:14:55", "remaining_time": "11:21:07"} +{"current_steps": 10200, "total_steps": 24000, "loss": 0.1791, "lr": 3.556545654351749e-06, "epoch": 4.248281607998334, "percentage": 42.5, "elapsed_time": "8:19:38", "remaining_time": "11:15:59"} +{"current_steps": 10300, "total_steps": 24000, "loss": 0.1793, "lr": 3.5234807863152316e-06, "epoch": 4.2899395959175175, "percentage": 42.92, "elapsed_time": "8:24:21", "remaining_time": "11:10:50"} +{"current_steps": 10400, "total_steps": 24000, "loss": 0.1791, "lr": 3.4901994150978926e-06, "epoch": 4.3315975838367, "percentage": 43.33, "elapsed_time": "8:29:04", "remaining_time": "11:05:43"} +{"current_steps": 10500, "total_steps": 24000, "loss": 0.1791, "lr": 3.4567085809127247e-06, "epoch": 4.373255571755884, "percentage": 43.75, "elapsed_time": "8:33:47", "remaining_time": "11:00:35"} +{"current_steps": 10600, "total_steps": 24000, "loss": 0.1791, "lr": 3.4230153682817112e-06, "epoch": 4.4149135596750675, "percentage": 44.17, "elapsed_time": "8:38:30", "remaining_time": "10:55:28"} +{"current_steps": 10700, "total_steps": 24000, "loss": 0.1791, "lr": 3.389126904537192e-06, "epoch": 4.456571547594251, "percentage": 44.58, "elapsed_time": "8:43:13", "remaining_time": "10:50:22"} +{"current_steps": 10800, "total_steps": 24000, "loss": 0.1791, "lr": 3.3550503583141726e-06, "epoch": 4.498229535513435, "percentage": 45.0, "elapsed_time": "8:47:56", "remaining_time": "10:45:15"} +{"current_steps": 10900, "total_steps": 24000, "loss": 0.1792, "lr": 3.3207929380339034e-06, "epoch": 4.539887523432618, "percentage": 45.42, "elapsed_time": "8:52:39", "remaining_time": "10:40:09"} +{"current_steps": 11000, "total_steps": 24000, "loss": 0.1791, "lr": 3.2863618903790346e-06, "epoch": 4.581545511351802, "percentage": 45.83, "elapsed_time": "8:57:22", "remaining_time": "10:35:04"} +{"current_steps": 11100, "total_steps": 24000, "loss": 0.1792, "lr": 3.2517644987606827e-06, "epoch": 4.623203499270986, "percentage": 46.25, "elapsed_time": "9:02:05", "remaining_time": "10:29:59"} +{"current_steps": 11200, "total_steps": 24000, "loss": 0.1791, "lr": 3.217008081777726e-06, "epoch": 4.664861487190168, "percentage": 46.67, "elapsed_time": "9:06:48", "remaining_time": "10:24:55"} +{"current_steps": 11300, "total_steps": 24000, "loss": 0.1791, "lr": 3.182099991668653e-06, "epoch": 4.706519475109352, "percentage": 47.08, "elapsed_time": "9:11:31", "remaining_time": "10:19:51"} +{"current_steps": 11400, "total_steps": 24000, "loss": 0.1792, "lr": 3.147047612756302e-06, "epoch": 4.748177463028536, "percentage": 47.5, "elapsed_time": "9:16:14", "remaining_time": "10:14:48"} +{"current_steps": 11500, "total_steps": 24000, "loss": 0.179, "lr": 3.1118583598858097e-06, "epoch": 4.789835450947719, "percentage": 47.92, "elapsed_time": "9:20:57", "remaining_time": "10:09:44"} +{"current_steps": 11600, "total_steps": 24000, "loss": 0.179, "lr": 3.0765396768561005e-06, "epoch": 4.831493438866903, "percentage": 48.33, "elapsed_time": "9:25:41", "remaining_time": "10:04:42"} +{"current_steps": 11700, "total_steps": 24000, "loss": 0.1793, "lr": 3.0410990348452572e-06, "epoch": 4.8731514267860865, "percentage": 48.75, "elapsed_time": "9:30:24", "remaining_time": "9:59:39"} +{"current_steps": 11800, "total_steps": 24000, "loss": 0.1791, "lr": 3.0055439308300954e-06, "epoch": 4.91480941470527, "percentage": 49.17, "elapsed_time": "9:35:07", "remaining_time": "9:54:37"} +{"current_steps": 11900, "total_steps": 24000, "loss": 0.1792, "lr": 2.96988188600028e-06, "epoch": 4.956467402624453, "percentage": 49.58, "elapsed_time": "9:39:50", "remaining_time": "9:49:35"} +{"current_steps": 12000, "total_steps": 24000, "loss": 0.1791, "lr": 2.9341204441673267e-06, "epoch": 4.9981253905436365, "percentage": 50.0, "elapsed_time": "9:44:33", "remaining_time": "9:44:33"} +{"current_steps": 12005, "total_steps": 24000, "eval_loss": 0.17911389470100403, "epoch": 5.0, "percentage": 50.02, "elapsed_time": "9:48:03", "remaining_time": "9:47:33"} +{"current_steps": 12100, "total_steps": 24000, "loss": 0.1791, "lr": 2.898267170168807e-06, "epoch": 5.0395750885232244, "percentage": 50.42, "elapsed_time": "9:53:40", "remaining_time": "9:43:52"} +{"current_steps": 12200, "total_steps": 24000, "loss": 0.1789, "lr": 2.862329648268117e-06, "epoch": 5.081233076442408, "percentage": 50.83, "elapsed_time": "9:58:23", "remaining_time": "9:38:46"} +{"current_steps": 12300, "total_steps": 24000, "loss": 0.1792, "lr": 2.82631548055013e-06, "epoch": 5.122891064361592, "percentage": 51.25, "elapsed_time": "10:03:05", "remaining_time": "9:33:40"} +{"current_steps": 12400, "total_steps": 24000, "loss": 0.179, "lr": 2.7902322853130758e-06, "epoch": 5.164549052280774, "percentage": 51.67, "elapsed_time": "10:07:48", "remaining_time": "9:28:35"} +{"current_steps": 12500, "total_steps": 24000, "loss": 0.1791, "lr": 2.754087695457005e-06, "epoch": 5.206207040199958, "percentage": 52.08, "elapsed_time": "10:12:31", "remaining_time": "9:23:31"} +{"current_steps": 12600, "total_steps": 24000, "loss": 0.179, "lr": 2.717889356869146e-06, "epoch": 5.247865028119142, "percentage": 52.5, "elapsed_time": "10:17:14", "remaining_time": "9:18:27"} +{"current_steps": 12700, "total_steps": 24000, "loss": 0.179, "lr": 2.681644926806527e-06, "epoch": 5.289523016038325, "percentage": 52.92, "elapsed_time": "10:21:56", "remaining_time": "9:13:23"} +{"current_steps": 12800, "total_steps": 24000, "loss": 0.179, "lr": 2.6453620722761897e-06, "epoch": 5.331181003957509, "percentage": 53.33, "elapsed_time": "10:26:39", "remaining_time": "9:08:19"} +{"current_steps": 12900, "total_steps": 24000, "loss": 0.1791, "lr": 2.6090484684133406e-06, "epoch": 5.372838991876693, "percentage": 53.75, "elapsed_time": "10:31:22", "remaining_time": "9:03:16"} +{"current_steps": 13000, "total_steps": 24000, "loss": 0.179, "lr": 2.572711796857779e-06, "epoch": 5.414496979795876, "percentage": 54.17, "elapsed_time": "10:36:04", "remaining_time": "8:58:13"} +{"current_steps": 13100, "total_steps": 24000, "loss": 0.179, "lr": 2.5363597441289574e-06, "epoch": 5.45615496771506, "percentage": 54.58, "elapsed_time": "10:40:47", "remaining_time": "8:53:10"} +{"current_steps": 13200, "total_steps": 24000, "loss": 0.179, "lr": 2.5e-06, "epoch": 5.4978129556342425, "percentage": 55.0, "elapsed_time": "10:45:30", "remaining_time": "8:48:08"} +{"current_steps": 13300, "total_steps": 24000, "loss": 0.1791, "lr": 2.4636402558710434e-06, "epoch": 5.539470943553426, "percentage": 55.42, "elapsed_time": "10:50:13", "remaining_time": "8:43:06"} +{"current_steps": 13400, "total_steps": 24000, "loss": 0.179, "lr": 2.4272882031422216e-06, "epoch": 5.58112893147261, "percentage": 55.83, "elapsed_time": "10:54:56", "remaining_time": "8:38:05"} +{"current_steps": 13500, "total_steps": 24000, "loss": 0.1791, "lr": 2.3909515315866606e-06, "epoch": 5.622786919391793, "percentage": 56.25, "elapsed_time": "10:59:39", "remaining_time": "8:33:03"} +{"current_steps": 13600, "total_steps": 24000, "loss": 0.179, "lr": 2.3546379277238107e-06, "epoch": 5.664444907310977, "percentage": 56.67, "elapsed_time": "11:04:21", "remaining_time": "8:28:02"} +{"current_steps": 13700, "total_steps": 24000, "loss": 0.1791, "lr": 2.318355073193474e-06, "epoch": 5.706102895230161, "percentage": 57.08, "elapsed_time": "11:09:04", "remaining_time": "8:23:01"} +{"current_steps": 13800, "total_steps": 24000, "loss": 0.179, "lr": 2.2821106431308546e-06, "epoch": 5.747760883149343, "percentage": 57.5, "elapsed_time": "11:13:47", "remaining_time": "8:18:01"} +{"current_steps": 13900, "total_steps": 24000, "loss": 0.1792, "lr": 2.2459123045429953e-06, "epoch": 5.789418871068527, "percentage": 57.92, "elapsed_time": "11:18:29", "remaining_time": "8:13:00"} +{"current_steps": 14000, "total_steps": 24000, "loss": 0.1791, "lr": 2.2097677146869242e-06, "epoch": 5.831076858987711, "percentage": 58.33, "elapsed_time": "11:23:12", "remaining_time": "8:08:00"} +{"current_steps": 14100, "total_steps": 24000, "loss": 0.1789, "lr": 2.173684519449872e-06, "epoch": 5.872734846906894, "percentage": 58.75, "elapsed_time": "11:27:54", "remaining_time": "8:03:00"} +{"current_steps": 14200, "total_steps": 24000, "loss": 0.179, "lr": 2.1376703517318835e-06, "epoch": 5.914392834826078, "percentage": 59.17, "elapsed_time": "11:32:37", "remaining_time": "7:58:00"} +{"current_steps": 14300, "total_steps": 24000, "loss": 0.179, "lr": 2.101732829831194e-06, "epoch": 5.9560508227452615, "percentage": 59.58, "elapsed_time": "11:37:20", "remaining_time": "7:53:01"} +{"current_steps": 14400, "total_steps": 24000, "loss": 0.179, "lr": 2.0658795558326745e-06, "epoch": 5.997708810664445, "percentage": 60.0, "elapsed_time": "11:42:03", "remaining_time": "7:48:02"} +{"current_steps": 14406, "total_steps": 24000, "eval_loss": 0.17907121777534485, "epoch": 6.0, "percentage": 60.02, "elapsed_time": "11:45:35", "remaining_time": "7:49:54"} +{"current_steps": 14500, "total_steps": 24000, "loss": 0.1789, "lr": 2.0301181139997206e-06, "epoch": 6.039158508644032, "percentage": 60.42, "elapsed_time": "11:51:15", "remaining_time": "7:45:59"} +{"current_steps": 14600, "total_steps": 24000, "loss": 0.179, "lr": 1.994456069169906e-06, "epoch": 6.080816496563216, "percentage": 60.83, "elapsed_time": "11:55:58", "remaining_time": "7:40:58"} +{"current_steps": 14700, "total_steps": 24000, "loss": 0.1789, "lr": 1.958900965154743e-06, "epoch": 6.1224744844823995, "percentage": 61.25, "elapsed_time": "12:00:41", "remaining_time": "7:35:56"} +{"current_steps": 14800, "total_steps": 24000, "loss": 0.1788, "lr": 1.9234603231439e-06, "epoch": 6.164132472401583, "percentage": 61.67, "elapsed_time": "12:05:24", "remaining_time": "7:30:55"} +{"current_steps": 14900, "total_steps": 24000, "loss": 0.1788, "lr": 1.8881416401141905e-06, "epoch": 6.205790460320767, "percentage": 62.08, "elapsed_time": "12:10:08", "remaining_time": "7:25:55"} +{"current_steps": 15000, "total_steps": 24000, "loss": 0.1788, "lr": 1.852952387243698e-06, "epoch": 6.24744844823995, "percentage": 62.5, "elapsed_time": "12:14:51", "remaining_time": "7:20:54"} +{"current_steps": 15100, "total_steps": 24000, "loss": 0.1788, "lr": 1.8179000083313483e-06, "epoch": 6.289106436159133, "percentage": 62.92, "elapsed_time": "12:19:33", "remaining_time": "7:15:54"} +{"current_steps": 15200, "total_steps": 24000, "loss": 0.1788, "lr": 1.7829919182222752e-06, "epoch": 6.330764424078317, "percentage": 63.33, "elapsed_time": "12:24:17", "remaining_time": "7:10:54"} +{"current_steps": 15300, "total_steps": 24000, "loss": 0.1789, "lr": 1.7482355012393177e-06, "epoch": 6.3724224119975, "percentage": 63.75, "elapsed_time": "12:29:00", "remaining_time": "7:05:54"} +{"current_steps": 15400, "total_steps": 24000, "loss": 0.179, "lr": 1.7136381096209665e-06, "epoch": 6.414080399916684, "percentage": 64.17, "elapsed_time": "12:33:43", "remaining_time": "7:00:54"} +{"current_steps": 15500, "total_steps": 24000, "loss": 0.179, "lr": 1.6792070619660977e-06, "epoch": 6.455738387835868, "percentage": 64.58, "elapsed_time": "12:38:26", "remaining_time": "6:55:54"} +{"current_steps": 15600, "total_steps": 24000, "loss": 0.1788, "lr": 1.6449496416858285e-06, "epoch": 6.497396375755051, "percentage": 65.0, "elapsed_time": "12:43:08", "remaining_time": "6:50:55"} +{"current_steps": 15700, "total_steps": 24000, "loss": 0.1788, "lr": 1.6108730954628093e-06, "epoch": 6.539054363674235, "percentage": 65.42, "elapsed_time": "12:47:51", "remaining_time": "6:45:56"} +{"current_steps": 15800, "total_steps": 24000, "loss": 0.1787, "lr": 1.5769846317182894e-06, "epoch": 6.580712351593418, "percentage": 65.83, "elapsed_time": "12:52:34", "remaining_time": "6:40:57"} +{"current_steps": 15900, "total_steps": 24000, "loss": 0.1788, "lr": 1.5432914190872757e-06, "epoch": 6.622370339512601, "percentage": 66.25, "elapsed_time": "12:57:17", "remaining_time": "6:35:58"} +{"current_steps": 16000, "total_steps": 24000, "loss": 0.1789, "lr": 1.509800584902108e-06, "epoch": 6.664028327431785, "percentage": 66.67, "elapsed_time": "13:02:00", "remaining_time": "6:31:00"} +{"current_steps": 16100, "total_steps": 24000, "loss": 0.1789, "lr": 1.4765192136847686e-06, "epoch": 6.7056863153509685, "percentage": 67.08, "elapsed_time": "13:06:43", "remaining_time": "6:26:01"} +{"current_steps": 16200, "total_steps": 24000, "loss": 0.1789, "lr": 1.443454345648252e-06, "epoch": 6.747344303270152, "percentage": 67.5, "elapsed_time": "13:11:26", "remaining_time": "6:21:03"} +{"current_steps": 16300, "total_steps": 24000, "loss": 0.179, "lr": 1.4106129752073023e-06, "epoch": 6.789002291189336, "percentage": 67.92, "elapsed_time": "13:16:09", "remaining_time": "6:16:05"} +{"current_steps": 16400, "total_steps": 24000, "loss": 0.179, "lr": 1.3780020494988447e-06, "epoch": 6.830660279108519, "percentage": 68.33, "elapsed_time": "13:20:52", "remaining_time": "6:11:08"} +{"current_steps": 16500, "total_steps": 24000, "loss": 0.1786, "lr": 1.3456284669124159e-06, "epoch": 6.872318267027703, "percentage": 68.75, "elapsed_time": "13:25:35", "remaining_time": "6:06:10"} +{"current_steps": 16600, "total_steps": 24000, "loss": 0.179, "lr": 1.313499075630899e-06, "epoch": 6.913976254946886, "percentage": 69.17, "elapsed_time": "13:30:18", "remaining_time": "6:01:13"} +{"current_steps": 16700, "total_steps": 24000, "loss": 0.1789, "lr": 1.2816206721818944e-06, "epoch": 6.955634242866069, "percentage": 69.58, "elapsed_time": "13:35:02", "remaining_time": "5:56:16"} +{"current_steps": 16800, "total_steps": 24000, "loss": 0.1787, "lr": 1.2500000000000007e-06, "epoch": 6.997292230785253, "percentage": 70.0, "elapsed_time": "13:39:45", "remaining_time": "5:51:19"} +{"current_steps": 16807, "total_steps": 24000, "eval_loss": 0.17893224954605103, "epoch": 7.0, "percentage": 70.03, "elapsed_time": "13:43:20", "remaining_time": "5:52:22"} +{"current_steps": 16900, "total_steps": 24000, "loss": 0.1787, "lr": 1.218643748000337e-06, "epoch": 7.038741928764841, "percentage": 70.42, "elapsed_time": "13:48:54", "remaining_time": "5:48:14"} +{"current_steps": 17000, "total_steps": 24000, "loss": 0.1788, "lr": 1.1875585491636e-06, "epoch": 7.080399916684025, "percentage": 70.83, "elapsed_time": "13:53:37", "remaining_time": "5:43:15"} +{"current_steps": 17100, "total_steps": 24000, "loss": 0.1786, "lr": 1.1567509791329402e-06, "epoch": 7.122057904603207, "percentage": 71.25, "elapsed_time": "13:58:19", "remaining_time": "5:38:16"} +{"current_steps": 17200, "total_steps": 24000, "loss": 0.1791, "lr": 1.1262275548229852e-06, "epoch": 7.163715892522391, "percentage": 71.67, "elapsed_time": "14:03:02", "remaining_time": "5:33:17"} +{"current_steps": 17300, "total_steps": 24000, "loss": 0.1789, "lr": 1.0959947330412681e-06, "epoch": 7.205373880441575, "percentage": 72.08, "elapsed_time": "14:07:44", "remaining_time": "5:28:19"} +{"current_steps": 17400, "total_steps": 24000, "loss": 0.1786, "lr": 1.0660589091223854e-06, "epoch": 7.247031868360758, "percentage": 72.5, "elapsed_time": "14:12:27", "remaining_time": "5:23:20"} +{"current_steps": 17500, "total_steps": 24000, "loss": 0.1786, "lr": 1.0364264155751489e-06, "epoch": 7.288689856279942, "percentage": 72.92, "elapsed_time": "14:17:10", "remaining_time": "5:18:22"} +{"current_steps": 17600, "total_steps": 24000, "loss": 0.1787, "lr": 1.0071035207430352e-06, "epoch": 7.330347844199125, "percentage": 73.33, "elapsed_time": "14:21:52", "remaining_time": "5:13:24"} +{"current_steps": 17700, "total_steps": 24000, "loss": 0.1786, "lr": 9.780964274781984e-07, "epoch": 7.372005832118309, "percentage": 73.75, "elapsed_time": "14:26:35", "remaining_time": "5:08:26"} +{"current_steps": 17800, "total_steps": 24000, "loss": 0.1787, "lr": 9.494112718293503e-07, "epoch": 7.413663820037492, "percentage": 74.17, "elapsed_time": "14:31:18", "remaining_time": "5:03:29"} +{"current_steps": 17900, "total_steps": 24000, "loss": 0.1787, "lr": 9.210541217437566e-07, "epoch": 7.455321807956675, "percentage": 74.58, "elapsed_time": "14:36:01", "remaining_time": "4:58:31"} +{"current_steps": 18000, "total_steps": 24000, "loss": 0.1785, "lr": 8.930309757836517e-07, "epoch": 7.496979795875859, "percentage": 75.0, "elapsed_time": "14:40:44", "remaining_time": "4:53:34"} +{"current_steps": 18100, "total_steps": 24000, "loss": 0.1786, "lr": 8.653477618573261e-07, "epoch": 7.538637783795043, "percentage": 75.42, "elapsed_time": "14:45:27", "remaining_time": "4:48:37"} +{"current_steps": 18200, "total_steps": 24000, "loss": 0.1787, "lr": 8.380103359651554e-07, "epoch": 7.580295771714226, "percentage": 75.83, "elapsed_time": "14:50:10", "remaining_time": "4:43:40"} +{"current_steps": 18300, "total_steps": 24000, "loss": 0.1786, "lr": 8.110244809608494e-07, "epoch": 7.62195375963341, "percentage": 76.25, "elapsed_time": "14:54:53", "remaining_time": "4:38:44"} +{"current_steps": 18400, "total_steps": 24000, "loss": 0.1786, "lr": 7.843959053281663e-07, "epoch": 7.663611747552594, "percentage": 76.67, "elapsed_time": "14:59:35", "remaining_time": "4:33:47"} +{"current_steps": 18500, "total_steps": 24000, "loss": 0.1785, "lr": 7.581302419733633e-07, "epoch": 7.705269735471777, "percentage": 77.08, "elapsed_time": "15:04:18", "remaining_time": "4:28:50"} +{"current_steps": 18600, "total_steps": 24000, "loss": 0.1785, "lr": 7.322330470336314e-07, "epoch": 7.74692772339096, "percentage": 77.5, "elapsed_time": "15:09:01", "remaining_time": "4:23:54"} +{"current_steps": 18700, "total_steps": 24000, "loss": 0.1787, "lr": 7.067097987017762e-07, "epoch": 7.7885857113101435, "percentage": 77.92, "elapsed_time": "15:13:44", "remaining_time": "4:18:58"} +{"current_steps": 18800, "total_steps": 24000, "loss": 0.1785, "lr": 6.815658960673782e-07, "epoch": 7.830243699229327, "percentage": 78.33, "elapsed_time": "15:18:27", "remaining_time": "4:14:02"} +{"current_steps": 18900, "total_steps": 24000, "loss": 0.1785, "lr": 6.568066579746901e-07, "epoch": 7.871901687148511, "percentage": 78.75, "elapsed_time": "15:23:09", "remaining_time": "4:09:06"} +{"current_steps": 19000, "total_steps": 24000, "loss": 0.1786, "lr": 6.324373218975105e-07, "epoch": 7.913559675067694, "percentage": 79.17, "elapsed_time": "15:27:52", "remaining_time": "4:04:10"} +{"current_steps": 19100, "total_steps": 24000, "loss": 0.1785, "lr": 6.084630428312679e-07, "epoch": 7.955217662986878, "percentage": 79.58, "elapsed_time": "15:32:35", "remaining_time": "3:59:15"} +{"current_steps": 19200, "total_steps": 24000, "loss": 0.1786, "lr": 5.848888922025553e-07, "epoch": 7.996875650906061, "percentage": 80.0, "elapsed_time": "15:37:18", "remaining_time": "3:54:19"} +{"current_steps": 19208, "total_steps": 24000, "eval_loss": 0.17886345088481903, "epoch": 8.0, "percentage": 80.03, "elapsed_time": "15:40:56", "remaining_time": "3:54:44"} +{"current_steps": 19300, "total_steps": 24000, "loss": 0.1783, "lr": 5.617198567963353e-07, "epoch": 8.03832534888565, "percentage": 80.42, "elapsed_time": "15:46:29", "remaining_time": "3:50:29"} +{"current_steps": 19400, "total_steps": 24000, "loss": 0.1783, "lr": 5.389608377010608e-07, "epoch": 8.079983336804831, "percentage": 80.83, "elapsed_time": "15:51:12", "remaining_time": "3:45:32"} +{"current_steps": 19500, "total_steps": 24000, "loss": 0.1783, "lr": 5.166166492719124e-07, "epoch": 8.121641324724015, "percentage": 81.25, "elapsed_time": "15:55:56", "remaining_time": "3:40:36"} +{"current_steps": 19600, "total_steps": 24000, "loss": 0.1782, "lr": 4.946920181123904e-07, "epoch": 8.163299312643199, "percentage": 81.67, "elapsed_time": "16:00:39", "remaining_time": "3:35:39"} +{"current_steps": 19700, "total_steps": 24000, "loss": 0.1782, "lr": 4.7319158207446953e-07, "epoch": 8.204957300562382, "percentage": 82.08, "elapsed_time": "16:05:22", "remaining_time": "3:30:42"} +{"current_steps": 19800, "total_steps": 24000, "loss": 0.1782, "lr": 4.5211988927752026e-07, "epoch": 8.246615288481566, "percentage": 82.5, "elapsed_time": "16:10:05", "remaining_time": "3:25:46"} +{"current_steps": 19900, "total_steps": 24000, "loss": 0.1782, "lr": 4.3148139714622365e-07, "epoch": 8.28827327640075, "percentage": 82.92, "elapsed_time": "16:14:49", "remaining_time": "3:20:50"} +{"current_steps": 20000, "total_steps": 24000, "loss": 0.1781, "lr": 4.1128047146765936e-07, "epoch": 8.329931264319933, "percentage": 83.33, "elapsed_time": "16:19:32", "remaining_time": "3:15:54"} +{"current_steps": 20100, "total_steps": 24000, "loss": 0.1781, "lr": 3.915213854677863e-07, "epoch": 8.371589252239117, "percentage": 83.75, "elapsed_time": "16:24:15", "remaining_time": "3:10:58"} +{"current_steps": 20200, "total_steps": 24000, "loss": 0.1782, "lr": 3.722083189075007e-07, "epoch": 8.4132472401583, "percentage": 84.17, "elapsed_time": "16:28:58", "remaining_time": "3:06:02"} +{"current_steps": 20300, "total_steps": 24000, "loss": 0.1781, "lr": 3.5334535719846767e-07, "epoch": 8.454905228077484, "percentage": 84.58, "elapsed_time": "16:33:41", "remaining_time": "3:01:07"} +{"current_steps": 20400, "total_steps": 24000, "loss": 0.1781, "lr": 3.3493649053890325e-07, "epoch": 8.496563215996668, "percentage": 85.0, "elapsed_time": "16:38:25", "remaining_time": "2:56:11"} +{"current_steps": 20500, "total_steps": 24000, "loss": 0.1782, "lr": 3.1698561306951065e-07, "epoch": 8.538221203915851, "percentage": 85.42, "elapsed_time": "16:43:08", "remaining_time": "2:51:16"} +{"current_steps": 20600, "total_steps": 24000, "loss": 0.178, "lr": 2.9949652204972257e-07, "epoch": 8.579879191835035, "percentage": 85.83, "elapsed_time": "16:47:51", "remaining_time": "2:46:20"} +{"current_steps": 20700, "total_steps": 24000, "loss": 0.1778, "lr": 2.8247291705444575e-07, "epoch": 8.621537179754219, "percentage": 86.25, "elapsed_time": "16:52:34", "remaining_time": "2:41:25"} +{"current_steps": 20800, "total_steps": 24000, "loss": 0.178, "lr": 2.6591839919146963e-07, "epoch": 8.6631951676734, "percentage": 86.67, "elapsed_time": "16:57:17", "remaining_time": "2:36:30"} +{"current_steps": 20900, "total_steps": 24000, "loss": 0.1783, "lr": 2.4983647033969714e-07, "epoch": 8.704853155592584, "percentage": 87.08, "elapsed_time": "17:02:00", "remaining_time": "2:31:35"} +{"current_steps": 21000, "total_steps": 24000, "loss": 0.1781, "lr": 2.3423053240837518e-07, "epoch": 8.746511143511768, "percentage": 87.5, "elapsed_time": "17:06:43", "remaining_time": "2:26:40"} +{"current_steps": 21100, "total_steps": 24000, "loss": 0.1782, "lr": 2.1910388661746495e-07, "epoch": 8.788169131430951, "percentage": 87.92, "elapsed_time": "17:11:27", "remaining_time": "2:21:45"} +{"current_steps": 21200, "total_steps": 24000, "loss": 0.1781, "lr": 2.044597327993153e-07, "epoch": 8.829827119350135, "percentage": 88.33, "elapsed_time": "17:16:10", "remaining_time": "2:16:51"} +{"current_steps": 21300, "total_steps": 24000, "loss": 0.1781, "lr": 1.9030116872178317e-07, "epoch": 8.871485107269319, "percentage": 88.75, "elapsed_time": "17:20:53", "remaining_time": "2:11:56"} +{"current_steps": 21400, "total_steps": 24000, "loss": 0.1781, "lr": 1.7663118943294367e-07, "epoch": 8.913143095188502, "percentage": 89.17, "elapsed_time": "17:25:36", "remaining_time": "2:07:02"} +{"current_steps": 21500, "total_steps": 24000, "loss": 0.1781, "lr": 1.6345268662752904e-07, "epoch": 8.954801083107686, "percentage": 89.58, "elapsed_time": "17:30:20", "remaining_time": "2:02:07"} +{"current_steps": 21600, "total_steps": 24000, "loss": 0.1781, "lr": 1.507684480352292e-07, "epoch": 8.99645907102687, "percentage": 90.0, "elapsed_time": "17:35:03", "remaining_time": "1:57:13"} +{"current_steps": 21609, "total_steps": 24000, "eval_loss": 0.17907947301864624, "epoch": 9.0, "percentage": 90.04, "elapsed_time": "17:38:43", "remaining_time": "1:57:08"} +{"current_steps": 21700, "total_steps": 24000, "loss": 0.177, "lr": 1.3858115683098832e-07, "epoch": 9.037908769006457, "percentage": 90.42, "elapsed_time": "17:44:07", "remaining_time": "1:52:47"} +{"current_steps": 21800, "total_steps": 24000, "loss": 0.1767, "lr": 1.2689339106741529e-07, "epoch": 9.07956675692564, "percentage": 90.83, "elapsed_time": "17:48:50", "remaining_time": "1:47:51"} +{"current_steps": 21900, "total_steps": 24000, "loss": 0.1768, "lr": 1.1570762312943295e-07, "epoch": 9.121224744844824, "percentage": 91.25, "elapsed_time": "17:53:33", "remaining_time": "1:42:56"} +{"current_steps": 22000, "total_steps": 24000, "loss": 0.1767, "lr": 1.0502621921127776e-07, "epoch": 9.162882732764007, "percentage": 91.67, "elapsed_time": "17:58:16", "remaining_time": "1:38:01"} +{"current_steps": 22100, "total_steps": 24000, "loss": 0.1768, "lr": 9.485143881596715e-08, "epoch": 9.204540720683191, "percentage": 92.08, "elapsed_time": "18:02:59", "remaining_time": "1:33:06"} +{"current_steps": 22200, "total_steps": 24000, "loss": 0.1767, "lr": 8.518543427732951e-08, "epoch": 9.246198708602375, "percentage": 92.5, "elapsed_time": "18:07:42", "remaining_time": "1:28:11"} +{"current_steps": 22300, "total_steps": 24000, "loss": 0.1766, "lr": 7.603025030471001e-08, "epoch": 9.287856696521558, "percentage": 92.92, "elapsed_time": "18:12:25", "remaining_time": "1:23:16"} +{"current_steps": 22400, "total_steps": 24000, "loss": 0.1769, "lr": 6.738782355044048e-08, "epoch": 9.329514684440742, "percentage": 93.33, "elapsed_time": "18:17:07", "remaining_time": "1:18:21"} +{"current_steps": 22500, "total_steps": 24000, "loss": 0.1767, "lr": 5.92599822001666e-08, "epoch": 9.371172672359926, "percentage": 93.75, "elapsed_time": "18:21:50", "remaining_time": "1:13:27"} +{"current_steps": 22600, "total_steps": 24000, "loss": 0.1766, "lr": 5.164844558612131e-08, "epoch": 9.41283066027911, "percentage": 94.17, "elapsed_time": "18:26:33", "remaining_time": "1:08:32"} +{"current_steps": 22700, "total_steps": 24000, "loss": 0.1767, "lr": 4.455482382342336e-08, "epoch": 9.454488648198293, "percentage": 94.58, "elapsed_time": "18:31:16", "remaining_time": "1:03:38"} +{"current_steps": 22800, "total_steps": 24000, "loss": 0.1767, "lr": 3.798061746947995e-08, "epoch": 9.496146636117475, "percentage": 95.0, "elapsed_time": "18:35:59", "remaining_time": "0:58:44"} +{"current_steps": 22900, "total_steps": 24000, "loss": 0.1767, "lr": 3.1927217206564884e-08, "epoch": 9.537804624036658, "percentage": 95.42, "elapsed_time": "18:40:41", "remaining_time": "0:53:49"} +{"current_steps": 23000, "total_steps": 24000, "loss": 0.1765, "lr": 2.6395903547638825e-08, "epoch": 9.579462611955842, "percentage": 95.83, "elapsed_time": "18:45:24", "remaining_time": "0:48:55"} +{"current_steps": 23100, "total_steps": 24000, "loss": 0.1765, "lr": 2.1387846565474047e-08, "epoch": 9.621120599875026, "percentage": 96.25, "elapsed_time": "18:50:07", "remaining_time": "0:44:01"} +{"current_steps": 23200, "total_steps": 24000, "loss": 0.1765, "lr": 1.6904105645142443e-08, "epoch": 9.66277858779421, "percentage": 96.67, "elapsed_time": "18:54:50", "remaining_time": "0:39:07"} +{"current_steps": 23300, "total_steps": 24000, "loss": 0.1766, "lr": 1.2945629259917547e-08, "epoch": 9.704436575713393, "percentage": 97.08, "elapsed_time": "18:59:33", "remaining_time": "0:34:14"} +{"current_steps": 23400, "total_steps": 24000, "loss": 0.1767, "lr": 9.513254770636138e-09, "epoch": 9.746094563632576, "percentage": 97.5, "elapsed_time": "19:04:16", "remaining_time": "0:29:20"} +{"current_steps": 23500, "total_steps": 24000, "loss": 0.1766, "lr": 6.607708248569378e-09, "epoch": 9.78775255155176, "percentage": 97.92, "elapsed_time": "19:08:59", "remaining_time": "0:24:26"} +{"current_steps": 23600, "total_steps": 24000, "loss": 0.1766, "lr": 4.229604321829561e-09, "epoch": 9.829410539470944, "percentage": 98.33, "elapsed_time": "19:13:42", "remaining_time": "0:19:33"} +{"current_steps": 23700, "total_steps": 24000, "loss": 0.1766, "lr": 2.3794460453555046e-09, "epoch": 9.871068527390127, "percentage": 98.75, "elapsed_time": "19:18:25", "remaining_time": "0:14:39"} +{"current_steps": 23800, "total_steps": 24000, "loss": 0.1767, "lr": 1.0576247944985018e-09, "epoch": 9.912726515309311, "percentage": 99.17, "elapsed_time": "19:23:08", "remaining_time": "0:09:46"} +{"current_steps": 23900, "total_steps": 24000, "loss": 0.1766, "lr": 2.6442018223132857e-10, "epoch": 9.954384503228495, "percentage": 99.58, "elapsed_time": "19:27:51", "remaining_time": "0:04:53"} +{"current_steps": 24000, "total_steps": 24000, "loss": 0.1766, "lr": 0.0, "epoch": 9.996042491147678, "percentage": 100.0, "elapsed_time": "19:32:34", "remaining_time": "0:00:00"} +{"current_steps": 24000, "total_steps": 24000, "eval_loss": 0.18023133277893066, "epoch": 9.996042491147678, "percentage": 100.0, "elapsed_time": "19:37:07", "remaining_time": "0:00:00"} +{"current_steps": 24000, "total_steps": 24000, "epoch": 9.996042491147678, "percentage": 100.0, "elapsed_time": "19:38:18", "remaining_time": "0:00:00"} diff --git a/saves/chess/generate_strategy/trainer_state.json b/saves/chess/generate_strategy/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6d2c1f0d44c4207844d8a5630440b268102a1276 --- /dev/null +++ b/saves/chess/generate_strategy/trainer_state.json @@ -0,0 +1,1802 @@ +{ + "best_metric": 0.17886345088481903, + "best_model_checkpoint": "saves/chess/generate_strategy/checkpoint-19208", + "epoch": 9.996042491147678, + "eval_steps": 500, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0416579879191835, + "grad_norm": 8.262849587594042, + "learning_rate": 2.0833333333333333e-07, + "loss": 3.9539, + "step": 100 + }, + { + "epoch": 0.083315975838367, + "grad_norm": 2.1815007336055197, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.4086, + "step": 200 + }, + { + "epoch": 0.12497396375755052, + "grad_norm": 1.094766614987478, + "learning_rate": 6.25e-07, + "loss": 0.2144, + "step": 300 + }, + { + "epoch": 0.166631951676734, + "grad_norm": 1.015902700288932, + "learning_rate": 8.333333333333333e-07, + "loss": 0.2103, + "step": 400 + }, + { + "epoch": 0.20828993959591752, + "grad_norm": 1.083927107302103, + "learning_rate": 1.0416666666666667e-06, + "loss": 0.2075, + "step": 500 + }, + { + "epoch": 0.24994792751510103, + "grad_norm": 0.8787980351861964, + "learning_rate": 1.25e-06, + "loss": 0.2049, + "step": 600 + }, + { + "epoch": 0.29160591543428455, + "grad_norm": 0.5454433660253264, + "learning_rate": 1.4583333333333335e-06, + "loss": 0.2001, + "step": 700 + }, + { + "epoch": 0.333263903353468, + "grad_norm": 0.6745519185509095, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1916, + "step": 800 + }, + { + "epoch": 0.3749218912726515, + "grad_norm": 0.3263511819812891, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.1849, + "step": 900 + }, + { + "epoch": 0.41657987919183503, + "grad_norm": 3.199309878765134, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.1847, + "step": 1000 + }, + { + "epoch": 0.45823786711101855, + "grad_norm": 0.4060106618321982, + "learning_rate": 2.2916666666666666e-06, + "loss": 0.1845, + "step": 1100 + }, + { + "epoch": 0.49989585503020206, + "grad_norm": 0.36591848729629267, + "learning_rate": 2.5e-06, + "loss": 0.1818, + "step": 1200 + }, + { + "epoch": 0.5415538429493856, + "grad_norm": 0.35361804320631923, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.1807, + "step": 1300 + }, + { + "epoch": 0.5832118308685691, + "grad_norm": 0.35892337648275896, + "learning_rate": 2.916666666666667e-06, + "loss": 0.1806, + "step": 1400 + }, + { + "epoch": 0.6248698187877526, + "grad_norm": 0.2820867931414937, + "learning_rate": 3.125e-06, + "loss": 0.1806, + "step": 1500 + }, + { + "epoch": 0.666527806706936, + "grad_norm": 0.3098924570604735, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1808, + "step": 1600 + }, + { + "epoch": 0.7081857946261195, + "grad_norm": 0.29714949257038253, + "learning_rate": 3.5416666666666673e-06, + "loss": 0.1803, + "step": 1700 + }, + { + "epoch": 0.749843782545303, + "grad_norm": 0.302226244442205, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1805, + "step": 1800 + }, + { + "epoch": 0.7915017704644866, + "grad_norm": 0.3329180855942572, + "learning_rate": 3.958333333333333e-06, + "loss": 0.1833, + "step": 1900 + }, + { + "epoch": 0.8331597583836701, + "grad_norm": 0.28770265809452183, + "learning_rate": 4.166666666666667e-06, + "loss": 0.1807, + "step": 2000 + }, + { + "epoch": 0.8748177463028536, + "grad_norm": 0.3308819875323557, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.1804, + "step": 2100 + }, + { + "epoch": 0.9164757342220371, + "grad_norm": 0.3163212399640271, + "learning_rate": 4.583333333333333e-06, + "loss": 0.1805, + "step": 2200 + }, + { + "epoch": 0.9581337221412206, + "grad_norm": 0.3898310274135571, + "learning_rate": 4.791666666666668e-06, + "loss": 0.1803, + "step": 2300 + }, + { + "epoch": 0.9997917100604041, + "grad_norm": 0.27784332983216586, + "learning_rate": 5e-06, + "loss": 0.1808, + "step": 2400 + }, + { + "epoch": 1.0, + "eval_loss": 0.18106774985790253, + "eval_runtime": 196.6682, + "eval_samples_per_second": 1388.623, + "eval_steps_per_second": 2.715, + "step": 2401 + }, + { + "epoch": 1.0412414080399917, + "grad_norm": 0.2936543487056633, + "learning_rate": 4.999735579817769e-06, + "loss": 0.1807, + "step": 2500 + }, + { + "epoch": 1.082899395959175, + "grad_norm": 0.2809875255295402, + "learning_rate": 4.998942375205502e-06, + "loss": 0.1801, + "step": 2600 + }, + { + "epoch": 1.1245573838783587, + "grad_norm": 0.2760622198201079, + "learning_rate": 4.997620553954645e-06, + "loss": 0.1801, + "step": 2700 + }, + { + "epoch": 1.166215371797542, + "grad_norm": 0.2710350326429577, + "learning_rate": 4.995770395678171e-06, + "loss": 0.1803, + "step": 2800 + }, + { + "epoch": 1.2078733597167257, + "grad_norm": 0.20931696168572392, + "learning_rate": 4.993392291751431e-06, + "loss": 0.1803, + "step": 2900 + }, + { + "epoch": 1.2495313476359091, + "grad_norm": 0.24323887106839603, + "learning_rate": 4.990486745229364e-06, + "loss": 0.1799, + "step": 3000 + }, + { + "epoch": 1.2911893355550927, + "grad_norm": 0.2815796357302052, + "learning_rate": 4.9870543707400835e-06, + "loss": 0.1798, + "step": 3100 + }, + { + "epoch": 1.3328473234742761, + "grad_norm": 0.23664820561946712, + "learning_rate": 4.983095894354858e-06, + "loss": 0.1801, + "step": 3200 + }, + { + "epoch": 1.3745053113934598, + "grad_norm": 0.3083911955290968, + "learning_rate": 4.978612153434527e-06, + "loss": 0.1801, + "step": 3300 + }, + { + "epoch": 1.4161632993126432, + "grad_norm": 0.24337206279187154, + "learning_rate": 4.973604096452361e-06, + "loss": 0.1799, + "step": 3400 + }, + { + "epoch": 1.4578212872318268, + "grad_norm": 0.2691338598173961, + "learning_rate": 4.968072782793436e-06, + "loss": 0.1798, + "step": 3500 + }, + { + "epoch": 1.4994792751510102, + "grad_norm": 0.1859964729302664, + "learning_rate": 4.962019382530521e-06, + "loss": 0.18, + "step": 3600 + }, + { + "epoch": 1.5411372630701936, + "grad_norm": 0.29588302582709847, + "learning_rate": 4.955445176176577e-06, + "loss": 0.18, + "step": 3700 + }, + { + "epoch": 1.5827952509893772, + "grad_norm": 0.24224751463035848, + "learning_rate": 4.948351554413879e-06, + "loss": 0.1993, + "step": 3800 + }, + { + "epoch": 1.6244532389085609, + "grad_norm": 0.24926986804364754, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.1799, + "step": 3900 + }, + { + "epoch": 1.6661112268277443, + "grad_norm": 0.26907499271712193, + "learning_rate": 4.93261217644956e-06, + "loss": 0.1796, + "step": 4000 + }, + { + "epoch": 1.7077692147469277, + "grad_norm": 0.24652167596434857, + "learning_rate": 4.9239697496952904e-06, + "loss": 0.1797, + "step": 4100 + }, + { + "epoch": 1.7494272026661113, + "grad_norm": 0.26360641338937, + "learning_rate": 4.914814565722671e-06, + "loss": 0.1797, + "step": 4200 + }, + { + "epoch": 1.7910851905852947, + "grad_norm": 0.21211424396568565, + "learning_rate": 4.905148561184033e-06, + "loss": 0.1798, + "step": 4300 + }, + { + "epoch": 1.832743178504478, + "grad_norm": 0.23174306094818595, + "learning_rate": 4.894973780788722e-06, + "loss": 0.1798, + "step": 4400 + }, + { + "epoch": 1.8744011664236617, + "grad_norm": 0.20239856810705756, + "learning_rate": 4.884292376870567e-06, + "loss": 0.1797, + "step": 4500 + }, + { + "epoch": 1.9160591543428453, + "grad_norm": 0.20895880362963307, + "learning_rate": 4.873106608932585e-06, + "loss": 0.1796, + "step": 4600 + }, + { + "epoch": 1.9577171422620288, + "grad_norm": 0.2341875351736524, + "learning_rate": 4.861418843169012e-06, + "loss": 0.1797, + "step": 4700 + }, + { + "epoch": 1.9993751301812122, + "grad_norm": 0.20045835157915606, + "learning_rate": 4.849231551964771e-06, + "loss": 0.1796, + "step": 4800 + }, + { + "epoch": 2.0, + "eval_loss": 0.17972978949546814, + "eval_runtime": 196.3636, + "eval_samples_per_second": 1390.777, + "eval_steps_per_second": 2.719, + "step": 4802 + }, + { + "epoch": 2.0408248281607997, + "grad_norm": 0.21309941078379252, + "learning_rate": 4.836547313372472e-06, + "loss": 0.1795, + "step": 4900 + }, + { + "epoch": 2.0824828160799833, + "grad_norm": 0.19717578427183138, + "learning_rate": 4.823368810567056e-06, + "loss": 0.1794, + "step": 5000 + }, + { + "epoch": 2.124140803999167, + "grad_norm": 0.23023011075724995, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1802, + "step": 5100 + }, + { + "epoch": 2.16579879191835, + "grad_norm": 0.21578484379978355, + "learning_rate": 4.7955402672006855e-06, + "loss": 0.18, + "step": 5200 + }, + { + "epoch": 2.2074567798375337, + "grad_norm": 0.21410225528440446, + "learning_rate": 4.780896113382536e-06, + "loss": 0.1798, + "step": 5300 + }, + { + "epoch": 2.2491147677567174, + "grad_norm": 0.24923656549560563, + "learning_rate": 4.765769467591626e-06, + "loss": 0.1796, + "step": 5400 + }, + { + "epoch": 2.290772755675901, + "grad_norm": 0.27043973727195314, + "learning_rate": 4.750163529660303e-06, + "loss": 0.1799, + "step": 5500 + }, + { + "epoch": 2.332430743595084, + "grad_norm": 0.20084508849747548, + "learning_rate": 4.734081600808531e-06, + "loss": 0.1796, + "step": 5600 + }, + { + "epoch": 2.374088731514268, + "grad_norm": 0.17037675166345598, + "learning_rate": 4.717527082945555e-06, + "loss": 0.1797, + "step": 5700 + }, + { + "epoch": 2.4157467194334514, + "grad_norm": 0.20792174660657012, + "learning_rate": 4.700503477950278e-06, + "loss": 0.1797, + "step": 5800 + }, + { + "epoch": 2.457404707352635, + "grad_norm": 0.20444912332175158, + "learning_rate": 4.6830143869304904e-06, + "loss": 0.1799, + "step": 5900 + }, + { + "epoch": 2.4990626952718182, + "grad_norm": 0.2160441899332462, + "learning_rate": 4.665063509461098e-06, + "loss": 0.1797, + "step": 6000 + }, + { + "epoch": 2.540720683191002, + "grad_norm": 0.25556787549882387, + "learning_rate": 4.646654642801533e-06, + "loss": 0.1794, + "step": 6100 + }, + { + "epoch": 2.5823786711101855, + "grad_norm": 0.22198410769602075, + "learning_rate": 4.627791681092499e-06, + "loss": 0.1794, + "step": 6200 + }, + { + "epoch": 2.624036659029369, + "grad_norm": 0.19549701905963526, + "learning_rate": 4.608478614532215e-06, + "loss": 0.1795, + "step": 6300 + }, + { + "epoch": 2.6656946469485523, + "grad_norm": 0.24454736703986502, + "learning_rate": 4.588719528532342e-06, + "loss": 0.1797, + "step": 6400 + }, + { + "epoch": 2.707352634867736, + "grad_norm": 0.20111965276500102, + "learning_rate": 4.568518602853776e-06, + "loss": 0.1797, + "step": 6500 + }, + { + "epoch": 2.7490106227869195, + "grad_norm": 0.2155615827433472, + "learning_rate": 4.54788011072248e-06, + "loss": 0.1796, + "step": 6600 + }, + { + "epoch": 2.7906686107061027, + "grad_norm": 0.23518049751986453, + "learning_rate": 4.526808417925531e-06, + "loss": 0.1796, + "step": 6700 + }, + { + "epoch": 2.8323265986252864, + "grad_norm": 0.2088881277827675, + "learning_rate": 4.50530798188761e-06, + "loss": 0.1795, + "step": 6800 + }, + { + "epoch": 2.87398458654447, + "grad_norm": 0.22027451607755855, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.1794, + "step": 6900 + }, + { + "epoch": 2.9156425744636536, + "grad_norm": 0.20366425013850817, + "learning_rate": 4.46103916229894e-06, + "loss": 0.1793, + "step": 7000 + }, + { + "epoch": 2.957300562382837, + "grad_norm": 0.2718663681076218, + "learning_rate": 4.438280143203665e-06, + "loss": 0.1796, + "step": 7100 + }, + { + "epoch": 2.9989585503020204, + "grad_norm": 0.19182709064421555, + "learning_rate": 4.415111107797445e-06, + "loss": 0.1794, + "step": 7200 + }, + { + "epoch": 3.0, + "eval_loss": 0.1794959157705307, + "eval_runtime": 196.4289, + "eval_samples_per_second": 1390.315, + "eval_steps_per_second": 2.719, + "step": 7203 + }, + { + "epoch": 3.040408248281608, + "grad_norm": 0.195058367609666, + "learning_rate": 4.391536957168733e-06, + "loss": 0.1798, + "step": 7300 + }, + { + "epoch": 3.0820662362007916, + "grad_norm": 0.2256357073328012, + "learning_rate": 4.367562678102491e-06, + "loss": 0.1795, + "step": 7400 + }, + { + "epoch": 3.123724224119975, + "grad_norm": 0.2129481809880029, + "learning_rate": 4.34319334202531e-06, + "loss": 0.1795, + "step": 7500 + }, + { + "epoch": 3.1653822120391584, + "grad_norm": 0.1689665633552094, + "learning_rate": 4.318434103932622e-06, + "loss": 0.1795, + "step": 7600 + }, + { + "epoch": 3.207040199958342, + "grad_norm": 0.18434140023135, + "learning_rate": 4.293290201298224e-06, + "loss": 0.1796, + "step": 7700 + }, + { + "epoch": 3.2486981878775256, + "grad_norm": 0.2103528683280332, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1793, + "step": 7800 + }, + { + "epoch": 3.290356175796709, + "grad_norm": 0.16087446181904855, + "learning_rate": 4.241869758026638e-06, + "loss": 0.1794, + "step": 7900 + }, + { + "epoch": 3.3320141637158924, + "grad_norm": 0.22569144057534085, + "learning_rate": 4.215604094671835e-06, + "loss": 0.1792, + "step": 8000 + }, + { + "epoch": 3.373672151635076, + "grad_norm": 0.19990473196998446, + "learning_rate": 4.188975519039151e-06, + "loss": 0.1794, + "step": 8100 + }, + { + "epoch": 3.4153301395542597, + "grad_norm": 0.1902243355455867, + "learning_rate": 4.161989664034844e-06, + "loss": 0.1794, + "step": 8200 + }, + { + "epoch": 3.456988127473443, + "grad_norm": 0.18824118604006632, + "learning_rate": 4.134652238142674e-06, + "loss": 0.1794, + "step": 8300 + }, + { + "epoch": 3.4986461153926265, + "grad_norm": 0.19597204875441573, + "learning_rate": 4.106969024216348e-06, + "loss": 0.1794, + "step": 8400 + }, + { + "epoch": 3.54030410331181, + "grad_norm": 0.17674897479656335, + "learning_rate": 4.078945878256244e-06, + "loss": 0.1793, + "step": 8500 + }, + { + "epoch": 3.5819620912309933, + "grad_norm": 0.19658906636767987, + "learning_rate": 4.0505887281706505e-06, + "loss": 0.1794, + "step": 8600 + }, + { + "epoch": 3.623620079150177, + "grad_norm": 0.1607909455989355, + "learning_rate": 4.021903572521802e-06, + "loss": 0.1794, + "step": 8700 + }, + { + "epoch": 3.6652780670693605, + "grad_norm": 0.18982136425367155, + "learning_rate": 3.992896479256966e-06, + "loss": 0.1793, + "step": 8800 + }, + { + "epoch": 3.706936054988544, + "grad_norm": 0.18212426964310202, + "learning_rate": 3.963573584424852e-06, + "loss": 0.1794, + "step": 8900 + }, + { + "epoch": 3.748594042907728, + "grad_norm": 0.18731109638030716, + "learning_rate": 3.933941090877615e-06, + "loss": 0.1799, + "step": 9000 + }, + { + "epoch": 3.790252030826911, + "grad_norm": 0.2243920924541318, + "learning_rate": 3.9040052669587325e-06, + "loss": 0.1863, + "step": 9100 + }, + { + "epoch": 3.8319100187460946, + "grad_norm": 0.19665494095424324, + "learning_rate": 3.8737724451770155e-06, + "loss": 0.1793, + "step": 9200 + }, + { + "epoch": 3.8735680066652782, + "grad_norm": 0.1709097835399287, + "learning_rate": 3.8432490208670605e-06, + "loss": 0.1792, + "step": 9300 + }, + { + "epoch": 3.9152259945844614, + "grad_norm": 0.1519558310026607, + "learning_rate": 3.8124414508364005e-06, + "loss": 0.1792, + "step": 9400 + }, + { + "epoch": 3.956883982503645, + "grad_norm": 0.18615584510557248, + "learning_rate": 3.7813562519996633e-06, + "loss": 0.1791, + "step": 9500 + }, + { + "epoch": 3.9985419704228287, + "grad_norm": 0.14216906700933155, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1792, + "step": 9600 + }, + { + "epoch": 4.0, + "eval_loss": 0.17919312417507172, + "eval_runtime": 196.5199, + "eval_samples_per_second": 1389.671, + "eval_steps_per_second": 2.717, + "step": 9604 + }, + { + "epoch": 4.039991668402416, + "grad_norm": 0.1981373334933009, + "learning_rate": 3.7183793278181063e-06, + "loss": 0.1793, + "step": 9700 + }, + { + "epoch": 4.081649656321599, + "grad_norm": 0.1796707844873524, + "learning_rate": 3.6865009243691015e-06, + "loss": 0.1791, + "step": 9800 + }, + { + "epoch": 4.123307644240783, + "grad_norm": 0.21582792834146144, + "learning_rate": 3.654371533087586e-06, + "loss": 0.1792, + "step": 9900 + }, + { + "epoch": 4.164965632159967, + "grad_norm": 0.22285894509633086, + "learning_rate": 3.621997950501156e-06, + "loss": 0.179, + "step": 10000 + }, + { + "epoch": 4.20662362007915, + "grad_norm": 0.1947839176316504, + "learning_rate": 3.5893870247926986e-06, + "loss": 0.1792, + "step": 10100 + }, + { + "epoch": 4.248281607998334, + "grad_norm": 0.18044045004936568, + "learning_rate": 3.556545654351749e-06, + "loss": 0.1791, + "step": 10200 + }, + { + "epoch": 4.2899395959175175, + "grad_norm": 0.21629122720481903, + "learning_rate": 3.5234807863152316e-06, + "loss": 0.1793, + "step": 10300 + }, + { + "epoch": 4.3315975838367, + "grad_norm": 0.15404290423986947, + "learning_rate": 3.4901994150978926e-06, + "loss": 0.1791, + "step": 10400 + }, + { + "epoch": 4.373255571755884, + "grad_norm": 0.16032922618842949, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1791, + "step": 10500 + }, + { + "epoch": 4.4149135596750675, + "grad_norm": 0.1495191719599753, + "learning_rate": 3.4230153682817112e-06, + "loss": 0.1791, + "step": 10600 + }, + { + "epoch": 4.456571547594251, + "grad_norm": 0.19697439856186114, + "learning_rate": 3.389126904537192e-06, + "loss": 0.1791, + "step": 10700 + }, + { + "epoch": 4.498229535513435, + "grad_norm": 0.17156322418134476, + "learning_rate": 3.3550503583141726e-06, + "loss": 0.1791, + "step": 10800 + }, + { + "epoch": 4.539887523432618, + "grad_norm": 0.1561878142062692, + "learning_rate": 3.3207929380339034e-06, + "loss": 0.1792, + "step": 10900 + }, + { + "epoch": 4.581545511351802, + "grad_norm": 0.1828679685381653, + "learning_rate": 3.2863618903790346e-06, + "loss": 0.1791, + "step": 11000 + }, + { + "epoch": 4.623203499270986, + "grad_norm": 0.1802733896031037, + "learning_rate": 3.2517644987606827e-06, + "loss": 0.1792, + "step": 11100 + }, + { + "epoch": 4.664861487190168, + "grad_norm": 0.15579534435978112, + "learning_rate": 3.217008081777726e-06, + "loss": 0.1791, + "step": 11200 + }, + { + "epoch": 4.706519475109352, + "grad_norm": 0.16638908065693153, + "learning_rate": 3.182099991668653e-06, + "loss": 0.1791, + "step": 11300 + }, + { + "epoch": 4.748177463028536, + "grad_norm": 0.18397163828033228, + "learning_rate": 3.147047612756302e-06, + "loss": 0.1792, + "step": 11400 + }, + { + "epoch": 4.789835450947719, + "grad_norm": 0.17751483450519995, + "learning_rate": 3.1118583598858097e-06, + "loss": 0.179, + "step": 11500 + }, + { + "epoch": 4.831493438866903, + "grad_norm": 0.1808778224251496, + "learning_rate": 3.0765396768561005e-06, + "loss": 0.179, + "step": 11600 + }, + { + "epoch": 4.8731514267860865, + "grad_norm": 0.17593346330767928, + "learning_rate": 3.0410990348452572e-06, + "loss": 0.1793, + "step": 11700 + }, + { + "epoch": 4.91480941470527, + "grad_norm": 0.15824861181745342, + "learning_rate": 3.0055439308300954e-06, + "loss": 0.1791, + "step": 11800 + }, + { + "epoch": 4.956467402624453, + "grad_norm": 0.21055777806239853, + "learning_rate": 2.96988188600028e-06, + "loss": 0.1792, + "step": 11900 + }, + { + "epoch": 4.9981253905436365, + "grad_norm": 0.15352806003656314, + "learning_rate": 2.9341204441673267e-06, + "loss": 0.1791, + "step": 12000 + }, + { + "epoch": 5.0, + "eval_loss": 0.17911389470100403, + "eval_runtime": 196.4564, + "eval_samples_per_second": 1390.12, + "eval_steps_per_second": 2.718, + "step": 12005 + }, + { + "epoch": 5.0395750885232244, + "grad_norm": 0.1891820592041876, + "learning_rate": 2.898267170168807e-06, + "loss": 0.1791, + "step": 12100 + }, + { + "epoch": 5.081233076442408, + "grad_norm": 0.14302405130068518, + "learning_rate": 2.862329648268117e-06, + "loss": 0.1789, + "step": 12200 + }, + { + "epoch": 5.122891064361592, + "grad_norm": 0.2215960599158716, + "learning_rate": 2.82631548055013e-06, + "loss": 0.1792, + "step": 12300 + }, + { + "epoch": 5.164549052280774, + "grad_norm": 0.1566593937408507, + "learning_rate": 2.7902322853130758e-06, + "loss": 0.179, + "step": 12400 + }, + { + "epoch": 5.206207040199958, + "grad_norm": 0.15513379693358573, + "learning_rate": 2.754087695457005e-06, + "loss": 0.1791, + "step": 12500 + }, + { + "epoch": 5.247865028119142, + "grad_norm": 0.14968722299942713, + "learning_rate": 2.717889356869146e-06, + "loss": 0.179, + "step": 12600 + }, + { + "epoch": 5.289523016038325, + "grad_norm": 0.2097123380235341, + "learning_rate": 2.681644926806527e-06, + "loss": 0.179, + "step": 12700 + }, + { + "epoch": 5.331181003957509, + "grad_norm": 0.19315969222642626, + "learning_rate": 2.6453620722761897e-06, + "loss": 0.179, + "step": 12800 + }, + { + "epoch": 5.372838991876693, + "grad_norm": 0.2209634744371871, + "learning_rate": 2.6090484684133406e-06, + "loss": 0.1791, + "step": 12900 + }, + { + "epoch": 5.414496979795876, + "grad_norm": 0.20430693758591473, + "learning_rate": 2.572711796857779e-06, + "loss": 0.179, + "step": 13000 + }, + { + "epoch": 5.45615496771506, + "grad_norm": 0.18903967369853375, + "learning_rate": 2.5363597441289574e-06, + "loss": 0.179, + "step": 13100 + }, + { + "epoch": 5.4978129556342425, + "grad_norm": 0.15616083753477006, + "learning_rate": 2.5e-06, + "loss": 0.179, + "step": 13200 + }, + { + "epoch": 5.539470943553426, + "grad_norm": 0.1507559008561688, + "learning_rate": 2.4636402558710434e-06, + "loss": 0.1791, + "step": 13300 + }, + { + "epoch": 5.58112893147261, + "grad_norm": 0.16640062646644058, + "learning_rate": 2.4272882031422216e-06, + "loss": 0.179, + "step": 13400 + }, + { + "epoch": 5.622786919391793, + "grad_norm": 0.1824434916593794, + "learning_rate": 2.3909515315866606e-06, + "loss": 0.1791, + "step": 13500 + }, + { + "epoch": 5.664444907310977, + "grad_norm": 0.2004975100759413, + "learning_rate": 2.3546379277238107e-06, + "loss": 0.179, + "step": 13600 + }, + { + "epoch": 5.706102895230161, + "grad_norm": 0.17154522514366766, + "learning_rate": 2.318355073193474e-06, + "loss": 0.1791, + "step": 13700 + }, + { + "epoch": 5.747760883149343, + "grad_norm": 0.13248550006328844, + "learning_rate": 2.2821106431308546e-06, + "loss": 0.179, + "step": 13800 + }, + { + "epoch": 5.789418871068527, + "grad_norm": 0.1915171020600886, + "learning_rate": 2.2459123045429953e-06, + "loss": 0.1792, + "step": 13900 + }, + { + "epoch": 5.831076858987711, + "grad_norm": 0.16235356856597902, + "learning_rate": 2.2097677146869242e-06, + "loss": 0.1791, + "step": 14000 + }, + { + "epoch": 5.872734846906894, + "grad_norm": 0.1627140490119954, + "learning_rate": 2.173684519449872e-06, + "loss": 0.1789, + "step": 14100 + }, + { + "epoch": 5.914392834826078, + "grad_norm": 0.16466884224746445, + "learning_rate": 2.1376703517318835e-06, + "loss": 0.179, + "step": 14200 + }, + { + "epoch": 5.9560508227452615, + "grad_norm": 0.20611687756993843, + "learning_rate": 2.101732829831194e-06, + "loss": 0.179, + "step": 14300 + }, + { + "epoch": 5.997708810664445, + "grad_norm": 0.16559158144998481, + "learning_rate": 2.0658795558326745e-06, + "loss": 0.179, + "step": 14400 + }, + { + "epoch": 6.0, + "eval_loss": 0.17907121777534485, + "eval_runtime": 196.4273, + "eval_samples_per_second": 1390.326, + "eval_steps_per_second": 2.719, + "step": 14406 + }, + { + "epoch": 6.039158508644032, + "grad_norm": 0.16927649861039284, + "learning_rate": 2.0301181139997206e-06, + "loss": 0.1789, + "step": 14500 + }, + { + "epoch": 6.080816496563216, + "grad_norm": 0.1752142512252337, + "learning_rate": 1.994456069169906e-06, + "loss": 0.179, + "step": 14600 + }, + { + "epoch": 6.1224744844823995, + "grad_norm": 0.21170178196900302, + "learning_rate": 1.958900965154743e-06, + "loss": 0.1789, + "step": 14700 + }, + { + "epoch": 6.164132472401583, + "grad_norm": 0.21884267966966597, + "learning_rate": 1.9234603231439e-06, + "loss": 0.1788, + "step": 14800 + }, + { + "epoch": 6.205790460320767, + "grad_norm": 0.17106948371146288, + "learning_rate": 1.8881416401141905e-06, + "loss": 0.1788, + "step": 14900 + }, + { + "epoch": 6.24744844823995, + "grad_norm": 0.174097273230219, + "learning_rate": 1.852952387243698e-06, + "loss": 0.1788, + "step": 15000 + }, + { + "epoch": 6.289106436159133, + "grad_norm": 0.20862365699110258, + "learning_rate": 1.8179000083313483e-06, + "loss": 0.1788, + "step": 15100 + }, + { + "epoch": 6.330764424078317, + "grad_norm": 0.17885797151549512, + "learning_rate": 1.7829919182222752e-06, + "loss": 0.1788, + "step": 15200 + }, + { + "epoch": 6.3724224119975, + "grad_norm": 0.19498914359958716, + "learning_rate": 1.7482355012393177e-06, + "loss": 0.1789, + "step": 15300 + }, + { + "epoch": 6.414080399916684, + "grad_norm": 0.1389966716220221, + "learning_rate": 1.7136381096209665e-06, + "loss": 0.179, + "step": 15400 + }, + { + "epoch": 6.455738387835868, + "grad_norm": 0.1786092324697337, + "learning_rate": 1.6792070619660977e-06, + "loss": 0.179, + "step": 15500 + }, + { + "epoch": 6.497396375755051, + "grad_norm": 0.19161758807721282, + "learning_rate": 1.6449496416858285e-06, + "loss": 0.1788, + "step": 15600 + }, + { + "epoch": 6.539054363674235, + "grad_norm": 0.19197303954060144, + "learning_rate": 1.6108730954628093e-06, + "loss": 0.1788, + "step": 15700 + }, + { + "epoch": 6.580712351593418, + "grad_norm": 0.16743828588501417, + "learning_rate": 1.5769846317182894e-06, + "loss": 0.1787, + "step": 15800 + }, + { + "epoch": 6.622370339512601, + "grad_norm": 0.16492318029574304, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1788, + "step": 15900 + }, + { + "epoch": 6.664028327431785, + "grad_norm": 0.15440438163304784, + "learning_rate": 1.509800584902108e-06, + "loss": 0.1789, + "step": 16000 + }, + { + "epoch": 6.7056863153509685, + "grad_norm": 0.17667275704806315, + "learning_rate": 1.4765192136847686e-06, + "loss": 0.1789, + "step": 16100 + }, + { + "epoch": 6.747344303270152, + "grad_norm": 0.17904015323124156, + "learning_rate": 1.443454345648252e-06, + "loss": 0.1789, + "step": 16200 + }, + { + "epoch": 6.789002291189336, + "grad_norm": 0.16736730033822061, + "learning_rate": 1.4106129752073023e-06, + "loss": 0.179, + "step": 16300 + }, + { + "epoch": 6.830660279108519, + "grad_norm": 0.16038102753372047, + "learning_rate": 1.3780020494988447e-06, + "loss": 0.179, + "step": 16400 + }, + { + "epoch": 6.872318267027703, + "grad_norm": 0.15315299560909978, + "learning_rate": 1.3456284669124159e-06, + "loss": 0.1786, + "step": 16500 + }, + { + "epoch": 6.913976254946886, + "grad_norm": 0.1430660492396621, + "learning_rate": 1.313499075630899e-06, + "loss": 0.179, + "step": 16600 + }, + { + "epoch": 6.955634242866069, + "grad_norm": 0.17326024703322063, + "learning_rate": 1.2816206721818944e-06, + "loss": 0.1789, + "step": 16700 + }, + { + "epoch": 6.997292230785253, + "grad_norm": 0.14987232796770428, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.1787, + "step": 16800 + }, + { + "epoch": 7.0, + "eval_loss": 0.17893224954605103, + "eval_runtime": 196.4121, + "eval_samples_per_second": 1390.434, + "eval_steps_per_second": 2.719, + "step": 16807 + }, + { + "epoch": 7.038741928764841, + "grad_norm": 0.1439804790666206, + "learning_rate": 1.218643748000337e-06, + "loss": 0.1787, + "step": 16900 + }, + { + "epoch": 7.080399916684025, + "grad_norm": 0.1820620837643405, + "learning_rate": 1.1875585491636e-06, + "loss": 0.1788, + "step": 17000 + }, + { + "epoch": 7.122057904603207, + "grad_norm": 0.1619570282327302, + "learning_rate": 1.1567509791329402e-06, + "loss": 0.1786, + "step": 17100 + }, + { + "epoch": 7.163715892522391, + "grad_norm": 0.2470491812569796, + "learning_rate": 1.1262275548229852e-06, + "loss": 0.1791, + "step": 17200 + }, + { + "epoch": 7.205373880441575, + "grad_norm": 0.18058952670407366, + "learning_rate": 1.0959947330412681e-06, + "loss": 0.1789, + "step": 17300 + }, + { + "epoch": 7.247031868360758, + "grad_norm": 0.20589528394837478, + "learning_rate": 1.0660589091223854e-06, + "loss": 0.1786, + "step": 17400 + }, + { + "epoch": 7.288689856279942, + "grad_norm": 0.13562633767825757, + "learning_rate": 1.0364264155751489e-06, + "loss": 0.1786, + "step": 17500 + }, + { + "epoch": 7.330347844199125, + "grad_norm": 0.194696644563295, + "learning_rate": 1.0071035207430352e-06, + "loss": 0.1787, + "step": 17600 + }, + { + "epoch": 7.372005832118309, + "grad_norm": 0.19213496981753242, + "learning_rate": 9.780964274781984e-07, + "loss": 0.1786, + "step": 17700 + }, + { + "epoch": 7.413663820037492, + "grad_norm": 0.19876379595232896, + "learning_rate": 9.494112718293503e-07, + "loss": 0.1787, + "step": 17800 + }, + { + "epoch": 7.455321807956675, + "grad_norm": 0.1684329683430977, + "learning_rate": 9.210541217437566e-07, + "loss": 0.1787, + "step": 17900 + }, + { + "epoch": 7.496979795875859, + "grad_norm": 0.1823625942631362, + "learning_rate": 8.930309757836517e-07, + "loss": 0.1785, + "step": 18000 + }, + { + "epoch": 7.538637783795043, + "grad_norm": 0.18725762365246973, + "learning_rate": 8.653477618573261e-07, + "loss": 0.1786, + "step": 18100 + }, + { + "epoch": 7.580295771714226, + "grad_norm": 0.1507247392992477, + "learning_rate": 8.380103359651554e-07, + "loss": 0.1787, + "step": 18200 + }, + { + "epoch": 7.62195375963341, + "grad_norm": 0.18505299719524845, + "learning_rate": 8.110244809608494e-07, + "loss": 0.1786, + "step": 18300 + }, + { + "epoch": 7.663611747552594, + "grad_norm": 0.12101506184025812, + "learning_rate": 7.843959053281663e-07, + "loss": 0.1786, + "step": 18400 + }, + { + "epoch": 7.705269735471777, + "grad_norm": 0.16939344528667466, + "learning_rate": 7.581302419733633e-07, + "loss": 0.1785, + "step": 18500 + }, + { + "epoch": 7.74692772339096, + "grad_norm": 0.13840737012325652, + "learning_rate": 7.322330470336314e-07, + "loss": 0.1785, + "step": 18600 + }, + { + "epoch": 7.7885857113101435, + "grad_norm": 0.16859264286478876, + "learning_rate": 7.067097987017762e-07, + "loss": 0.1787, + "step": 18700 + }, + { + "epoch": 7.830243699229327, + "grad_norm": 0.1897535110592711, + "learning_rate": 6.815658960673782e-07, + "loss": 0.1785, + "step": 18800 + }, + { + "epoch": 7.871901687148511, + "grad_norm": 0.18368265058091485, + "learning_rate": 6.568066579746901e-07, + "loss": 0.1785, + "step": 18900 + }, + { + "epoch": 7.913559675067694, + "grad_norm": 0.13696515467419504, + "learning_rate": 6.324373218975105e-07, + "loss": 0.1786, + "step": 19000 + }, + { + "epoch": 7.955217662986878, + "grad_norm": 0.14354515830035847, + "learning_rate": 6.084630428312679e-07, + "loss": 0.1785, + "step": 19100 + }, + { + "epoch": 7.996875650906061, + "grad_norm": 0.15165778139105265, + "learning_rate": 5.848888922025553e-07, + "loss": 0.1786, + "step": 19200 + }, + { + "epoch": 8.0, + "eval_loss": 0.17886345088481903, + "eval_runtime": 196.5554, + "eval_samples_per_second": 1389.42, + "eval_steps_per_second": 2.717, + "step": 19208 + }, + { + "epoch": 8.03832534888565, + "grad_norm": 0.15763312404128105, + "learning_rate": 5.617198567963353e-07, + "loss": 0.1783, + "step": 19300 + }, + { + "epoch": 8.079983336804831, + "grad_norm": 0.1720429493205497, + "learning_rate": 5.389608377010608e-07, + "loss": 0.1783, + "step": 19400 + }, + { + "epoch": 8.121641324724015, + "grad_norm": 0.1690726413308925, + "learning_rate": 5.166166492719124e-07, + "loss": 0.1783, + "step": 19500 + }, + { + "epoch": 8.163299312643199, + "grad_norm": 0.17909925356768044, + "learning_rate": 4.946920181123904e-07, + "loss": 0.1782, + "step": 19600 + }, + { + "epoch": 8.204957300562382, + "grad_norm": 0.22116088190481087, + "learning_rate": 4.7319158207446953e-07, + "loss": 0.1782, + "step": 19700 + }, + { + "epoch": 8.246615288481566, + "grad_norm": 0.16383363990929287, + "learning_rate": 4.5211988927752026e-07, + "loss": 0.1782, + "step": 19800 + }, + { + "epoch": 8.28827327640075, + "grad_norm": 0.18255215192836688, + "learning_rate": 4.3148139714622365e-07, + "loss": 0.1782, + "step": 19900 + }, + { + "epoch": 8.329931264319933, + "grad_norm": 0.19783668808521335, + "learning_rate": 4.1128047146765936e-07, + "loss": 0.1781, + "step": 20000 + }, + { + "epoch": 8.371589252239117, + "grad_norm": 0.1828620345488146, + "learning_rate": 3.915213854677863e-07, + "loss": 0.1781, + "step": 20100 + }, + { + "epoch": 8.4132472401583, + "grad_norm": 0.1461266269903454, + "learning_rate": 3.722083189075007e-07, + "loss": 0.1782, + "step": 20200 + }, + { + "epoch": 8.454905228077484, + "grad_norm": 0.19063937525748337, + "learning_rate": 3.5334535719846767e-07, + "loss": 0.1781, + "step": 20300 + }, + { + "epoch": 8.496563215996668, + "grad_norm": 0.12678778363904367, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.1781, + "step": 20400 + }, + { + "epoch": 8.538221203915851, + "grad_norm": 0.15880039262804566, + "learning_rate": 3.1698561306951065e-07, + "loss": 0.1782, + "step": 20500 + }, + { + "epoch": 8.579879191835035, + "grad_norm": 0.18763241075198428, + "learning_rate": 2.9949652204972257e-07, + "loss": 0.178, + "step": 20600 + }, + { + "epoch": 8.621537179754219, + "grad_norm": 0.1582482612527278, + "learning_rate": 2.8247291705444575e-07, + "loss": 0.1778, + "step": 20700 + }, + { + "epoch": 8.6631951676734, + "grad_norm": 0.181992432758085, + "learning_rate": 2.6591839919146963e-07, + "loss": 0.178, + "step": 20800 + }, + { + "epoch": 8.704853155592584, + "grad_norm": 0.1463913122272469, + "learning_rate": 2.4983647033969714e-07, + "loss": 0.1783, + "step": 20900 + }, + { + "epoch": 8.746511143511768, + "grad_norm": 0.15649171707147957, + "learning_rate": 2.3423053240837518e-07, + "loss": 0.1781, + "step": 21000 + }, + { + "epoch": 8.788169131430951, + "grad_norm": 0.16428482803404829, + "learning_rate": 2.1910388661746495e-07, + "loss": 0.1782, + "step": 21100 + }, + { + "epoch": 8.829827119350135, + "grad_norm": 0.19349382720192548, + "learning_rate": 2.044597327993153e-07, + "loss": 0.1781, + "step": 21200 + }, + { + "epoch": 8.871485107269319, + "grad_norm": 0.1678737628788564, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.1781, + "step": 21300 + }, + { + "epoch": 8.913143095188502, + "grad_norm": 0.187501462753097, + "learning_rate": 1.7663118943294367e-07, + "loss": 0.1781, + "step": 21400 + }, + { + "epoch": 8.954801083107686, + "grad_norm": 0.17102799413092362, + "learning_rate": 1.6345268662752904e-07, + "loss": 0.1781, + "step": 21500 + }, + { + "epoch": 8.99645907102687, + "grad_norm": 0.14591121551272715, + "learning_rate": 1.507684480352292e-07, + "loss": 0.1781, + "step": 21600 + }, + { + "epoch": 9.0, + "eval_loss": 0.17907947301864624, + "eval_runtime": 196.3329, + "eval_samples_per_second": 1390.995, + "eval_steps_per_second": 2.72, + "step": 21609 + }, + { + "epoch": 9.037908769006457, + "grad_norm": 0.1816902644971728, + "learning_rate": 1.3858115683098832e-07, + "loss": 0.177, + "step": 21700 + }, + { + "epoch": 9.07956675692564, + "grad_norm": 0.18741449385017522, + "learning_rate": 1.2689339106741529e-07, + "loss": 0.1767, + "step": 21800 + }, + { + "epoch": 9.121224744844824, + "grad_norm": 0.20197534473429568, + "learning_rate": 1.1570762312943295e-07, + "loss": 0.1768, + "step": 21900 + }, + { + "epoch": 9.162882732764007, + "grad_norm": 0.21639195747399645, + "learning_rate": 1.0502621921127776e-07, + "loss": 0.1767, + "step": 22000 + }, + { + "epoch": 9.204540720683191, + "grad_norm": 0.18933606645836426, + "learning_rate": 9.485143881596715e-08, + "loss": 0.1768, + "step": 22100 + }, + { + "epoch": 9.246198708602375, + "grad_norm": 0.1960648079791721, + "learning_rate": 8.518543427732951e-08, + "loss": 0.1767, + "step": 22200 + }, + { + "epoch": 9.287856696521558, + "grad_norm": 0.18056583891057434, + "learning_rate": 7.603025030471001e-08, + "loss": 0.1766, + "step": 22300 + }, + { + "epoch": 9.329514684440742, + "grad_norm": 0.18480124722464905, + "learning_rate": 6.738782355044048e-08, + "loss": 0.1769, + "step": 22400 + }, + { + "epoch": 9.371172672359926, + "grad_norm": 0.22786425388668805, + "learning_rate": 5.92599822001666e-08, + "loss": 0.1767, + "step": 22500 + }, + { + "epoch": 9.41283066027911, + "grad_norm": 0.2205541920741548, + "learning_rate": 5.164844558612131e-08, + "loss": 0.1766, + "step": 22600 + }, + { + "epoch": 9.454488648198293, + "grad_norm": 0.2134938008984885, + "learning_rate": 4.455482382342336e-08, + "loss": 0.1767, + "step": 22700 + }, + { + "epoch": 9.496146636117475, + "grad_norm": 0.23030736326238382, + "learning_rate": 3.798061746947995e-08, + "loss": 0.1767, + "step": 22800 + }, + { + "epoch": 9.537804624036658, + "grad_norm": 0.2214355490299709, + "learning_rate": 3.1927217206564884e-08, + "loss": 0.1767, + "step": 22900 + }, + { + "epoch": 9.579462611955842, + "grad_norm": 0.2291392443441154, + "learning_rate": 2.6395903547638825e-08, + "loss": 0.1765, + "step": 23000 + }, + { + "epoch": 9.621120599875026, + "grad_norm": 0.22120778210484332, + "learning_rate": 2.1387846565474047e-08, + "loss": 0.1765, + "step": 23100 + }, + { + "epoch": 9.66277858779421, + "grad_norm": 0.1927066727358843, + "learning_rate": 1.6904105645142443e-08, + "loss": 0.1765, + "step": 23200 + }, + { + "epoch": 9.704436575713393, + "grad_norm": 0.2369391538896648, + "learning_rate": 1.2945629259917547e-08, + "loss": 0.1766, + "step": 23300 + }, + { + "epoch": 9.746094563632576, + "grad_norm": 0.21269587694232558, + "learning_rate": 9.513254770636138e-09, + "loss": 0.1767, + "step": 23400 + }, + { + "epoch": 9.78775255155176, + "grad_norm": 0.20767475535201343, + "learning_rate": 6.607708248569378e-09, + "loss": 0.1766, + "step": 23500 + }, + { + "epoch": 9.829410539470944, + "grad_norm": 0.21058981271348698, + "learning_rate": 4.229604321829561e-09, + "loss": 0.1766, + "step": 23600 + }, + { + "epoch": 9.871068527390127, + "grad_norm": 0.18917603463369678, + "learning_rate": 2.3794460453555046e-09, + "loss": 0.1766, + "step": 23700 + }, + { + "epoch": 9.912726515309311, + "grad_norm": 0.18145195315540197, + "learning_rate": 1.0576247944985018e-09, + "loss": 0.1767, + "step": 23800 + }, + { + "epoch": 9.954384503228495, + "grad_norm": 0.22385123601872012, + "learning_rate": 2.6442018223132857e-10, + "loss": 0.1766, + "step": 23900 + }, + { + "epoch": 9.996042491147678, + "grad_norm": 0.22063368359660335, + "learning_rate": 0.0, + "loss": 0.1766, + "step": 24000 + }, + { + "epoch": 9.996042491147678, + "eval_loss": 0.18023133277893066, + "eval_runtime": 196.0313, + "eval_samples_per_second": 1393.135, + "eval_steps_per_second": 2.724, + "step": 24000 + }, + { + "epoch": 9.996042491147678, + "step": 24000, + "total_flos": 5485114750402560.0, + "train_loss": 0.19645737719535827, + "train_runtime": 70712.6152, + "train_samples_per_second": 347.587, + "train_steps_per_second": 0.339 + } + ], + "logging_steps": 100, + "max_steps": 24000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5485114750402560.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/saves/chess/generate_strategy/training_args.bin b/saves/chess/generate_strategy/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..773ce6240443c97c5f4f17d0c292e93b3f620d6d --- /dev/null +++ b/saves/chess/generate_strategy/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4854ece4d5ef51aa764407c1d839019391947a59f720f2fc5ec761b53b0838 +size 7416 diff --git a/saves/chess/generate_strategy/training_eval_loss.png b/saves/chess/generate_strategy/training_eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..5ef84f8b1ad40dc649d3c0caa71e9a578d454912 Binary files /dev/null and b/saves/chess/generate_strategy/training_eval_loss.png differ diff --git a/saves/chess/generate_strategy/training_loss.png b/saves/chess/generate_strategy/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..c21c97cf46dd77b26059f7673d541bb8b6486a1c Binary files /dev/null and b/saves/chess/generate_strategy/training_loss.png differ diff --git a/saves/chess/no_explain/README.md b/saves/chess/no_explain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b0739cac499014de092eace3e685901dcc36b436 --- /dev/null +++ b/saves/chess/no_explain/README.md @@ -0,0 +1,77 @@ +--- +library_name: transformers +license: other +base_model: meta-llama/Meta-Llama-3-8B-Instruct +tags: +- llama-factory +- full +- generated_from_trainer +model-index: +- name: no_explain + results: [] +--- + + + +# no_explain + +This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the chess_explain_noexplain_00, the chess_explain_noexplain_01, the chess_explain_noexplain_02, the chess_explain_noexplain_03, the chess_explain_noexplain_04, the chess_explain_noexplain_05, the chess_explain_noexplain_06, the chess_explain_noexplain_07, the chess_explain_noexplain_08, the chess_explain_noexplain_09, the chess_explain_noexplain_10, the chess_explain_noexplain_11, the chess_explain_noexplain_12, the chess_explain_noexplain_13 and the chess_explain_noexplain_14 datasets. +It achieves the following results on the evaluation set: +- Loss: 0.0932 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 64 +- eval_batch_size: 64 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 1024 +- total_eval_batch_size: 512 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 10.0 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:-----:|:---------------:| +| 0.0429 | 0.8010 | 1000 | 0.0422 | +| 0.0329 | 1.6015 | 2000 | 0.0336 | +| 0.0275 | 2.4021 | 3000 | 0.0297 | +| 0.0202 | 3.2026 | 4000 | 0.0292 | +| 0.0194 | 4.0032 | 5000 | 0.0294 | +| 0.0119 | 4.8042 | 6000 | 0.0311 | +| 0.0048 | 5.6047 | 7000 | 0.0439 | +| 0.0013 | 6.4053 | 8000 | 0.0538 | +| 0.0004 | 7.2058 | 9000 | 0.0670 | +| 0.0003 | 8.0064 | 10000 | 0.0698 | +| 0.0 | 8.8074 | 11000 | 0.0894 | +| 0.0 | 9.6079 | 12000 | 0.0931 | + + +### Framework versions + +- Transformers 4.48.2 +- Pytorch 2.6.0+cu124 +- Datasets 2.21.0 +- Tokenizers 0.21.0 diff --git a/saves/chess/no_explain/all_results.json b/saves/chess/no_explain/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..19eca1acad5b8db9479be0979aa71125636f6632 --- /dev/null +++ b/saves/chess/no_explain/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 9.992390869042852, + "eval_loss": 0.09318816661834717, + "eval_runtime": 97.0026, + "eval_samples_per_second": 1464.136, + "eval_steps_per_second": 2.866, + "total_flos": 2784163811819520.0, + "train_loss": 0.025371345406674895, + "train_runtime": 36703.9164, + "train_samples_per_second": 348.252, + "train_steps_per_second": 0.34 +} \ No newline at end of file diff --git a/saves/chess/no_explain/checkpoint-4000/config.json b/saves/chess/no_explain/checkpoint-4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9ce0e7d2a8ad9d74229897630ae54102a0a1a3 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/saves/chess/no_explain/checkpoint-4000/generation_config.json b/saves/chess/no_explain/checkpoint-4000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb70ec09806f7ce366dd58e8239ad0ca2d5babf1 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.2" +} diff --git a/saves/chess/no_explain/checkpoint-4000/latest b/saves/chess/no_explain/checkpoint-4000/latest new file mode 100644 index 0000000000000000000000000000000000000000..bde045b84f7344a502489c347cb8527c3cce2ef5 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/latest @@ -0,0 +1 @@ +global_step3998 \ No newline at end of file diff --git a/saves/chess/no_explain/checkpoint-4000/model-00001-of-00004.safetensors b/saves/chess/no_explain/checkpoint-4000/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c671cd9c9bcfc161ae9ac606058f417b25843ee --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47dde464d2aefd37eda2815b5ba61b57d361c3bce52e792d3c1ab69102a55110 +size 4976698672 diff --git a/saves/chess/no_explain/checkpoint-4000/model-00002-of-00004.safetensors b/saves/chess/no_explain/checkpoint-4000/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cfc74a973715e43594e2678567e12aab1c679e92 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfde70000ea34a2629d392c10f2bc415d8d6fde186a2d31d992fc0c70386f154 +size 4999802720 diff --git a/saves/chess/no_explain/checkpoint-4000/model-00003-of-00004.safetensors b/saves/chess/no_explain/checkpoint-4000/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..704a9ff77226551034fd2040957b56800db344f3 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366054a4bee6871b7747951631c6f684e622cadc61ffb309bb794f49ca09d355 +size 4915916176 diff --git a/saves/chess/no_explain/checkpoint-4000/model-00004-of-00004.safetensors b/saves/chess/no_explain/checkpoint-4000/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..224d9585cfa423b5bfa4d3849e745e8f82d57429 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3fd155657116ece6520b0be23250ac218e093f5e3fb69a33fcb0227a3a62ce3 +size 1168138808 diff --git a/saves/chess/no_explain/checkpoint-4000/model.safetensors.index.json b/saves/chess/no_explain/checkpoint-4000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_0.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_1.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_2.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_3.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_4.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_5.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_6.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/rng_state_7.pth b/saves/chess/no_explain/checkpoint-4000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/saves/chess/no_explain/checkpoint-4000/scheduler.pt b/saves/chess/no_explain/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3b39f0ea2d9092bec75f0da2301e46c33457524 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624e83d8bb1c2a780d4eb55df92ee2a5d78647169f3aca1e61c43c0cf57e3359 +size 1064 diff --git a/saves/chess/no_explain/checkpoint-4000/special_tokens_map.json b/saves/chess/no_explain/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..df5c3a478b842fa66e6a8c10265478284c1d4f41 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,33 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/saves/chess/no_explain/checkpoint-4000/tokenizer.json b/saves/chess/no_explain/checkpoint-4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..172311123ab62378f1f6d90f3068a676b7d939ed --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 +size 17210148 diff --git a/saves/chess/no_explain/checkpoint-4000/tokenizer_config.json b/saves/chess/no_explain/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e2afd45c14285320f15500548266d7adba98d07a --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,2078 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eot_id|>", + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 2048, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/saves/chess/no_explain/checkpoint-4000/trainer_state.json b/saves/chess/no_explain/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98915fa94a5536c938e024c89cbeb70198d7f6b8 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/trainer_state.json @@ -0,0 +1,345 @@ +{ + "best_metric": 0.029243575409054756, + "best_model_checkpoint": "saves/chess/no_explain/checkpoint-4000", + "epoch": 3.202643171806167, + "eval_steps": 1000, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08009611533840609, + "grad_norm": 0.8625897724596373, + "learning_rate": 4.006410256410257e-07, + "loss": 1.3897, + "step": 100 + }, + { + "epoch": 0.16019223067681218, + "grad_norm": 0.8895947937892531, + "learning_rate": 8.012820512820515e-07, + "loss": 0.0598, + "step": 200 + }, + { + "epoch": 0.24028834601521826, + "grad_norm": 0.5221246844134636, + "learning_rate": 1.201923076923077e-06, + "loss": 0.0551, + "step": 300 + }, + { + "epoch": 0.32038446135362436, + "grad_norm": 0.5590357289952654, + "learning_rate": 1.602564102564103e-06, + "loss": 0.0516, + "step": 400 + }, + { + "epoch": 0.4004805766920304, + "grad_norm": 0.36991974174438536, + "learning_rate": 2.0032051282051286e-06, + "loss": 0.0501, + "step": 500 + }, + { + "epoch": 0.4805766920304365, + "grad_norm": 0.6389443947236714, + "learning_rate": 2.403846153846154e-06, + "loss": 0.0486, + "step": 600 + }, + { + "epoch": 0.5606728073688426, + "grad_norm": 0.44563280571067243, + "learning_rate": 2.8044871794871797e-06, + "loss": 0.0463, + "step": 700 + }, + { + "epoch": 0.6407689227072487, + "grad_norm": 0.44266380357676305, + "learning_rate": 3.205128205128206e-06, + "loss": 0.0447, + "step": 800 + }, + { + "epoch": 0.7208650380456548, + "grad_norm": 0.585654631503778, + "learning_rate": 3.605769230769231e-06, + "loss": 0.0441, + "step": 900 + }, + { + "epoch": 0.8009611533840608, + "grad_norm": 0.600751877456253, + "learning_rate": 4.006410256410257e-06, + "loss": 0.0429, + "step": 1000 + }, + { + "epoch": 0.8009611533840608, + "eval_loss": 0.042210426181554794, + "eval_runtime": 97.133, + "eval_samples_per_second": 1462.17, + "eval_steps_per_second": 2.862, + "step": 1000 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 0.2641551118831142, + "learning_rate": 4.4070512820512826e-06, + "loss": 0.0414, + "step": 1100 + }, + { + "epoch": 0.961153384060873, + "grad_norm": 0.29049561928975876, + "learning_rate": 4.807692307692308e-06, + "loss": 0.0402, + "step": 1200 + }, + { + "epoch": 1.0408490188225872, + "grad_norm": 0.5344113116420023, + "learning_rate": 4.999735579817769e-06, + "loss": 0.0386, + "step": 1300 + }, + { + "epoch": 1.1209451341609933, + "grad_norm": 0.31257482202449377, + "learning_rate": 4.997740994288484e-06, + "loss": 0.0373, + "step": 1400 + }, + { + "epoch": 1.2010412494993994, + "grad_norm": 0.4593106982622164, + "learning_rate": 4.993792498360407e-06, + "loss": 0.0366, + "step": 1500 + }, + { + "epoch": 1.2811373648378055, + "grad_norm": 0.2012883704449717, + "learning_rate": 4.9878931808274796e-06, + "loss": 0.0357, + "step": 1600 + }, + { + "epoch": 1.3612334801762114, + "grad_norm": 0.22908626001592647, + "learning_rate": 4.980047656554856e-06, + "loss": 0.0352, + "step": 1700 + }, + { + "epoch": 1.4413295955146175, + "grad_norm": 0.3169879320183415, + "learning_rate": 4.970262062868821e-06, + "loss": 0.0346, + "step": 1800 + }, + { + "epoch": 1.5214257108530236, + "grad_norm": 0.2078878255601618, + "learning_rate": 4.958544054755741e-06, + "loss": 0.0336, + "step": 1900 + }, + { + "epoch": 1.6015218261914297, + "grad_norm": 0.2978110993331312, + "learning_rate": 4.944902798873794e-06, + "loss": 0.0329, + "step": 2000 + }, + { + "epoch": 1.6015218261914297, + "eval_loss": 0.03361953794956207, + "eval_runtime": 97.2876, + "eval_samples_per_second": 1459.847, + "eval_steps_per_second": 2.858, + "step": 2000 + }, + { + "epoch": 1.6816179415298358, + "grad_norm": 0.16678424956102253, + "learning_rate": 4.92934896638215e-06, + "loss": 0.0328, + "step": 2100 + }, + { + "epoch": 1.761714056868242, + "grad_norm": 0.19029664571581045, + "learning_rate": 4.91189472459324e-06, + "loss": 0.0316, + "step": 2200 + }, + { + "epoch": 1.841810172206648, + "grad_norm": 0.2388908631462674, + "learning_rate": 4.892553727454616e-06, + "loss": 0.0317, + "step": 2300 + }, + { + "epoch": 1.921906287545054, + "grad_norm": 0.15794270702360638, + "learning_rate": 4.8713411048678635e-06, + "loss": 0.0309, + "step": 2400 + }, + { + "epoch": 2.0016019223067683, + "grad_norm": 0.2103115075663395, + "learning_rate": 4.848273450852921e-06, + "loss": 0.0305, + "step": 2500 + }, + { + "epoch": 2.0816980376451744, + "grad_norm": 0.28601246983481904, + "learning_rate": 4.823368810567056e-06, + "loss": 0.0268, + "step": 2600 + }, + { + "epoch": 2.1617941529835805, + "grad_norm": 0.25522616878445004, + "learning_rate": 4.796646666188663e-06, + "loss": 0.0268, + "step": 2700 + }, + { + "epoch": 2.2418902683219866, + "grad_norm": 0.2343538332348778, + "learning_rate": 4.768127921676916e-06, + "loss": 0.0272, + "step": 2800 + }, + { + "epoch": 2.3219863836603922, + "grad_norm": 0.22903658893889398, + "learning_rate": 4.737834886419217e-06, + "loss": 0.0297, + "step": 2900 + }, + { + "epoch": 2.4020824989987988, + "grad_norm": 0.19855668130980528, + "learning_rate": 4.705791257779196e-06, + "loss": 0.0275, + "step": 3000 + }, + { + "epoch": 2.4020824989987988, + "eval_loss": 0.029653793200850487, + "eval_runtime": 97.2179, + "eval_samples_per_second": 1460.893, + "eval_steps_per_second": 2.86, + "step": 3000 + }, + { + "epoch": 2.4821786143372044, + "grad_norm": 0.1868527106405498, + "learning_rate": 4.672022102558958e-06, + "loss": 0.0269, + "step": 3100 + }, + { + "epoch": 2.562274729675611, + "grad_norm": 0.1985255713449175, + "learning_rate": 4.636553837390051e-06, + "loss": 0.0269, + "step": 3200 + }, + { + "epoch": 2.6423708450140166, + "grad_norm": 0.17528235376425527, + "learning_rate": 4.5994142080684956e-06, + "loss": 0.026, + "step": 3300 + }, + { + "epoch": 2.7224669603524227, + "grad_norm": 0.20238382028782428, + "learning_rate": 4.560632267850054e-06, + "loss": 0.026, + "step": 3400 + }, + { + "epoch": 2.802563075690829, + "grad_norm": 0.20789525240306345, + "learning_rate": 4.5202383547227134e-06, + "loss": 0.0257, + "step": 3500 + }, + { + "epoch": 2.882659191029235, + "grad_norm": 0.2849074845845128, + "learning_rate": 4.478264067674155e-06, + "loss": 0.0256, + "step": 3600 + }, + { + "epoch": 2.962755306367641, + "grad_norm": 0.1826392119567578, + "learning_rate": 4.43474224197278e-06, + "loss": 0.0255, + "step": 3700 + }, + { + "epoch": 3.0424509411293554, + "grad_norm": 0.3254043272458406, + "learning_rate": 4.389706923481633e-06, + "loss": 0.0224, + "step": 3800 + }, + { + "epoch": 3.122547056467761, + "grad_norm": 0.2695456046362865, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0198, + "step": 3900 + }, + { + "epoch": 3.202643171806167, + "grad_norm": 0.24345073976828904, + "learning_rate": 4.2952378838306855e-06, + "loss": 0.0202, + "step": 4000 + }, + { + "epoch": 3.202643171806167, + "eval_loss": 0.029243575409054756, + "eval_runtime": 97.6159, + "eval_samples_per_second": 1454.937, + "eval_steps_per_second": 2.848, + "step": 4000 + } + ], + "logging_steps": 100, + "max_steps": 12480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 892260770119680.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/saves/chess/no_explain/checkpoint-4000/training_args.bin b/saves/chess/no_explain/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..307671da66dbb1b7ea5eba31217babf26111f74f --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b434e8cddfefc8be8a47c2feee91ad07bcea3a34db2ebd66e1953e926e28aa +size 7416 diff --git a/saves/chess/no_explain/checkpoint-4000/zero_to_fp32.py b/saves/chess/no_explain/checkpoint-4000/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/saves/chess/no_explain/checkpoint-4000/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/saves/chess/no_explain/config.json b/saves/chess/no_explain/config.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9ce0e7d2a8ad9d74229897630ae54102a0a1a3 --- /dev/null +++ b/saves/chess/no_explain/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.2", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/saves/chess/no_explain/eval_results.json b/saves/chess/no_explain/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1d2fe578bb12f8959a4c454792588dbd3e4beb61 --- /dev/null +++ b/saves/chess/no_explain/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 9.992390869042852, + "eval_loss": 0.09318816661834717, + "eval_runtime": 97.0026, + "eval_samples_per_second": 1464.136, + "eval_steps_per_second": 2.866 +} \ No newline at end of file diff --git a/saves/chess/no_explain/generation_config.json b/saves/chess/no_explain/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb70ec09806f7ce366dd58e8239ad0ca2d5babf1 --- /dev/null +++ b/saves/chess/no_explain/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128009 + ], + "max_length": 4096, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.2" +} diff --git a/saves/chess/no_explain/model-00001-of-00004.safetensors b/saves/chess/no_explain/model-00001-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99bfeb2ed77d7070de22f57f505c20b2a19547ec --- /dev/null +++ b/saves/chess/no_explain/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78b95283ed92c92c07c1c6ae83be8d9a5ff8d53a56e9ec9a4dfb071c3783d04 +size 4976698672 diff --git a/saves/chess/no_explain/model-00002-of-00004.safetensors b/saves/chess/no_explain/model-00002-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13c5af36cd173a1b54636139185068d50673dca6 --- /dev/null +++ b/saves/chess/no_explain/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de65dd322045cf57b5bcb84559cb9cebc102880d4de807672ee34fdbb35b0f29 +size 4999802720 diff --git a/saves/chess/no_explain/model-00003-of-00004.safetensors b/saves/chess/no_explain/model-00003-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b350caa5f069a6c0cd0c8f13293ff349ea233e5c --- /dev/null +++ b/saves/chess/no_explain/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e18f7f4789acf64b010baff2f41781aa2b6f40c0ced21d6da65568a782049615 +size 4915916176 diff --git a/saves/chess/no_explain/model-00004-of-00004.safetensors b/saves/chess/no_explain/model-00004-of-00004.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..854597a0daf7fd620d9c1124ed9ef5ee7b96d66a --- /dev/null +++ b/saves/chess/no_explain/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b086998cf0614264b40e2b7bd0d82018e1a19b868e7da34c10559b61d32df4 +size 1168138808 diff --git a/saves/chess/no_explain/model.safetensors.index.json b/saves/chess/no_explain/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0fd8120f1c6acddc268ebc2583058efaf699a771 --- /dev/null +++ b/saves/chess/no_explain/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738590008.g42-h100-instance-112.4133466.0 b/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738590008.g42-h100-instance-112.4133466.0 new file mode 100644 index 0000000000000000000000000000000000000000..8c330fa15477d955355ea939dc8a58958d1b3504 --- /dev/null +++ b/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738590008.g42-h100-instance-112.4133466.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:175f9c67a17c0b9a225f1ec2c08adcccfd6576b8eb65631320d0c055781abb8e +size 35279 diff --git a/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738626844.g42-h100-instance-112.4133466.1 b/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738626844.g42-h100-instance-112.4133466.1 new file mode 100644 index 0000000000000000000000000000000000000000..a525c2fd86e46ded62f9de8e33f71df48a2e5485 --- /dev/null +++ b/saves/chess/no_explain/runs/Feb03_13-38-01_g42-h100-instance-112/events.out.tfevents.1738626844.g42-h100-instance-112.4133466.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf05a19919973bb3d37dbf7b30eec6a982893550fff1b0ad3b9e6d220950c8ff +size 359 diff --git a/saves/chess/no_explain/special_tokens_map.json b/saves/chess/no_explain/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..df5c3a478b842fa66e6a8c10265478284c1d4f41 --- /dev/null +++ b/saves/chess/no_explain/special_tokens_map.json @@ -0,0 +1,33 @@ +{ + "additional_special_tokens": [ + { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } + ], + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/saves/chess/no_explain/tokenizer.json b/saves/chess/no_explain/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..172311123ab62378f1f6d90f3068a676b7d939ed --- /dev/null +++ b/saves/chess/no_explain/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1dcab308e7cf5970ea38815e0a62887d705c5b436f869ca27a5dcdd40c36a6 +size 17210148 diff --git a/saves/chess/no_explain/tokenizer_config.json b/saves/chess/no_explain/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e2afd45c14285320f15500548266d7adba98d07a --- /dev/null +++ b/saves/chess/no_explain/tokenizer_config.json @@ -0,0 +1,2078 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|eot_id|>", + "<|eom_id|>" + ], + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 2048, + "pad_token": "<|eot_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/saves/chess/no_explain/train_results.json b/saves/chess/no_explain/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c58e8809ed2652c7111bfcda237889a42876c8 --- /dev/null +++ b/saves/chess/no_explain/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 9.992390869042852, + "total_flos": 2784163811819520.0, + "train_loss": 0.025371345406674895, + "train_runtime": 36703.9164, + "train_samples_per_second": 348.252, + "train_steps_per_second": 0.34 +} \ No newline at end of file diff --git a/saves/chess/no_explain/trainer_log.jsonl b/saves/chess/no_explain/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..efe39868675c10cb0559001434ec3491d20777a5 --- /dev/null +++ b/saves/chess/no_explain/trainer_log.jsonl @@ -0,0 +1,137 @@ +{"current_steps": 100, "total_steps": 12480, "loss": 1.3897, "lr": 4.006410256410257e-07, "epoch": 0.08009611533840609, "percentage": 0.8, "elapsed_time": "0:04:40", "remaining_time": "9:38:28"} +{"current_steps": 200, "total_steps": 12480, "loss": 0.0598, "lr": 8.012820512820515e-07, "epoch": 0.16019223067681218, "percentage": 1.6, "elapsed_time": "0:09:11", "remaining_time": "9:24:06"} +{"current_steps": 300, "total_steps": 12480, "loss": 0.0551, "lr": 1.201923076923077e-06, "epoch": 0.24028834601521826, "percentage": 2.4, "elapsed_time": "0:13:43", "remaining_time": "9:16:55"} +{"current_steps": 400, "total_steps": 12480, "loss": 0.0516, "lr": 1.602564102564103e-06, "epoch": 0.32038446135362436, "percentage": 3.21, "elapsed_time": "0:18:48", "remaining_time": "9:27:55"} +{"current_steps": 500, "total_steps": 12480, "loss": 0.0501, "lr": 2.0032051282051286e-06, "epoch": 0.4004805766920304, "percentage": 4.01, "elapsed_time": "0:23:18", "remaining_time": "9:18:38"} +{"current_steps": 600, "total_steps": 12480, "loss": 0.0486, "lr": 2.403846153846154e-06, "epoch": 0.4805766920304365, "percentage": 4.81, "elapsed_time": "0:27:49", "remaining_time": "9:10:52"} +{"current_steps": 700, "total_steps": 12480, "loss": 0.0463, "lr": 2.8044871794871797e-06, "epoch": 0.5606728073688426, "percentage": 5.61, "elapsed_time": "0:32:36", "remaining_time": "9:08:45"} +{"current_steps": 800, "total_steps": 12480, "loss": 0.0447, "lr": 3.205128205128206e-06, "epoch": 0.6407689227072487, "percentage": 6.41, "elapsed_time": "0:38:34", "remaining_time": "9:23:16"} +{"current_steps": 900, "total_steps": 12480, "loss": 0.0441, "lr": 3.605769230769231e-06, "epoch": 0.7208650380456548, "percentage": 7.21, "elapsed_time": "0:43:53", "remaining_time": "9:24:43"} +{"current_steps": 1000, "total_steps": 12480, "loss": 0.0429, "lr": 4.006410256410257e-06, "epoch": 0.8009611533840608, "percentage": 8.01, "elapsed_time": "0:48:23", "remaining_time": "9:15:32"} +{"current_steps": 1000, "total_steps": 12480, "eval_loss": 0.042210426181554794, "epoch": 0.8009611533840608, "percentage": 8.01, "elapsed_time": "0:50:00", "remaining_time": "9:34:08"} +{"current_steps": 1100, "total_steps": 12480, "loss": 0.0414, "lr": 4.4070512820512826e-06, "epoch": 0.8810572687224669, "percentage": 8.81, "elapsed_time": "0:55:40", "remaining_time": "9:35:57"} +{"current_steps": 1200, "total_steps": 12480, "loss": 0.0402, "lr": 4.807692307692308e-06, "epoch": 0.961153384060873, "percentage": 9.62, "elapsed_time": "1:00:10", "remaining_time": "9:25:41"} +{"current_steps": 1300, "total_steps": 12480, "loss": 0.0386, "lr": 4.999735579817769e-06, "epoch": 1.0408490188225872, "percentage": 10.42, "elapsed_time": "1:04:57", "remaining_time": "9:18:40"} +{"current_steps": 1400, "total_steps": 12480, "loss": 0.0373, "lr": 4.997740994288484e-06, "epoch": 1.1209451341609933, "percentage": 11.22, "elapsed_time": "1:09:28", "remaining_time": "9:09:46"} +{"current_steps": 1500, "total_steps": 12480, "loss": 0.0366, "lr": 4.993792498360407e-06, "epoch": 1.2010412494993994, "percentage": 12.02, "elapsed_time": "1:13:57", "remaining_time": "9:01:25"} +{"current_steps": 1600, "total_steps": 12480, "loss": 0.0357, "lr": 4.9878931808274796e-06, "epoch": 1.2811373648378055, "percentage": 12.82, "elapsed_time": "1:18:28", "remaining_time": "8:53:34"} +{"current_steps": 1700, "total_steps": 12480, "loss": 0.0352, "lr": 4.980047656554856e-06, "epoch": 1.3612334801762114, "percentage": 13.62, "elapsed_time": "1:23:22", "remaining_time": "8:48:39"} +{"current_steps": 1800, "total_steps": 12480, "loss": 0.0346, "lr": 4.970262062868821e-06, "epoch": 1.4413295955146175, "percentage": 14.42, "elapsed_time": "1:28:25", "remaining_time": "8:44:41"} +{"current_steps": 1900, "total_steps": 12480, "loss": 0.0336, "lr": 4.958544054755741e-06, "epoch": 1.5214257108530236, "percentage": 15.22, "elapsed_time": "1:33:34", "remaining_time": "8:41:03"} +{"current_steps": 2000, "total_steps": 12480, "loss": 0.0329, "lr": 4.944902798873794e-06, "epoch": 1.6015218261914297, "percentage": 16.03, "elapsed_time": "1:38:03", "remaining_time": "8:33:50"} +{"current_steps": 2000, "total_steps": 12480, "eval_loss": 0.03361953794956207, "epoch": 1.6015218261914297, "percentage": 16.03, "elapsed_time": "1:39:40", "remaining_time": "8:42:20"} +{"current_steps": 2100, "total_steps": 12480, "loss": 0.0328, "lr": 4.92934896638215e-06, "epoch": 1.6816179415298358, "percentage": 16.83, "elapsed_time": "1:45:26", "remaining_time": "8:41:13"} +{"current_steps": 2200, "total_steps": 12480, "loss": 0.0316, "lr": 4.91189472459324e-06, "epoch": 1.761714056868242, "percentage": 17.63, "elapsed_time": "1:49:57", "remaining_time": "8:33:46"} +{"current_steps": 2300, "total_steps": 12480, "loss": 0.0317, "lr": 4.892553727454616e-06, "epoch": 1.841810172206648, "percentage": 18.43, "elapsed_time": "1:54:27", "remaining_time": "8:26:34"} +{"current_steps": 2400, "total_steps": 12480, "loss": 0.0309, "lr": 4.8713411048678635e-06, "epoch": 1.921906287545054, "percentage": 19.23, "elapsed_time": "1:58:56", "remaining_time": "8:19:34"} +{"current_steps": 2500, "total_steps": 12480, "loss": 0.0305, "lr": 4.848273450852921e-06, "epoch": 2.0016019223067683, "percentage": 20.03, "elapsed_time": "2:03:25", "remaining_time": "8:12:41"} +{"current_steps": 2600, "total_steps": 12480, "loss": 0.0268, "lr": 4.823368810567056e-06, "epoch": 2.0816980376451744, "percentage": 20.83, "elapsed_time": "2:07:54", "remaining_time": "8:06:04"} +{"current_steps": 2700, "total_steps": 12480, "loss": 0.0268, "lr": 4.796646666188663e-06, "epoch": 2.1617941529835805, "percentage": 21.63, "elapsed_time": "2:12:24", "remaining_time": "7:59:38"} +{"current_steps": 2800, "total_steps": 12480, "loss": 0.0272, "lr": 4.768127921676916e-06, "epoch": 2.2418902683219866, "percentage": 22.44, "elapsed_time": "2:17:52", "remaining_time": "7:56:40"} +{"current_steps": 2900, "total_steps": 12480, "loss": 0.0297, "lr": 4.737834886419217e-06, "epoch": 2.3219863836603922, "percentage": 23.24, "elapsed_time": "2:22:28", "remaining_time": "7:50:40"} +{"current_steps": 3000, "total_steps": 12480, "loss": 0.0275, "lr": 4.705791257779196e-06, "epoch": 2.4020824989987988, "percentage": 24.04, "elapsed_time": "2:26:58", "remaining_time": "7:44:26"} +{"current_steps": 3000, "total_steps": 12480, "eval_loss": 0.029653793200850487, "epoch": 2.4020824989987988, "percentage": 24.04, "elapsed_time": "2:28:35", "remaining_time": "7:49:33"} +{"current_steps": 3100, "total_steps": 12480, "loss": 0.0269, "lr": 4.672022102558958e-06, "epoch": 2.4821786143372044, "percentage": 24.84, "elapsed_time": "2:34:19", "remaining_time": "7:46:57"} +{"current_steps": 3200, "total_steps": 12480, "loss": 0.0269, "lr": 4.636553837390051e-06, "epoch": 2.562274729675611, "percentage": 25.64, "elapsed_time": "2:38:50", "remaining_time": "7:40:37"} +{"current_steps": 3300, "total_steps": 12480, "loss": 0.026, "lr": 4.5994142080684956e-06, "epoch": 2.6423708450140166, "percentage": 26.44, "elapsed_time": "2:43:20", "remaining_time": "7:34:23"} +{"current_steps": 3400, "total_steps": 12480, "loss": 0.026, "lr": 4.560632267850054e-06, "epoch": 2.7224669603524227, "percentage": 27.24, "elapsed_time": "2:47:51", "remaining_time": "7:28:16"} +{"current_steps": 3500, "total_steps": 12480, "loss": 0.0257, "lr": 4.5202383547227134e-06, "epoch": 2.802563075690829, "percentage": 28.04, "elapsed_time": "2:52:21", "remaining_time": "7:22:13"} +{"current_steps": 3600, "total_steps": 12480, "loss": 0.0256, "lr": 4.478264067674155e-06, "epoch": 2.882659191029235, "percentage": 28.85, "elapsed_time": "2:56:52", "remaining_time": "7:16:16"} +{"current_steps": 3700, "total_steps": 12480, "loss": 0.0255, "lr": 4.43474224197278e-06, "epoch": 2.962755306367641, "percentage": 29.65, "elapsed_time": "3:01:22", "remaining_time": "7:10:24"} +{"current_steps": 3800, "total_steps": 12480, "loss": 0.0224, "lr": 4.389706923481633e-06, "epoch": 3.0424509411293554, "percentage": 30.45, "elapsed_time": "3:05:51", "remaining_time": "7:04:33"} +{"current_steps": 3900, "total_steps": 12480, "loss": 0.0198, "lr": 4.34319334202531e-06, "epoch": 3.122547056467761, "percentage": 31.25, "elapsed_time": "3:10:21", "remaining_time": "6:58:46"} +{"current_steps": 4000, "total_steps": 12480, "loss": 0.0202, "lr": 4.2952378838306855e-06, "epoch": 3.202643171806167, "percentage": 32.05, "elapsed_time": "3:14:50", "remaining_time": "6:53:03"} +{"current_steps": 4000, "total_steps": 12480, "eval_loss": 0.029243575409054756, "epoch": 3.202643171806167, "percentage": 32.05, "elapsed_time": "3:16:28", "remaining_time": "6:56:30"} +{"current_steps": 4100, "total_steps": 12480, "loss": 0.0205, "lr": 4.245878063063022e-06, "epoch": 3.2827392871445733, "percentage": 32.85, "elapsed_time": "3:23:36", "remaining_time": "6:56:10"} +{"current_steps": 4200, "total_steps": 12480, "loss": 0.0205, "lr": 4.195152492479727e-06, "epoch": 3.3628354024829794, "percentage": 33.65, "elapsed_time": "3:28:05", "remaining_time": "6:50:13"} +{"current_steps": 4300, "total_steps": 12480, "loss": 0.0204, "lr": 4.143100853224714e-06, "epoch": 3.4429315178213855, "percentage": 34.46, "elapsed_time": "3:32:34", "remaining_time": "6:44:22"} +{"current_steps": 4400, "total_steps": 12480, "loss": 0.0204, "lr": 4.089763863786988e-06, "epoch": 3.5230276331597916, "percentage": 35.26, "elapsed_time": "3:37:04", "remaining_time": "6:38:36"} +{"current_steps": 4500, "total_steps": 12480, "loss": 0.0201, "lr": 4.035183248147752e-06, "epoch": 3.6031237484981977, "percentage": 36.06, "elapsed_time": "3:41:34", "remaining_time": "6:32:55"} +{"current_steps": 4600, "total_steps": 12480, "loss": 0.02, "lr": 3.979401703140955e-06, "epoch": 3.683219863836604, "percentage": 36.86, "elapsed_time": "3:46:04", "remaining_time": "6:27:16"} +{"current_steps": 4700, "total_steps": 12480, "loss": 0.0206, "lr": 3.922462865052782e-06, "epoch": 3.76331597917501, "percentage": 37.66, "elapsed_time": "3:50:35", "remaining_time": "6:21:41"} +{"current_steps": 4800, "total_steps": 12480, "loss": 0.0199, "lr": 3.8644112754862614e-06, "epoch": 3.843412094513416, "percentage": 38.46, "elapsed_time": "3:55:04", "remaining_time": "6:16:07"} +{"current_steps": 4900, "total_steps": 12480, "loss": 0.0199, "lr": 3.805292346517659e-06, "epoch": 3.923508209851822, "percentage": 39.26, "elapsed_time": "3:59:34", "remaining_time": "6:10:36"} +{"current_steps": 5000, "total_steps": 12480, "loss": 0.0194, "lr": 3.745152325171921e-06, "epoch": 4.0032038446135365, "percentage": 40.06, "elapsed_time": "4:04:02", "remaining_time": "6:05:05"} +{"current_steps": 5000, "total_steps": 12480, "eval_loss": 0.02939535118639469, "epoch": 4.0032038446135365, "percentage": 40.06, "elapsed_time": "4:05:40", "remaining_time": "6:07:31"} +{"current_steps": 5100, "total_steps": 12480, "loss": 0.0109, "lr": 3.6840382572449733e-06, "epoch": 4.083299959951942, "percentage": 40.87, "elapsed_time": "4:11:38", "remaining_time": "6:04:09"} +{"current_steps": 5200, "total_steps": 12480, "loss": 0.0116, "lr": 3.621997950501156e-06, "epoch": 4.163396075290349, "percentage": 41.67, "elapsed_time": "4:16:54", "remaining_time": "5:59:40"} +{"current_steps": 5300, "total_steps": 12480, "loss": 0.0119, "lr": 3.5590799372745915e-06, "epoch": 4.243492190628754, "percentage": 42.47, "elapsed_time": "4:21:24", "remaining_time": "5:54:07"} +{"current_steps": 5400, "total_steps": 12480, "loss": 0.0125, "lr": 3.495333436503753e-06, "epoch": 4.323588305967161, "percentage": 43.27, "elapsed_time": "4:25:54", "remaining_time": "5:48:37"} +{"current_steps": 5500, "total_steps": 12480, "loss": 0.0122, "lr": 3.4308083152289073e-06, "epoch": 4.403684421305567, "percentage": 44.07, "elapsed_time": "4:30:25", "remaining_time": "5:43:11"} +{"current_steps": 5600, "total_steps": 12480, "loss": 0.0121, "lr": 3.3655550495825824e-06, "epoch": 4.483780536643973, "percentage": 44.87, "elapsed_time": "4:34:54", "remaining_time": "5:37:45"} +{"current_steps": 5700, "total_steps": 12480, "loss": 0.0123, "lr": 3.2996246853035417e-06, "epoch": 4.563876651982379, "percentage": 45.67, "elapsed_time": "4:39:24", "remaining_time": "5:32:20"} +{"current_steps": 5800, "total_steps": 12480, "loss": 0.0121, "lr": 3.233068797805194e-06, "epoch": 4.6439727673207845, "percentage": 46.47, "elapsed_time": "4:43:54", "remaining_time": "5:26:59"} +{"current_steps": 5900, "total_steps": 12480, "loss": 0.0121, "lr": 3.1659394518296303e-06, "epoch": 4.724068882659191, "percentage": 47.28, "elapsed_time": "4:48:26", "remaining_time": "5:21:41"} +{"current_steps": 6000, "total_steps": 12480, "loss": 0.0119, "lr": 3.0982891607188948e-06, "epoch": 4.8041649979975976, "percentage": 48.08, "elapsed_time": "4:53:09", "remaining_time": "5:16:36"} +{"current_steps": 6000, "total_steps": 12480, "eval_loss": 0.031129568815231323, "epoch": 4.8041649979975976, "percentage": 48.08, "elapsed_time": "4:54:46", "remaining_time": "5:18:21"} +{"current_steps": 6100, "total_steps": 12480, "loss": 0.0121, "lr": 3.0301708453353118e-06, "epoch": 4.884261113336003, "percentage": 48.88, "elapsed_time": "5:00:29", "remaining_time": "5:14:17"} +{"current_steps": 6200, "total_steps": 12480, "loss": 0.0114, "lr": 2.961637792663032e-06, "epoch": 4.964357228674409, "percentage": 49.68, "elapsed_time": "5:05:00", "remaining_time": "5:08:56"} +{"current_steps": 6300, "total_steps": 12480, "loss": 0.0075, "lr": 2.8927436141231695e-06, "epoch": 5.044052863436123, "percentage": 50.48, "elapsed_time": "5:09:29", "remaining_time": "5:03:35"} +{"current_steps": 6400, "total_steps": 12480, "loss": 0.0044, "lr": 2.8235422036351384e-06, "epoch": 5.12414897877453, "percentage": 51.28, "elapsed_time": "5:13:58", "remaining_time": "4:58:16"} +{"current_steps": 6500, "total_steps": 12480, "loss": 0.0043, "lr": 2.754087695457005e-06, "epoch": 5.2042450941129355, "percentage": 52.08, "elapsed_time": "5:18:27", "remaining_time": "4:52:58"} +{"current_steps": 6600, "total_steps": 12480, "loss": 0.0045, "lr": 2.684434421837821e-06, "epoch": 5.284341209451342, "percentage": 52.88, "elapsed_time": "5:24:17", "remaining_time": "4:48:55"} +{"current_steps": 6700, "total_steps": 12480, "loss": 0.0048, "lr": 2.6146368705150854e-06, "epoch": 5.364437324789748, "percentage": 53.69, "elapsed_time": "5:29:17", "remaining_time": "4:44:04"} +{"current_steps": 6800, "total_steps": 12480, "loss": 0.0045, "lr": 2.5447496420905608e-06, "epoch": 5.444533440128154, "percentage": 54.49, "elapsed_time": "5:33:46", "remaining_time": "4:38:48"} +{"current_steps": 6900, "total_steps": 12480, "loss": 0.0045, "lr": 2.4748274073178114e-06, "epoch": 5.52462955546656, "percentage": 55.29, "elapsed_time": "5:38:17", "remaining_time": "4:33:34"} +{"current_steps": 7000, "total_steps": 12480, "loss": 0.0048, "lr": 2.4049248643348512e-06, "epoch": 5.6047256708049655, "percentage": 56.09, "elapsed_time": "5:42:47", "remaining_time": "4:28:21"} +{"current_steps": 7000, "total_steps": 12480, "eval_loss": 0.04388193413615227, "epoch": 5.6047256708049655, "percentage": 56.09, "elapsed_time": "5:44:24", "remaining_time": "4:29:37"} +{"current_steps": 7100, "total_steps": 12480, "loss": 0.0043, "lr": 2.3350966958753766e-06, "epoch": 5.684821786143372, "percentage": 56.89, "elapsed_time": "5:50:02", "remaining_time": "4:25:14"} +{"current_steps": 7200, "total_steps": 12480, "loss": 0.0043, "lr": 2.265397526492052e-06, "epoch": 5.764917901481779, "percentage": 57.69, "elapsed_time": "5:54:32", "remaining_time": "4:19:59"} +{"current_steps": 7300, "total_steps": 12480, "loss": 0.0043, "lr": 2.195881879825301e-06, "epoch": 5.845014016820184, "percentage": 58.49, "elapsed_time": "5:59:01", "remaining_time": "4:14:45"} +{"current_steps": 7400, "total_steps": 12480, "loss": 0.0043, "lr": 2.1266041359510456e-06, "epoch": 5.92511013215859, "percentage": 59.29, "elapsed_time": "6:03:31", "remaining_time": "4:09:33"} +{"current_steps": 7500, "total_steps": 12480, "loss": 0.004, "lr": 2.057618488840745e-06, "epoch": 6.004805766920304, "percentage": 60.1, "elapsed_time": "6:07:59", "remaining_time": "4:04:20"} +{"current_steps": 7600, "total_steps": 12480, "loss": 0.0014, "lr": 1.9889789039670276e-06, "epoch": 6.084901882258711, "percentage": 60.9, "elapsed_time": "6:12:29", "remaining_time": "3:59:10"} +{"current_steps": 7700, "total_steps": 12480, "loss": 0.0014, "lr": 1.9207390760880605e-06, "epoch": 6.1649979975971165, "percentage": 61.7, "elapsed_time": "6:16:59", "remaining_time": "3:54:02"} +{"current_steps": 7800, "total_steps": 12480, "loss": 0.0013, "lr": 1.852952387243698e-06, "epoch": 6.245094112935522, "percentage": 62.5, "elapsed_time": "6:21:30", "remaining_time": "3:48:54"} +{"current_steps": 7900, "total_steps": 12480, "loss": 0.0013, "lr": 1.7856718649962606e-06, "epoch": 6.325190228273929, "percentage": 63.3, "elapsed_time": "6:26:00", "remaining_time": "3:43:47"} +{"current_steps": 8000, "total_steps": 12480, "loss": 0.0013, "lr": 1.7189501409486061e-06, "epoch": 6.405286343612334, "percentage": 64.1, "elapsed_time": "6:31:09", "remaining_time": "3:39:02"} +{"current_steps": 8000, "total_steps": 12480, "eval_loss": 0.053785648196935654, "epoch": 6.405286343612334, "percentage": 64.1, "elapsed_time": "6:32:46", "remaining_time": "3:39:57"} +{"current_steps": 8100, "total_steps": 12480, "loss": 0.0013, "lr": 1.6528394095719558e-06, "epoch": 6.485382458950741, "percentage": 64.9, "elapsed_time": "6:38:28", "remaining_time": "3:35:28"} +{"current_steps": 8200, "total_steps": 12480, "loss": 0.0014, "lr": 1.587391387375669e-06, "epoch": 6.565478574289147, "percentage": 65.71, "elapsed_time": "6:42:59", "remaining_time": "3:30:20"} +{"current_steps": 8300, "total_steps": 12480, "loss": 0.0013, "lr": 1.522657272450917e-06, "epoch": 6.645574689627553, "percentage": 66.51, "elapsed_time": "6:47:29", "remaining_time": "3:25:12"} +{"current_steps": 8400, "total_steps": 12480, "loss": 0.0014, "lr": 1.4586877044199015e-06, "epoch": 6.725670804965959, "percentage": 67.31, "elapsed_time": "6:51:59", "remaining_time": "3:20:06"} +{"current_steps": 8500, "total_steps": 12480, "loss": 0.0014, "lr": 1.3955327248219438e-06, "epoch": 6.805766920304365, "percentage": 68.11, "elapsed_time": "6:56:29", "remaining_time": "3:15:01"} +{"current_steps": 8600, "total_steps": 12480, "loss": 0.0012, "lr": 1.3332417379674426e-06, "epoch": 6.885863035642771, "percentage": 68.91, "elapsed_time": "7:00:59", "remaining_time": "3:09:56"} +{"current_steps": 8700, "total_steps": 12480, "loss": 0.0013, "lr": 1.2718634722903073e-06, "epoch": 6.965959150981178, "percentage": 69.71, "elapsed_time": "7:05:35", "remaining_time": "3:04:54"} +{"current_steps": 8800, "total_steps": 12480, "loss": 0.0007, "lr": 1.2114459422291205e-06, "epoch": 7.045654785742891, "percentage": 70.51, "elapsed_time": "7:10:19", "remaining_time": "2:59:57"} +{"current_steps": 8900, "total_steps": 12480, "loss": 0.0003, "lr": 1.1520364106668342e-06, "epoch": 7.125750901081298, "percentage": 71.31, "elapsed_time": "7:14:48", "remaining_time": "2:54:54"} +{"current_steps": 9000, "total_steps": 12480, "loss": 0.0004, "lr": 1.093681351958383e-06, "epoch": 7.205847016419703, "percentage": 72.12, "elapsed_time": "7:19:18", "remaining_time": "2:49:52"} +{"current_steps": 9000, "total_steps": 12480, "eval_loss": 0.06704169511795044, "epoch": 7.205847016419703, "percentage": 72.12, "elapsed_time": "7:20:56", "remaining_time": "2:50:29"} +{"current_steps": 9100, "total_steps": 12480, "loss": 0.0004, "lr": 1.0364264155751489e-06, "epoch": 7.28594313175811, "percentage": 72.92, "elapsed_time": "7:27:05", "remaining_time": "2:46:03"} +{"current_steps": 9200, "total_steps": 12480, "loss": 0.0004, "lr": 9.803163903946952e-07, "epoch": 7.3660392470965155, "percentage": 73.72, "elapsed_time": "7:31:34", "remaining_time": "2:40:59"} +{"current_steps": 9300, "total_steps": 12480, "loss": 0.0004, "lr": 9.253951696637311e-07, "epoch": 7.446135362434922, "percentage": 74.52, "elapsed_time": "7:36:03", "remaining_time": "2:35:56"} +{"current_steps": 9400, "total_steps": 12480, "loss": 0.0003, "lr": 8.717057166616926e-07, "epoch": 7.526231477773328, "percentage": 75.32, "elapsed_time": "7:41:16", "remaining_time": "2:31:08"} +{"current_steps": 9500, "total_steps": 12480, "loss": 0.0004, "lr": 8.192900310918206e-07, "epoch": 7.606327593111734, "percentage": 76.12, "elapsed_time": "7:45:45", "remaining_time": "2:26:06"} +{"current_steps": 9600, "total_steps": 12480, "loss": 0.0004, "lr": 7.681891162260016e-07, "epoch": 7.68642370845014, "percentage": 76.92, "elapsed_time": "7:50:16", "remaining_time": "2:21:04"} +{"current_steps": 9700, "total_steps": 12480, "loss": 0.0003, "lr": 7.184429468291023e-07, "epoch": 7.766519823788546, "percentage": 77.72, "elapsed_time": "7:54:46", "remaining_time": "2:16:04"} +{"current_steps": 9800, "total_steps": 12480, "loss": 0.0004, "lr": 6.700904378878675e-07, "epoch": 7.846615939126952, "percentage": 78.53, "elapsed_time": "7:59:16", "remaining_time": "2:11:03"} +{"current_steps": 9900, "total_steps": 12480, "loss": 0.0003, "lr": 6.231694141688535e-07, "epoch": 7.926712054465359, "percentage": 79.33, "elapsed_time": "8:03:45", "remaining_time": "2:06:04"} +{"current_steps": 10000, "total_steps": 12480, "loss": 0.0003, "lr": 5.777165806292109e-07, "epoch": 8.006407689227073, "percentage": 80.13, "elapsed_time": "8:08:13", "remaining_time": "2:01:04"} +{"current_steps": 10000, "total_steps": 12480, "eval_loss": 0.06978683918714523, "epoch": 8.006407689227073, "percentage": 80.13, "elapsed_time": "8:09:50", "remaining_time": "2:01:28"} +{"current_steps": 10100, "total_steps": 12480, "loss": 0.0001, "lr": 5.337674937034581e-07, "epoch": 8.086503804565478, "percentage": 80.93, "elapsed_time": "8:15:31", "remaining_time": "1:56:46"} +{"current_steps": 10200, "total_steps": 12480, "loss": 0.0001, "lr": 4.913565334887135e-07, "epoch": 8.166599919903884, "percentage": 81.73, "elapsed_time": "8:20:01", "remaining_time": "1:51:46"} +{"current_steps": 10300, "total_steps": 12480, "loss": 0.0001, "lr": 4.505168768501431e-07, "epoch": 8.246696035242291, "percentage": 82.53, "elapsed_time": "8:24:32", "remaining_time": "1:46:47"} +{"current_steps": 10400, "total_steps": 12480, "loss": 0.0001, "lr": 4.1128047146765936e-07, "epoch": 8.326792150580697, "percentage": 83.33, "elapsed_time": "8:29:02", "remaining_time": "1:41:48"} +{"current_steps": 10500, "total_steps": 12480, "loss": 0.0001, "lr": 3.736780108441762e-07, "epoch": 8.406888265919102, "percentage": 84.13, "elapsed_time": "8:33:32", "remaining_time": "1:36:50"} +{"current_steps": 10600, "total_steps": 12480, "loss": 0.0001, "lr": 3.3773891029497326e-07, "epoch": 8.486984381257509, "percentage": 84.94, "elapsed_time": "8:38:01", "remaining_time": "1:31:52"} +{"current_steps": 10700, "total_steps": 12480, "loss": 0.0, "lr": 3.034912839369447e-07, "epoch": 8.567080496595915, "percentage": 85.74, "elapsed_time": "8:42:32", "remaining_time": "1:26:55"} +{"current_steps": 10800, "total_steps": 12480, "loss": 0.0001, "lr": 2.70961922695743e-07, "epoch": 8.647176611934322, "percentage": 86.54, "elapsed_time": "8:47:03", "remaining_time": "1:21:59"} +{"current_steps": 10900, "total_steps": 12480, "loss": 0.0, "lr": 2.401762733480115e-07, "epoch": 8.727272727272727, "percentage": 87.34, "elapsed_time": "8:51:32", "remaining_time": "1:17:02"} +{"current_steps": 11000, "total_steps": 12480, "loss": 0.0, "lr": 2.1115841861510945e-07, "epoch": 8.807368842611133, "percentage": 88.14, "elapsed_time": "8:56:02", "remaining_time": "1:12:07"} +{"current_steps": 11000, "total_steps": 12480, "eval_loss": 0.08943355828523636, "epoch": 8.807368842611133, "percentage": 88.14, "elapsed_time": "8:57:39", "remaining_time": "1:12:20"} +{"current_steps": 11100, "total_steps": 12480, "loss": 0.0, "lr": 1.8393105832389791e-07, "epoch": 8.88746495794954, "percentage": 88.94, "elapsed_time": "9:03:20", "remaining_time": "1:07:32"} +{"current_steps": 11200, "total_steps": 12480, "loss": 0.0001, "lr": 1.5851549164932118e-07, "epoch": 8.967561073287946, "percentage": 89.74, "elapsed_time": "9:07:51", "remaining_time": "1:02:36"} +{"current_steps": 11300, "total_steps": 12480, "loss": 0.0, "lr": 1.349316004526824e-07, "epoch": 9.047256708049659, "percentage": 90.54, "elapsed_time": "9:12:19", "remaining_time": "0:57:40"} +{"current_steps": 11400, "total_steps": 12480, "loss": 0.0, "lr": 1.1319783372863601e-07, "epoch": 9.127352823388065, "percentage": 91.35, "elapsed_time": "9:16:49", "remaining_time": "0:52:45"} +{"current_steps": 11500, "total_steps": 12480, "loss": 0.0, "lr": 9.333119317307598e-08, "epoch": 9.207448938726472, "percentage": 92.15, "elapsed_time": "9:22:55", "remaining_time": "0:47:58"} +{"current_steps": 11600, "total_steps": 12480, "loss": 0.0, "lr": 7.534721988320143e-08, "epoch": 9.287545054064879, "percentage": 92.95, "elapsed_time": "9:27:23", "remaining_time": "0:43:02"} +{"current_steps": 11700, "total_steps": 12480, "loss": 0.0, "lr": 5.92599822001666e-08, "epoch": 9.367641169403283, "percentage": 93.75, "elapsed_time": "9:31:54", "remaining_time": "0:38:07"} +{"current_steps": 11800, "total_steps": 12480, "loss": 0.0, "lr": 4.508206470382554e-08, "epoch": 9.44773728474169, "percentage": 94.55, "elapsed_time": "9:36:24", "remaining_time": "0:33:12"} +{"current_steps": 11900, "total_steps": 12480, "loss": 0.0, "lr": 3.2824558368179384e-08, "epoch": 9.527833400080096, "percentage": 95.35, "elapsed_time": "9:40:53", "remaining_time": "0:28:18"} +{"current_steps": 12000, "total_steps": 12480, "loss": 0.0, "lr": 2.2497051885228825e-08, "epoch": 9.607929515418503, "percentage": 96.15, "elapsed_time": "9:45:23", "remaining_time": "0:23:24"} +{"current_steps": 12000, "total_steps": 12480, "eval_loss": 0.09308738261461258, "epoch": 9.607929515418503, "percentage": 96.15, "elapsed_time": "9:47:00", "remaining_time": "0:23:28"} +{"current_steps": 12100, "total_steps": 12480, "loss": 0.0, "lr": 1.4107624164019229e-08, "epoch": 9.688025630756908, "percentage": 96.96, "elapsed_time": "9:52:45", "remaining_time": "0:18:36"} +{"current_steps": 12200, "total_steps": 12480, "loss": 0.0, "lr": 7.662838010742413e-09, "epoch": 9.768121746095314, "percentage": 97.76, "elapsed_time": "9:57:15", "remaining_time": "0:13:42"} +{"current_steps": 12300, "total_steps": 12480, "loss": 0.0, "lr": 3.1677349948461277e-09, "epoch": 9.84821786143372, "percentage": 98.56, "elapsed_time": "10:02:03", "remaining_time": "0:08:48"} +{"current_steps": 12400, "total_steps": 12480, "loss": 0.0, "lr": 6.258315051568819e-10, "epoch": 9.928313976772127, "percentage": 99.36, "elapsed_time": "10:06:35", "remaining_time": "0:03:54"} +{"current_steps": 12480, "total_steps": 12480, "epoch": 9.992390869042852, "percentage": 100.0, "elapsed_time": "10:11:26", "remaining_time": "0:00:00"} diff --git a/saves/chess/no_explain/trainer_state.json b/saves/chess/no_explain/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af4d2fe1a9790a26b58864bbc3de6206401e79ff --- /dev/null +++ b/saves/chess/no_explain/trainer_state.json @@ -0,0 +1,1006 @@ +{ + "best_metric": 0.029243575409054756, + "best_model_checkpoint": "saves/chess/no_explain/checkpoint-4000", + "epoch": 9.992390869042852, + "eval_steps": 1000, + "global_step": 12480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08009611533840609, + "grad_norm": 0.8625897724596373, + "learning_rate": 4.006410256410257e-07, + "loss": 1.3897, + "step": 100 + }, + { + "epoch": 0.16019223067681218, + "grad_norm": 0.8895947937892531, + "learning_rate": 8.012820512820515e-07, + "loss": 0.0598, + "step": 200 + }, + { + "epoch": 0.24028834601521826, + "grad_norm": 0.5221246844134636, + "learning_rate": 1.201923076923077e-06, + "loss": 0.0551, + "step": 300 + }, + { + "epoch": 0.32038446135362436, + "grad_norm": 0.5590357289952654, + "learning_rate": 1.602564102564103e-06, + "loss": 0.0516, + "step": 400 + }, + { + "epoch": 0.4004805766920304, + "grad_norm": 0.36991974174438536, + "learning_rate": 2.0032051282051286e-06, + "loss": 0.0501, + "step": 500 + }, + { + "epoch": 0.4805766920304365, + "grad_norm": 0.6389443947236714, + "learning_rate": 2.403846153846154e-06, + "loss": 0.0486, + "step": 600 + }, + { + "epoch": 0.5606728073688426, + "grad_norm": 0.44563280571067243, + "learning_rate": 2.8044871794871797e-06, + "loss": 0.0463, + "step": 700 + }, + { + "epoch": 0.6407689227072487, + "grad_norm": 0.44266380357676305, + "learning_rate": 3.205128205128206e-06, + "loss": 0.0447, + "step": 800 + }, + { + "epoch": 0.7208650380456548, + "grad_norm": 0.585654631503778, + "learning_rate": 3.605769230769231e-06, + "loss": 0.0441, + "step": 900 + }, + { + "epoch": 0.8009611533840608, + "grad_norm": 0.600751877456253, + "learning_rate": 4.006410256410257e-06, + "loss": 0.0429, + "step": 1000 + }, + { + "epoch": 0.8009611533840608, + "eval_loss": 0.042210426181554794, + "eval_runtime": 97.133, + "eval_samples_per_second": 1462.17, + "eval_steps_per_second": 2.862, + "step": 1000 + }, + { + "epoch": 0.8810572687224669, + "grad_norm": 0.2641551118831142, + "learning_rate": 4.4070512820512826e-06, + "loss": 0.0414, + "step": 1100 + }, + { + "epoch": 0.961153384060873, + "grad_norm": 0.29049561928975876, + "learning_rate": 4.807692307692308e-06, + "loss": 0.0402, + "step": 1200 + }, + { + "epoch": 1.0408490188225872, + "grad_norm": 0.5344113116420023, + "learning_rate": 4.999735579817769e-06, + "loss": 0.0386, + "step": 1300 + }, + { + "epoch": 1.1209451341609933, + "grad_norm": 0.31257482202449377, + "learning_rate": 4.997740994288484e-06, + "loss": 0.0373, + "step": 1400 + }, + { + "epoch": 1.2010412494993994, + "grad_norm": 0.4593106982622164, + "learning_rate": 4.993792498360407e-06, + "loss": 0.0366, + "step": 1500 + }, + { + "epoch": 1.2811373648378055, + "grad_norm": 0.2012883704449717, + "learning_rate": 4.9878931808274796e-06, + "loss": 0.0357, + "step": 1600 + }, + { + "epoch": 1.3612334801762114, + "grad_norm": 0.22908626001592647, + "learning_rate": 4.980047656554856e-06, + "loss": 0.0352, + "step": 1700 + }, + { + "epoch": 1.4413295955146175, + "grad_norm": 0.3169879320183415, + "learning_rate": 4.970262062868821e-06, + "loss": 0.0346, + "step": 1800 + }, + { + "epoch": 1.5214257108530236, + "grad_norm": 0.2078878255601618, + "learning_rate": 4.958544054755741e-06, + "loss": 0.0336, + "step": 1900 + }, + { + "epoch": 1.6015218261914297, + "grad_norm": 0.2978110993331312, + "learning_rate": 4.944902798873794e-06, + "loss": 0.0329, + "step": 2000 + }, + { + "epoch": 1.6015218261914297, + "eval_loss": 0.03361953794956207, + "eval_runtime": 97.2876, + "eval_samples_per_second": 1459.847, + "eval_steps_per_second": 2.858, + "step": 2000 + }, + { + "epoch": 1.6816179415298358, + "grad_norm": 0.16678424956102253, + "learning_rate": 4.92934896638215e-06, + "loss": 0.0328, + "step": 2100 + }, + { + "epoch": 1.761714056868242, + "grad_norm": 0.19029664571581045, + "learning_rate": 4.91189472459324e-06, + "loss": 0.0316, + "step": 2200 + }, + { + "epoch": 1.841810172206648, + "grad_norm": 0.2388908631462674, + "learning_rate": 4.892553727454616e-06, + "loss": 0.0317, + "step": 2300 + }, + { + "epoch": 1.921906287545054, + "grad_norm": 0.15794270702360638, + "learning_rate": 4.8713411048678635e-06, + "loss": 0.0309, + "step": 2400 + }, + { + "epoch": 2.0016019223067683, + "grad_norm": 0.2103115075663395, + "learning_rate": 4.848273450852921e-06, + "loss": 0.0305, + "step": 2500 + }, + { + "epoch": 2.0816980376451744, + "grad_norm": 0.28601246983481904, + "learning_rate": 4.823368810567056e-06, + "loss": 0.0268, + "step": 2600 + }, + { + "epoch": 2.1617941529835805, + "grad_norm": 0.25522616878445004, + "learning_rate": 4.796646666188663e-06, + "loss": 0.0268, + "step": 2700 + }, + { + "epoch": 2.2418902683219866, + "grad_norm": 0.2343538332348778, + "learning_rate": 4.768127921676916e-06, + "loss": 0.0272, + "step": 2800 + }, + { + "epoch": 2.3219863836603922, + "grad_norm": 0.22903658893889398, + "learning_rate": 4.737834886419217e-06, + "loss": 0.0297, + "step": 2900 + }, + { + "epoch": 2.4020824989987988, + "grad_norm": 0.19855668130980528, + "learning_rate": 4.705791257779196e-06, + "loss": 0.0275, + "step": 3000 + }, + { + "epoch": 2.4020824989987988, + "eval_loss": 0.029653793200850487, + "eval_runtime": 97.2179, + "eval_samples_per_second": 1460.893, + "eval_steps_per_second": 2.86, + "step": 3000 + }, + { + "epoch": 2.4821786143372044, + "grad_norm": 0.1868527106405498, + "learning_rate": 4.672022102558958e-06, + "loss": 0.0269, + "step": 3100 + }, + { + "epoch": 2.562274729675611, + "grad_norm": 0.1985255713449175, + "learning_rate": 4.636553837390051e-06, + "loss": 0.0269, + "step": 3200 + }, + { + "epoch": 2.6423708450140166, + "grad_norm": 0.17528235376425527, + "learning_rate": 4.5994142080684956e-06, + "loss": 0.026, + "step": 3300 + }, + { + "epoch": 2.7224669603524227, + "grad_norm": 0.20238382028782428, + "learning_rate": 4.560632267850054e-06, + "loss": 0.026, + "step": 3400 + }, + { + "epoch": 2.802563075690829, + "grad_norm": 0.20789525240306345, + "learning_rate": 4.5202383547227134e-06, + "loss": 0.0257, + "step": 3500 + }, + { + "epoch": 2.882659191029235, + "grad_norm": 0.2849074845845128, + "learning_rate": 4.478264067674155e-06, + "loss": 0.0256, + "step": 3600 + }, + { + "epoch": 2.962755306367641, + "grad_norm": 0.1826392119567578, + "learning_rate": 4.43474224197278e-06, + "loss": 0.0255, + "step": 3700 + }, + { + "epoch": 3.0424509411293554, + "grad_norm": 0.3254043272458406, + "learning_rate": 4.389706923481633e-06, + "loss": 0.0224, + "step": 3800 + }, + { + "epoch": 3.122547056467761, + "grad_norm": 0.2695456046362865, + "learning_rate": 4.34319334202531e-06, + "loss": 0.0198, + "step": 3900 + }, + { + "epoch": 3.202643171806167, + "grad_norm": 0.24345073976828904, + "learning_rate": 4.2952378838306855e-06, + "loss": 0.0202, + "step": 4000 + }, + { + "epoch": 3.202643171806167, + "eval_loss": 0.029243575409054756, + "eval_runtime": 97.6159, + "eval_samples_per_second": 1454.937, + "eval_steps_per_second": 2.848, + "step": 4000 + }, + { + "epoch": 3.2827392871445733, + "grad_norm": 0.3753413906545954, + "learning_rate": 4.245878063063022e-06, + "loss": 0.0205, + "step": 4100 + }, + { + "epoch": 3.3628354024829794, + "grad_norm": 0.2460926534460345, + "learning_rate": 4.195152492479727e-06, + "loss": 0.0205, + "step": 4200 + }, + { + "epoch": 3.4429315178213855, + "grad_norm": 0.2704381094416959, + "learning_rate": 4.143100853224714e-06, + "loss": 0.0204, + "step": 4300 + }, + { + "epoch": 3.5230276331597916, + "grad_norm": 0.32177852781904165, + "learning_rate": 4.089763863786988e-06, + "loss": 0.0204, + "step": 4400 + }, + { + "epoch": 3.6031237484981977, + "grad_norm": 0.24794031349246146, + "learning_rate": 4.035183248147752e-06, + "loss": 0.0201, + "step": 4500 + }, + { + "epoch": 3.683219863836604, + "grad_norm": 0.2548491545100107, + "learning_rate": 3.979401703140955e-06, + "loss": 0.02, + "step": 4600 + }, + { + "epoch": 3.76331597917501, + "grad_norm": 0.28339343421860097, + "learning_rate": 3.922462865052782e-06, + "loss": 0.0206, + "step": 4700 + }, + { + "epoch": 3.843412094513416, + "grad_norm": 0.25858197249007897, + "learning_rate": 3.8644112754862614e-06, + "loss": 0.0199, + "step": 4800 + }, + { + "epoch": 3.923508209851822, + "grad_norm": 0.25917676686664276, + "learning_rate": 3.805292346517659e-06, + "loss": 0.0199, + "step": 4900 + }, + { + "epoch": 4.0032038446135365, + "grad_norm": 0.205217434085613, + "learning_rate": 3.745152325171921e-06, + "loss": 0.0194, + "step": 5000 + }, + { + "epoch": 4.0032038446135365, + "eval_loss": 0.02939535118639469, + "eval_runtime": 97.4425, + "eval_samples_per_second": 1457.526, + "eval_steps_per_second": 2.853, + "step": 5000 + }, + { + "epoch": 4.083299959951942, + "grad_norm": 0.3449210512817333, + "learning_rate": 3.6840382572449733e-06, + "loss": 0.0109, + "step": 5100 + }, + { + "epoch": 4.163396075290349, + "grad_norm": 0.35702251381560834, + "learning_rate": 3.621997950501156e-06, + "loss": 0.0116, + "step": 5200 + }, + { + "epoch": 4.243492190628754, + "grad_norm": 0.3503016584030036, + "learning_rate": 3.5590799372745915e-06, + "loss": 0.0119, + "step": 5300 + }, + { + "epoch": 4.323588305967161, + "grad_norm": 0.32698876302828034, + "learning_rate": 3.495333436503753e-06, + "loss": 0.0125, + "step": 5400 + }, + { + "epoch": 4.403684421305567, + "grad_norm": 0.29218555867917617, + "learning_rate": 3.4308083152289073e-06, + "loss": 0.0122, + "step": 5500 + }, + { + "epoch": 4.483780536643973, + "grad_norm": 0.42870050776267266, + "learning_rate": 3.3655550495825824e-06, + "loss": 0.0121, + "step": 5600 + }, + { + "epoch": 4.563876651982379, + "grad_norm": 0.3841200097431653, + "learning_rate": 3.2996246853035417e-06, + "loss": 0.0123, + "step": 5700 + }, + { + "epoch": 4.6439727673207845, + "grad_norm": 0.27276776968480937, + "learning_rate": 3.233068797805194e-06, + "loss": 0.0121, + "step": 5800 + }, + { + "epoch": 4.724068882659191, + "grad_norm": 0.37618566324117403, + "learning_rate": 3.1659394518296303e-06, + "loss": 0.0121, + "step": 5900 + }, + { + "epoch": 4.8041649979975976, + "grad_norm": 0.3053361427605705, + "learning_rate": 3.0982891607188948e-06, + "loss": 0.0119, + "step": 6000 + }, + { + "epoch": 4.8041649979975976, + "eval_loss": 0.031129568815231323, + "eval_runtime": 97.1562, + "eval_samples_per_second": 1461.821, + "eval_steps_per_second": 2.861, + "step": 6000 + }, + { + "epoch": 4.884261113336003, + "grad_norm": 0.24611176483050773, + "learning_rate": 3.0301708453353118e-06, + "loss": 0.0121, + "step": 6100 + }, + { + "epoch": 4.964357228674409, + "grad_norm": 0.30724706018820913, + "learning_rate": 2.961637792663032e-06, + "loss": 0.0114, + "step": 6200 + }, + { + "epoch": 5.044052863436123, + "grad_norm": 0.38244952925905945, + "learning_rate": 2.8927436141231695e-06, + "loss": 0.0075, + "step": 6300 + }, + { + "epoch": 5.12414897877453, + "grad_norm": 0.20603254501695356, + "learning_rate": 2.8235422036351384e-06, + "loss": 0.0044, + "step": 6400 + }, + { + "epoch": 5.2042450941129355, + "grad_norm": 0.2637357295160275, + "learning_rate": 2.754087695457005e-06, + "loss": 0.0043, + "step": 6500 + }, + { + "epoch": 5.284341209451342, + "grad_norm": 0.5274953505653177, + "learning_rate": 2.684434421837821e-06, + "loss": 0.0045, + "step": 6600 + }, + { + "epoch": 5.364437324789748, + "grad_norm": 0.4238975113115418, + "learning_rate": 2.6146368705150854e-06, + "loss": 0.0048, + "step": 6700 + }, + { + "epoch": 5.444533440128154, + "grad_norm": 0.37315897649626995, + "learning_rate": 2.5447496420905608e-06, + "loss": 0.0045, + "step": 6800 + }, + { + "epoch": 5.52462955546656, + "grad_norm": 0.33573760401057196, + "learning_rate": 2.4748274073178114e-06, + "loss": 0.0045, + "step": 6900 + }, + { + "epoch": 5.6047256708049655, + "grad_norm": 0.4205706583224986, + "learning_rate": 2.4049248643348512e-06, + "loss": 0.0048, + "step": 7000 + }, + { + "epoch": 5.6047256708049655, + "eval_loss": 0.04388193413615227, + "eval_runtime": 97.237, + "eval_samples_per_second": 1460.607, + "eval_steps_per_second": 2.859, + "step": 7000 + }, + { + "epoch": 5.684821786143372, + "grad_norm": 0.3352151910327754, + "learning_rate": 2.3350966958753766e-06, + "loss": 0.0043, + "step": 7100 + }, + { + "epoch": 5.764917901481779, + "grad_norm": 0.30090375390642815, + "learning_rate": 2.265397526492052e-06, + "loss": 0.0043, + "step": 7200 + }, + { + "epoch": 5.845014016820184, + "grad_norm": 0.3649497532401096, + "learning_rate": 2.195881879825301e-06, + "loss": 0.0043, + "step": 7300 + }, + { + "epoch": 5.92511013215859, + "grad_norm": 0.26280545277109674, + "learning_rate": 2.1266041359510456e-06, + "loss": 0.0043, + "step": 7400 + }, + { + "epoch": 6.004805766920304, + "grad_norm": 0.13356592430041458, + "learning_rate": 2.057618488840745e-06, + "loss": 0.004, + "step": 7500 + }, + { + "epoch": 6.084901882258711, + "grad_norm": 0.5145664187486052, + "learning_rate": 1.9889789039670276e-06, + "loss": 0.0014, + "step": 7600 + }, + { + "epoch": 6.1649979975971165, + "grad_norm": 0.4699723392536862, + "learning_rate": 1.9207390760880605e-06, + "loss": 0.0014, + "step": 7700 + }, + { + "epoch": 6.245094112935522, + "grad_norm": 0.4374296333529995, + "learning_rate": 1.852952387243698e-06, + "loss": 0.0013, + "step": 7800 + }, + { + "epoch": 6.325190228273929, + "grad_norm": 0.42961763489773475, + "learning_rate": 1.7856718649962606e-06, + "loss": 0.0013, + "step": 7900 + }, + { + "epoch": 6.405286343612334, + "grad_norm": 0.2569945673800255, + "learning_rate": 1.7189501409486061e-06, + "loss": 0.0013, + "step": 8000 + }, + { + "epoch": 6.405286343612334, + "eval_loss": 0.053785648196935654, + "eval_runtime": 97.6698, + "eval_samples_per_second": 1454.134, + "eval_steps_per_second": 2.846, + "step": 8000 + }, + { + "epoch": 6.485382458950741, + "grad_norm": 0.36869037970588475, + "learning_rate": 1.6528394095719558e-06, + "loss": 0.0013, + "step": 8100 + }, + { + "epoch": 6.565478574289147, + "grad_norm": 0.2187928895873153, + "learning_rate": 1.587391387375669e-06, + "loss": 0.0014, + "step": 8200 + }, + { + "epoch": 6.645574689627553, + "grad_norm": 0.3346302824445088, + "learning_rate": 1.522657272450917e-06, + "loss": 0.0013, + "step": 8300 + }, + { + "epoch": 6.725670804965959, + "grad_norm": 0.22583442175391086, + "learning_rate": 1.4586877044199015e-06, + "loss": 0.0014, + "step": 8400 + }, + { + "epoch": 6.805766920304365, + "grad_norm": 0.24275774632690653, + "learning_rate": 1.3955327248219438e-06, + "loss": 0.0014, + "step": 8500 + }, + { + "epoch": 6.885863035642771, + "grad_norm": 0.322120144658376, + "learning_rate": 1.3332417379674426e-06, + "loss": 0.0012, + "step": 8600 + }, + { + "epoch": 6.965959150981178, + "grad_norm": 0.2971892796613953, + "learning_rate": 1.2718634722903073e-06, + "loss": 0.0013, + "step": 8700 + }, + { + "epoch": 7.045654785742891, + "grad_norm": 0.18481532134302478, + "learning_rate": 1.2114459422291205e-06, + "loss": 0.0007, + "step": 8800 + }, + { + "epoch": 7.125750901081298, + "grad_norm": 0.02287021398890685, + "learning_rate": 1.1520364106668342e-06, + "loss": 0.0003, + "step": 8900 + }, + { + "epoch": 7.205847016419703, + "grad_norm": 0.2801253618567114, + "learning_rate": 1.093681351958383e-06, + "loss": 0.0004, + "step": 9000 + }, + { + "epoch": 7.205847016419703, + "eval_loss": 0.06704169511795044, + "eval_runtime": 97.5519, + "eval_samples_per_second": 1455.892, + "eval_steps_per_second": 2.85, + "step": 9000 + }, + { + "epoch": 7.28594313175811, + "grad_norm": 0.05323383136377585, + "learning_rate": 1.0364264155751489e-06, + "loss": 0.0004, + "step": 9100 + }, + { + "epoch": 7.3660392470965155, + "grad_norm": 0.10232274475527954, + "learning_rate": 9.803163903946952e-07, + "loss": 0.0004, + "step": 9200 + }, + { + "epoch": 7.446135362434922, + "grad_norm": 0.1820198743026229, + "learning_rate": 9.253951696637311e-07, + "loss": 0.0004, + "step": 9300 + }, + { + "epoch": 7.526231477773328, + "grad_norm": 0.3077085461325738, + "learning_rate": 8.717057166616926e-07, + "loss": 0.0003, + "step": 9400 + }, + { + "epoch": 7.606327593111734, + "grad_norm": 0.02788433448517323, + "learning_rate": 8.192900310918206e-07, + "loss": 0.0004, + "step": 9500 + }, + { + "epoch": 7.68642370845014, + "grad_norm": 0.454935336405101, + "learning_rate": 7.681891162260016e-07, + "loss": 0.0004, + "step": 9600 + }, + { + "epoch": 7.766519823788546, + "grad_norm": 0.404101132597737, + "learning_rate": 7.184429468291023e-07, + "loss": 0.0003, + "step": 9700 + }, + { + "epoch": 7.846615939126952, + "grad_norm": 0.006797483493599147, + "learning_rate": 6.700904378878675e-07, + "loss": 0.0004, + "step": 9800 + }, + { + "epoch": 7.926712054465359, + "grad_norm": 0.253219681405225, + "learning_rate": 6.231694141688535e-07, + "loss": 0.0003, + "step": 9900 + }, + { + "epoch": 8.006407689227073, + "grad_norm": 0.01676261471498421, + "learning_rate": 5.777165806292109e-07, + "loss": 0.0003, + "step": 10000 + }, + { + "epoch": 8.006407689227073, + "eval_loss": 0.06978683918714523, + "eval_runtime": 97.4682, + "eval_samples_per_second": 1457.142, + "eval_steps_per_second": 2.852, + "step": 10000 + }, + { + "epoch": 8.086503804565478, + "grad_norm": 0.023667739210693765, + "learning_rate": 5.337674937034581e-07, + "loss": 0.0001, + "step": 10100 + }, + { + "epoch": 8.166599919903884, + "grad_norm": 0.19018699551662502, + "learning_rate": 4.913565334887135e-07, + "loss": 0.0001, + "step": 10200 + }, + { + "epoch": 8.246696035242291, + "grad_norm": 0.15520052508971907, + "learning_rate": 4.505168768501431e-07, + "loss": 0.0001, + "step": 10300 + }, + { + "epoch": 8.326792150580697, + "grad_norm": 0.002693072772497186, + "learning_rate": 4.1128047146765936e-07, + "loss": 0.0001, + "step": 10400 + }, + { + "epoch": 8.406888265919102, + "grad_norm": 0.07912436909277526, + "learning_rate": 3.736780108441762e-07, + "loss": 0.0001, + "step": 10500 + }, + { + "epoch": 8.486984381257509, + "grad_norm": 0.0704194063541305, + "learning_rate": 3.3773891029497326e-07, + "loss": 0.0001, + "step": 10600 + }, + { + "epoch": 8.567080496595915, + "grad_norm": 0.0031901574938772484, + "learning_rate": 3.034912839369447e-07, + "loss": 0.0, + "step": 10700 + }, + { + "epoch": 8.647176611934322, + "grad_norm": 0.3276934498065665, + "learning_rate": 2.70961922695743e-07, + "loss": 0.0001, + "step": 10800 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 0.0018329070981180388, + "learning_rate": 2.401762733480115e-07, + "loss": 0.0, + "step": 10900 + }, + { + "epoch": 8.807368842611133, + "grad_norm": 0.002408780413295549, + "learning_rate": 2.1115841861510945e-07, + "loss": 0.0, + "step": 11000 + }, + { + "epoch": 8.807368842611133, + "eval_loss": 0.08943355828523636, + "eval_runtime": 97.2207, + "eval_samples_per_second": 1460.852, + "eval_steps_per_second": 2.859, + "step": 11000 + }, + { + "epoch": 8.88746495794954, + "grad_norm": 0.005344361337035522, + "learning_rate": 1.8393105832389791e-07, + "loss": 0.0, + "step": 11100 + }, + { + "epoch": 8.967561073287946, + "grad_norm": 0.008809607265012539, + "learning_rate": 1.5851549164932118e-07, + "loss": 0.0001, + "step": 11200 + }, + { + "epoch": 9.047256708049659, + "grad_norm": 0.004351746843250683, + "learning_rate": 1.349316004526824e-07, + "loss": 0.0, + "step": 11300 + }, + { + "epoch": 9.127352823388065, + "grad_norm": 0.0008942462172532464, + "learning_rate": 1.1319783372863601e-07, + "loss": 0.0, + "step": 11400 + }, + { + "epoch": 9.207448938726472, + "grad_norm": 0.0009999088005623051, + "learning_rate": 9.333119317307598e-08, + "loss": 0.0, + "step": 11500 + }, + { + "epoch": 9.287545054064879, + "grad_norm": 0.006833873365903121, + "learning_rate": 7.534721988320143e-08, + "loss": 0.0, + "step": 11600 + }, + { + "epoch": 9.367641169403283, + "grad_norm": 0.001580786758369194, + "learning_rate": 5.92599822001666e-08, + "loss": 0.0, + "step": 11700 + }, + { + "epoch": 9.44773728474169, + "grad_norm": 0.08223063305947663, + "learning_rate": 4.508206470382554e-08, + "loss": 0.0, + "step": 11800 + }, + { + "epoch": 9.527833400080096, + "grad_norm": 0.0003265712066290809, + "learning_rate": 3.2824558368179384e-08, + "loss": 0.0, + "step": 11900 + }, + { + "epoch": 9.607929515418503, + "grad_norm": 0.0005479447690907845, + "learning_rate": 2.2497051885228825e-08, + "loss": 0.0, + "step": 12000 + }, + { + "epoch": 9.607929515418503, + "eval_loss": 0.09308738261461258, + "eval_runtime": 97.3361, + "eval_samples_per_second": 1459.119, + "eval_steps_per_second": 2.856, + "step": 12000 + }, + { + "epoch": 9.688025630756908, + "grad_norm": 0.01973266591029808, + "learning_rate": 1.4107624164019229e-08, + "loss": 0.0, + "step": 12100 + }, + { + "epoch": 9.768121746095314, + "grad_norm": 0.0007774042502156854, + "learning_rate": 7.662838010742413e-09, + "loss": 0.0, + "step": 12200 + }, + { + "epoch": 9.84821786143372, + "grad_norm": 0.0003574216553306887, + "learning_rate": 3.1677349948461277e-09, + "loss": 0.0, + "step": 12300 + }, + { + "epoch": 9.928313976772127, + "grad_norm": 0.0005660328857731791, + "learning_rate": 6.258315051568819e-10, + "loss": 0.0, + "step": 12400 + }, + { + "epoch": 9.992390869042852, + "step": 12480, + "total_flos": 2784163811819520.0, + "train_loss": 0.025371345406674895, + "train_runtime": 36703.9164, + "train_samples_per_second": 348.252, + "train_steps_per_second": 0.34 + } + ], + "logging_steps": 100, + "max_steps": 12480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2784163811819520.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +} diff --git a/saves/chess/no_explain/training_args.bin b/saves/chess/no_explain/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..307671da66dbb1b7ea5eba31217babf26111f74f --- /dev/null +++ b/saves/chess/no_explain/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b434e8cddfefc8be8a47c2feee91ad07bcea3a34db2ebd66e1953e926e28aa +size 7416 diff --git a/saves/chess/no_explain/training_eval_loss.png b/saves/chess/no_explain/training_eval_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..1902ccf9a0d425217b2a1c6a62f8c3732bb0fa3b Binary files /dev/null and b/saves/chess/no_explain/training_eval_loss.png differ diff --git a/saves/chess/no_explain/training_loss.png b/saves/chess/no_explain/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..27c610af3cd9c11863d97da113f641b87b662924 Binary files /dev/null and b/saves/chess/no_explain/training_loss.png differ