adding files

Browse files

Files changed (11) hide show

config.json +39 -0
generation_config.json +6 -0
pytorch_model-00001-of-00007.bin +3 -0
pytorch_model-00002-of-00007.bin +3 -0
pytorch_model-00003-of-00007.bin +3 -0
pytorch_model-00004-of-00007.bin +3 -0
pytorch_model-00005-of-00007.bin +3 -0
pytorch_model-00006-of-00007.bin +3 -0
pytorch_model-00007-of-00007.bin +3 -0
pytorch_model.bin.index.json +492 -0
trainer_state.json +419 -0

config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "_name_or_path": "checkpoints-fft/checkpoint-200",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPTBigCodeForCausalLM"
+  ],
+  "attention_softmax_in_fp32": true,
+  "attn_pdrop": 0.1,
+  "bos_token_id": 0,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 0,
+  "inference_runner": 0,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "max_batch_size": null,
+  "max_sequence_length": null,
+  "model_type": "gpt_bigcode",
+  "multi_query": true,
+  "n_embd": 6144,
+  "n_head": 48,
+  "n_inner": 24576,
+  "n_layer": 40,
+  "n_positions": 8192,
+  "pad_key_length": true,
+  "pre_allocate_kv_cache": false,
+  "resid_pdrop": 0.1,
+  "scale_attention_softmax_in_fp32": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.0.dev0",
+  "use_cache": true,
+  "validate_runner_input": true,
+  "vocab_size": 49152
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.33.0.dev0"
+}

pytorch_model-00001-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82e070f27b346424b672e1cb31f9e9f67e04866956041190ddcbe205282f2416
+size 9904379303

pytorch_model-00002-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5bcf0980b165b1b82a65df90dc2d3b53b8f0505f57524140ca23e3bcb1698a92
+size 9860464915

pytorch_model-00003-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8721ac8e8cae6ab40a1f8939bd76a7f564e78af88d4e0ce06b1c2d2c228d8e4d
+size 9854246167

pytorch_model-00004-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2399dbb1fbd1e3de0395d7fe6b43dde4014c18141f83cbdd535b8febdbce00ed
+size 9860464979

pytorch_model-00005-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5cfb64bd63f7e69f7519b0ad0ff83a7b5a40ce28041b9b1daa67ecd2acb4db43
+size 9854246167

pytorch_model-00006-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3c64964b7682841c1a55b49c7a12ad208737f02059f1eaf88f75e2c8f06dcab
+size 9860464979

pytorch_model-00007-of-00007.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a81eccee5bfc5fe15ede2267567ea6d44646757b800ecc901cea28aa859f35e4
+size 2875719771

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,492 @@

+{
+  "metadata": {
+    "total_size": 62069825536
+  },
+  "weight_map": {
+    "lm_head.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.10.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.12.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.12.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.12.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.12.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.12.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.attn.c_attn.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.attn.c_attn.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.attn.c_proj.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.attn.c_proj.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.ln_1.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.ln_1.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.ln_2.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.ln_2.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00003-of-00007.bin",
+    "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.20.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.attn.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.attn.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.ln_2.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.ln_2.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.25.attn.c_attn.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.25.attn.c_attn.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.25.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.ln_1.bias": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.25.ln_1.weight": "pytorch_model-00004-of-00007.bin",
+    "transformer.h.25.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.30.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.attn.c_attn.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.attn.c_attn.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.attn.c_proj.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.attn.c_proj.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.ln_1.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.ln_1.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.ln_2.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.ln_2.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00005-of-00007.bin",
+    "transformer.h.31.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.32.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.33.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.34.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.35.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.36.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.attn.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.attn.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.ln_2.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.ln_2.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.mlp.c_fc.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.mlp.c_fc.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.mlp.c_proj.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.37.mlp.c_proj.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.38.attn.c_attn.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.38.attn.c_attn.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.38.attn.c_proj.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.attn.c_proj.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.ln_1.bias": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.38.ln_1.weight": "pytorch_model-00006-of-00007.bin",
+    "transformer.h.38.ln_2.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.ln_2.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.mlp.c_fc.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.mlp.c_fc.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.mlp.c_proj.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.38.mlp.c_proj.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.attn.c_attn.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.attn.c_attn.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.attn.c_proj.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.attn.c_proj.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.ln_1.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.ln_1.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.ln_2.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.ln_2.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.mlp.c_fc.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.mlp.c_fc.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.mlp.c_proj.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.39.mlp.c_proj.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.attn.c_attn.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.attn.c_attn.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.attn.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.attn.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.ln_1.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.ln_1.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.ln_2.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.ln_2.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00002-of-00007.bin",
+    "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00002-of-00007.bin",
+    "transformer.ln_f.bias": "pytorch_model-00007-of-00007.bin",
+    "transformer.ln_f.weight": "pytorch_model-00007-of-00007.bin",
+    "transformer.wpe.weight": "pytorch_model-00001-of-00007.bin",
+    "transformer.wte.weight": "pytorch_model-00001-of-00007.bin"
+  }
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,419 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 10,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.03,
+      "learning_rate": 6.47685462377997e-07,
+      "loss": 1.1777,
+      "step": 5
+    },
+    {
+      "epoch": 0.05,
+      "learning_rate": 9.266284080291269e-07,
+      "loss": 1.1847,
+      "step": 10
+    },
+    {
+      "epoch": 0.05,
+      "eval_loss": 1.1642409563064575,
+      "eval_runtime": 70.6255,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 10
+    },
+    {
+      "epoch": 0.07,
+      "learning_rate": 1e-06,
+      "loss": 1.1025,
+      "step": 15
+    },
+    {
+      "epoch": 0.1,
+      "learning_rate": 1e-06,
+      "loss": 1.0968,
+      "step": 20
+    },
+    {
+      "epoch": 0.1,
+      "eval_loss": 1.1332802772521973,
+      "eval_runtime": 70.733,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 20
+    },
+    {
+      "epoch": 0.12,
+      "learning_rate": 1e-06,
+      "loss": 1.0927,
+      "step": 25
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 1e-06,
+      "loss": 1.1003,
+      "step": 30
+    },
+    {
+      "epoch": 0.15,
+      "eval_loss": 1.1125657558441162,
+      "eval_runtime": 70.756,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 30
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 1e-06,
+      "loss": 1.0657,
+      "step": 35
+    },
+    {
+      "epoch": 0.2,
+      "learning_rate": 1e-06,
+      "loss": 1.0789,
+      "step": 40
+    },
+    {
+      "epoch": 0.2,
+      "eval_loss": 1.0948463678359985,
+      "eval_runtime": 70.7686,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 40
+    },
+    {
+      "epoch": 0.23,
+      "learning_rate": 1e-06,
+      "loss": 1.0963,
+      "step": 45
+    },
+    {
+      "epoch": 0.25,
+      "learning_rate": 1e-06,
+      "loss": 1.0648,
+      "step": 50
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 1.087746262550354,
+      "eval_runtime": 70.9688,
+      "eval_samples_per_second": 0.211,
+      "eval_steps_per_second": 0.028,
+      "step": 50
+    },
+    {
+      "epoch": 0.28,
+      "learning_rate": 1e-06,
+      "loss": 1.0504,
+      "step": 55
+    },
+    {
+      "epoch": 0.3,
+      "learning_rate": 1e-06,
+      "loss": 1.0568,
+      "step": 60
+    },
+    {
+      "epoch": 0.3,
+      "eval_loss": 1.083579659461975,
+      "eval_runtime": 70.7858,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 60
+    },
+    {
+      "epoch": 0.33,
+      "learning_rate": 1e-06,
+      "loss": 1.0472,
+      "step": 65
+    },
+    {
+      "epoch": 0.35,
+      "learning_rate": 1e-06,
+      "loss": 1.0587,
+      "step": 70
+    },
+    {
+      "epoch": 0.35,
+      "eval_loss": 1.080450177192688,
+      "eval_runtime": 70.8142,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 70
+    },
+    {
+      "epoch": 0.38,
+      "learning_rate": 1e-06,
+      "loss": 1.0609,
+      "step": 75
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 1e-06,
+      "loss": 1.021,
+      "step": 80
+    },
+    {
+      "epoch": 0.4,
+      "eval_loss": 1.0777862071990967,
+      "eval_runtime": 70.7864,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 80
+    },
+    {
+      "epoch": 0.42,
+      "learning_rate": 1e-06,
+      "loss": 1.048,
+      "step": 85
+    },
+    {
+      "epoch": 0.45,
+      "learning_rate": 1e-06,
+      "loss": 1.0857,
+      "step": 90
+    },
+    {
+      "epoch": 0.45,
+      "eval_loss": 1.075345516204834,
+      "eval_runtime": 70.7802,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 90
+    },
+    {
+      "epoch": 0.47,
+      "learning_rate": 1e-06,
+      "loss": 1.0077,
+      "step": 95
+    },
+    {
+      "epoch": 0.5,
+      "learning_rate": 1e-06,
+      "loss": 1.035,
+      "step": 100
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 1.0735156536102295,
+      "eval_runtime": 70.9576,
+      "eval_samples_per_second": 0.211,
+      "eval_steps_per_second": 0.028,
+      "step": 100
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 1e-06,
+      "loss": 1.0042,
+      "step": 105
+    },
+    {
+      "epoch": 0.55,
+      "learning_rate": 1e-06,
+      "loss": 1.0547,
+      "step": 110
+    },
+    {
+      "epoch": 0.55,
+      "eval_loss": 1.0722615718841553,
+      "eval_runtime": 70.7348,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 110
+    },
+    {
+      "epoch": 0.57,
+      "learning_rate": 1e-06,
+      "loss": 1.0077,
+      "step": 115
+    },
+    {
+      "epoch": 0.6,
+      "learning_rate": 1e-06,
+      "loss": 1.0646,
+      "step": 120
+    },
+    {
+      "epoch": 0.6,
+      "eval_loss": 1.07070791721344,
+      "eval_runtime": 70.8054,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 120
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 1e-06,
+      "loss": 0.9997,
+      "step": 125
+    },
+    {
+      "epoch": 0.65,
+      "learning_rate": 1e-06,
+      "loss": 1.0396,
+      "step": 130
+    },
+    {
+      "epoch": 0.65,
+      "eval_loss": 1.0689507722854614,
+      "eval_runtime": 70.8505,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 130
+    },
+    {
+      "epoch": 0.68,
+      "learning_rate": 1e-06,
+      "loss": 1.0387,
+      "step": 135
+    },
+    {
+      "epoch": 0.7,
+      "learning_rate": 1e-06,
+      "loss": 1.0197,
+      "step": 140
+    },
+    {
+      "epoch": 0.7,
+      "eval_loss": 1.0676530599594116,
+      "eval_runtime": 70.7124,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 140
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 1e-06,
+      "loss": 1.0274,
+      "step": 145
+    },
+    {
+      "epoch": 0.75,
+      "learning_rate": 1e-06,
+      "loss": 1.0331,
+      "step": 150
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 1.066578984260559,
+      "eval_runtime": 70.851,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 150
+    },
+    {
+      "epoch": 0.78,
+      "learning_rate": 1e-06,
+      "loss": 0.9796,
+      "step": 155
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 1e-06,
+      "loss": 0.9945,
+      "step": 160
+    },
+    {
+      "epoch": 0.8,
+      "eval_loss": 1.0656859874725342,
+      "eval_runtime": 70.6043,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 160
+    },
+    {
+      "epoch": 0.82,
+      "learning_rate": 1e-06,
+      "loss": 1.0549,
+      "step": 165
+    },
+    {
+      "epoch": 0.85,
+      "learning_rate": 1e-06,
+      "loss": 1.0134,
+      "step": 170
+    },
+    {
+      "epoch": 0.85,
+      "eval_loss": 1.0651814937591553,
+      "eval_runtime": 70.8302,
+      "eval_samples_per_second": 0.212,
+      "eval_steps_per_second": 0.028,
+      "step": 170
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 1e-06,
+      "loss": 0.9615,
+      "step": 175
+    },
+    {
+      "epoch": 0.9,
+      "learning_rate": 1e-06,
+      "loss": 0.9737,
+      "step": 180
+    },
+    {
+      "epoch": 0.9,
+      "eval_loss": 1.0644478797912598,
+      "eval_runtime": 70.9689,
+      "eval_samples_per_second": 0.211,
+      "eval_steps_per_second": 0.028,
+      "step": 180
+    },
+    {
+      "epoch": 0.93,
+      "learning_rate": 1e-06,
+      "loss": 1.0104,
+      "step": 185
+    },
+    {
+      "epoch": 0.95,
+      "learning_rate": 1e-06,
+      "loss": 1.027,
+      "step": 190
+    },
+    {
+      "epoch": 0.95,
+      "eval_loss": 1.0634864568710327,
+      "eval_runtime": 71.0057,
+      "eval_samples_per_second": 0.211,
+      "eval_steps_per_second": 0.028,
+      "step": 190
+    },
+    {
+      "epoch": 0.97,
+      "learning_rate": 1e-06,
+      "loss": 1.028,
+      "step": 195
+    },
+    {
+      "epoch": 1.0,
+      "learning_rate": 1e-06,
+      "loss": 0.9929,
+      "step": 200
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.062495470046997,
+      "eval_runtime": 71.1183,
+      "eval_samples_per_second": 0.211,
+      "eval_steps_per_second": 0.028,
+      "step": 200
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 200,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 10,
+  "total_flos": 214104589467648.0,
+  "trial_name": null,
+  "trial_params": null
+}