Initial file commit

Browse files

Files changed (16) hide show

README.md +60 -0
chat_class.py +94 -0
config.json +35 -0
generation_config.json +6 -0
model-00001-of-00006.safetensors +3 -0
model-00002-of-00006.safetensors +3 -0
model-00003-of-00006.safetensors +3 -0
model-00004-of-00006.safetensors +3 -0
model-00005-of-00006.safetensors +3 -0
model-00006-of-00006.safetensors +3 -0
model.safetensors.index.json +298 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +256 -0
trainer_state.json +0 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+# DCLM-7B-Chat
+This is a fine-tuned version of the DCLM-7B baseline model trained for chat
+completions.
+## Quick start
+To use the model, `open_lm` must first be installed:
+```shell
+pip install git+https://github.com/mlfoundations/open_lm.git
+```
+Then simply load the model and generate responses:
+```python
+from open_lm.hf import *
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+model = AutoModelForCausalLM.from_pretrained("mathewhe/DCLM-7B-Chat")
+tokenizer = AutoTokenizer.from_pretrained("mathewhe/DCLM-7B-Chat")
+messages = [
+    {"role": "user", "content": "What is an LLM?"},
+]
+inputs = tokenizer.apply_chat_template(
+    messages,
+    return_tensors="pt",
+    return_dict=True,
+)
+print(tokenizer.decode(model.generate(**inputs)[0]))
+```
+## Chat template
+This model uses the following chat template and does not support a separate
+system prompt:
+```
+<|endoftext|>[INST] <user-message> [/INST][ASST] <llm-response> [/ASST]<|endoftext|>
+```
+The included tokenizer will correctly format messages, so you should not have
+to manually format the input text.
+Instead, use the tokenizer's `apply_chat_template()` method on a list of
+messages.
+Each message should be a dict with two keys:
+- "role": Either "user" or "assistant".
+- "content": The message to include.
+For example:
+```python
+messages = [
+    {"role": "user", "content": "Solve for x: 3x=4"},
+    {"role": "assistant", "content": "3x=4\n(3x)/3=(4)/3\nx=4/3"},
+    {"role": "user", "content": "Please explain your work."},
+]
+```
+See the example code in the included `chat_class.py` module for more details.

chat_class.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from open_lm.hf import *
+from transformers import AutoTokenizer, AutoModelForCausalLM
+class Chat:
+    def __init__(
+        self,
+        path="mathewhe/DCLM-7B-Chat"
+        device="cuda",
+    ):
+        r"""
+        Construct :class:`Chat`\.
+        Args:
+            path (str): Model name or path.
+            device (str): Model device.
+        """
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.tokenizer.add_tokens(
+            ["[ASST]", "[INST]", "[/ASST]", "[/INST]"],
+            special_tokens=True,
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="cuda")
+        self.messages = list()
+        self.device = device
+        self.gen_kwargs = {
+            "min_new_tokens": 1,
+            "max_new_tokens": 2048,
+            "top_p": 0.8,
+            "temperature": 0.8,
+            "do_sample": True,
+            "repetition_penalty": 1.1,
+        }
+    def reset(self):
+        self.messages = list()
+    def _inference(self, messages):
+        chat = self.tokenizer.apply_chat_template(messages, tokenize=False)
+        inputs = {
+            k: v.to(self.device)
+            for k, v in self.tokenizer(chat, return_tensors="pt").items()
+        }
+        input_length = len(inputs["input_ids"][0])
+        output = self.model.generate(**inputs, **self.gen_kwargs)
+        response = self.tokenizer.decode(
+            output[0].tolist()[input_length:],
+            skip_special_tokens=True,
+        )
+        if response.startswith(" "):  # fix this so it's handled correctly by the tokenizer
+            response = response[1:]
+        return response
+    def message(self, message):
+        r"""
+        Add a user message to the chat history and save and return a response.
+        Args:
+            message (str): The user message.
+        """
+        self.messages.append({"role": "user", "content": message})
+        response = self._inference(self.messages)
+        self.messages.append({"role": "assistant", "content": response})
+        return response
+    def cli_chat(self):
+        r"""
+        For CLI-based chatting (with history).
+        """
+        asst_prompt = "Assistant: "
+        user_prompt = "---> User: "
+        print(f"{asst_prompt}Hi! How can I help you?\n")
+        message = input(user_prompt)
+        while not (message is None or message == ""):
+            response = self.message(message)
+            print(f"\n{asst_prompt}{response}\n")
+            message = input(user_prompt)
+    def instruct(self, message):
+        r"""
+        For single instruction-response interactions (without history).
+        Args:
+            message (str): An instruction or one-off user message.
+        """
+        messages = [{"role": "user", "content": message}]
+        response = self._inference(messages)
+        return response
+# When run as a script, build a Chat with its default arguments and start
+# the interactive CLI session.
+if __name__ == "__main__":
+    chat = Chat()
+    chat.cli_chat()

config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "_name_or_path": "apple/DCLM-Baseline-7B",
+  "apply_qk_norm": true,
+  "architectures": [
+    "OpenLMForCausalLM"
+  ],
+  "attn_activation": null,
+  "attn_name": "torch_attn",
+  "attn_seq_scalar": null,
+  "attn_seq_scalar_alpha": null,
+  "dim": 4096,
+  "ffn_type": "swiglu_torch",
+  "model": "open_lm_7b",
+  "model_type": "openlm",
+  "moe_capacity_factor": 1.25,
+  "moe_expert_model_parallelism": false,
+  "moe_freq": 0,
+  "moe_loss_weight": 0.1,
+  "moe_num_experts": null,
+  "moe_top_k": 2,
+  "moe_weight_parallelism": false,
+  "n_heads": 32,
+  "n_layers": 32,
+  "norm_eps": 1e-05,
+  "norm_type": "gain_only_lp_layer_norm",
+  "params": null,
+  "positional_embedding_type": "rotary",
+  "post_embed_norm": false,
+  "qk_norm": true,
+  "seq_len": 2048,
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.4",
+  "vocab_size": 50432,
+  "weight_tying": false
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.42.4",
+  "eos_token_id": 0,
+  "pad_token_id": 0
+}

model-00001-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4407c083346f4dedacbfbe71ddc38e1913e575fef3b601159231e312db460999
+size 4874115416

model-00002-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:200788ae42fedbf8b005d7f4ff5f052ba9507ef7b4c9f74abdcdf5fee0f49f1b
+size 4857404912

model-00003-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d91609e4ce09bbc6489fbaefeb67238a753acf4e378de26af0299931076292b9
+size 4857404960

model-00004-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adf3854317a3d13f84d2c505be1c49aac69c674346cf09d432525f227d6d8666
+size 4857404960

model-00005-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:563c4f1c3bdbaa6264ef3863936586ab466056db929fa801feb9ce03c661fc00
+size 4857404960

model-00006-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a0843dcd98c19d253b4b4400fd8c689df071922401a67df37f243aff67613c8
+size 3254996944

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,298 @@

+{
+  "metadata": {
+    "total_size": 27558699008
+  },
+  "weight_map": {
+    "model.layers.0.attention.in_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attention.k_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attention.out_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attention.pos_embed.inv_freq": "model-00001-of-00006.safetensors",
+    "model.layers.0.attention.q_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attention_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.feed_forward.w12.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.feed_forward.w3.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.ffn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention.in_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention.k_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention.out_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention.pos_embed.inv_freq": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention.q_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attention_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.feed_forward.w12.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.feed_forward.w3.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.ffn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.10.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.10.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.11.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.11.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention.in_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention.k_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention.out_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention.pos_embed.inv_freq": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention.q_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attention_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.feed_forward.w12.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.feed_forward.w3.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.ffn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.17.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.17.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.18.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.2.attention.in_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attention.k_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attention.out_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attention.pos_embed.inv_freq": "model-00001-of-00006.safetensors",
+    "model.layers.2.attention.q_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attention_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.feed_forward.w12.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.feed_forward.w3.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.ffn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.20.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.20.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention.in_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention.k_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention.out_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention.pos_embed.inv_freq": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention.q_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attention_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.feed_forward.w12.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.feed_forward.w3.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.ffn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.23.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.23.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.24.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.25.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention.in_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention.k_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention.out_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention.pos_embed.inv_freq": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention.q_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attention_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.feed_forward.w12.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.feed_forward.w3.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.ffn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attention.in_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.attention.k_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.attention.out_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.attention.pos_embed.inv_freq": "model-00006-of-00006.safetensors",
+    "model.layers.29.attention.q_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.attention_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.feed_forward.w12.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.feed_forward.w3.weight": "model-00006-of-00006.safetensors",
+    "model.layers.29.ffn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.3.attention.in_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attention.k_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attention.out_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attention.pos_embed.inv_freq": "model-00001-of-00006.safetensors",
+    "model.layers.3.attention.q_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attention_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.feed_forward.w12.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.feed_forward.w3.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.ffn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.30.attention.in_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.attention.k_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.attention.out_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.attention.pos_embed.inv_freq": "model-00006-of-00006.safetensors",
+    "model.layers.30.attention.q_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.attention_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.feed_forward.w12.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.feed_forward.w3.weight": "model-00006-of-00006.safetensors",
+    "model.layers.30.ffn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention.in_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention.k_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention.out_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention.pos_embed.inv_freq": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention.q_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.attention_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.feed_forward.w12.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.feed_forward.w3.weight": "model-00006-of-00006.safetensors",
+    "model.layers.31.ffn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.4.attention.in_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attention.k_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attention.out_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attention.pos_embed.inv_freq": "model-00001-of-00006.safetensors",
+    "model.layers.4.attention.q_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attention_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.feed_forward.w12.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.feed_forward.w3.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.ffn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.5.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention.in_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention.k_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention.out_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention.pos_embed.inv_freq": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention.q_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attention_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.feed_forward.w12.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.feed_forward.w3.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.ffn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.norm.weight": "model-00006-of-00006.safetensors",
+    "model.output.weight": "model-00006-of-00006.safetensors",
+    "model.tok_embeddings.weight": "model-00001-of-00006.safetensors"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,256 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|padding|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50254": {
+      "content": "                        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50255": {
+      "content": "                       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50256": {
+      "content": "                      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50257": {
+      "content": "                     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "                    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "                   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "                  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "                 ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "                ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "               ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "              ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "             ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "            ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "           ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "          ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "         ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "        ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "       ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "      ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "     ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "    ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "   ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "  ",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "[ASST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50278": {
+      "content": "[INST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50279": {
+      "content": "[/ASST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50280": {
+      "content": "[/INST]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50281": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "chat_template": "\n{%- for message in messages %}\n  {%- if message['role'] == 'user' %}\n    {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}\n  {%- elif message['role'] == 'assistant' %}\n    {{- '[ASST] '  + message['content'] + ' [/ASST]' + eos_token }}\n  {%- endif %}\n{%- endfor %}\n",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "model_max_length": 2048,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": "<|endoftext|>"
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4895dcb3fb309f7434d4ef72c8cb9e37d5d4192ba7352ea9e3a79bd6d2f86af
+size 5432