Oscar Wang committed (verified)
Commit 31fe5ed · 1 Parent(s): ad5d4bc

Create modelling_llamagloo.py

Files changed (1)
  1. modelling_llamagloo.py (+412, -0)
modelling_llamagloo.py ADDED
from transformers import PreTrainedModel, PretrainedConfig
import torch
import torch.nn as nn
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast

# -------------------- Configuration --------------------
class LlamaGlooConfig(PretrainedConfig):
    model_type = "llamagloo"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=2560,
        intermediate_size=10240,
        num_hidden_layers=24,
        num_attention_heads=32,
        num_key_value_heads=None,
        rope_theta=10000.0,
        use_rms_norm=True,
        rms_norm_eps=1e-6,
        use_gqa=False,
        ffn_type="llama",
        initializer_range=0.02,
        use_cache=True,
        tie_word_embeddings=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
        self.rope_theta = rope_theta
        self.use_rms_norm = use_rms_norm
        self.rms_norm_eps = rms_norm_eps
        self.use_gqa = use_gqa
        self.ffn_type = ffn_type
        self.initializer_range = initializer_range
        # Defined explicitly because the model's forward reads self.config.use_cache.
        self.use_cache = use_cache
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

# -------------------- Rotary Position Embeddings --------------------
def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    # cos/sin have shape (seq_len, head_dim) and position_ids has shape (q_len,),
    # so index first, then add batch and head dimensions to broadcast against
    # q/k of shape (bsz, num_heads, q_len, head_dim).
    cos = cos[position_ids].unsqueeze(0).unsqueeze(0)
    sin = sin[position_ids].unsqueeze(0).unsqueeze(0)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class LlamaGlooRotaryEmbedding(nn.Module):
    def __init__(self, dim, base=10000):
        super().__init__()
        self.dim = dim
        # Register as a non-persistent buffer so it moves with the module across devices.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, x, seq_len=None):
        if seq_len is None:
            seq_len = x.shape[-2]
        t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
        # Cast to the activation dtype so fp16/bf16 inputs are not upcast downstream.
        cos = emb.cos().to(dtype=x.dtype)
        sin = emb.sin().to(dtype=x.dtype)
        return cos, sin

# -------------------- RMS Normalization --------------------
class RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)

# -------------------- Attention Mechanism --------------------
class LlamaGlooAttention(nn.Module):
    def __init__(self, config: LlamaGlooConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.rope_theta = config.rope_theta

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.rotary_emb = LlamaGlooRotaryEmbedding(self.head_dim, base=self.rope_theta)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int, num_heads: int):
        # Queries use num_heads while keys/values use num_key_value_heads, so the
        # caller passes the head count explicitly.
        return tensor.view(bsz, seq_len, num_heads, self.head_dim).transpose(1, 2)

    def _unshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size)

    def forward(self, hidden_states, attention_mask=None, past_key_value=None, output_attentions=False, use_cache=True):
        bsz, seq_len, _ = hidden_states.size()
        q = self._shape(self.q_proj(hidden_states), seq_len, bsz, self.num_heads)
        k = self._shape(self.k_proj(hidden_states), seq_len, bsz, self.num_key_value_heads)
        v = self._shape(self.v_proj(hidden_states), seq_len, bsz, self.num_key_value_heads)

        # Rotary embeddings are applied to the new tokens only, at positions offset by
        # the length of any cached keys/values.
        past_len = past_key_value[0].shape[-2] if past_key_value is not None else 0
        kv_seq_len = past_len + seq_len
        cos, sin = self.rotary_emb(v, seq_len=kv_seq_len)
        position_ids = torch.arange(past_len, kv_seq_len, device=hidden_states.device)
        q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)

        if past_key_value is not None:
            # Cached tensors have shape (bsz, num_kv_heads, past_len, head_dim);
            # concatenate along the sequence dimension.
            k = torch.cat([past_key_value[0], k], dim=2)
            v = torch.cat([past_key_value[1], v], dim=2)

        past_key_value = (k, v) if use_cache else None

        if self.num_key_value_groups > 1:
            # Grouped-query attention: expand keys/values to match the query heads.
            k = k.repeat_interleave(self.num_key_value_groups, dim=1)
            v = v.repeat_interleave(self.num_key_value_groups, dim=1)

        attn_weights = torch.matmul(q, k.transpose(2, 3)) / (self.head_dim ** 0.5)

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        attn_weights = torch.nn.functional.softmax(attn_weights.float(), dim=-1).type_as(v)
        attn_output = torch.matmul(attn_weights, v)

        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)

        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)

        if use_cache:
            outputs = outputs + (past_key_value,)

        return outputs

# -------------------- Feedforward Network --------------------
class LlamaGlooMLP(nn.Module):
    def __init__(self, config: LlamaGlooConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.ffn_type = config.ffn_type

    def forward(self, x):
        if self.ffn_type == "llama":
            gate = torch.nn.functional.silu(self.gate_proj(x))
            up = self.up_proj(x)
            return self.down_proj(gate * up)
        elif self.ffn_type == "glu":
            return self.down_proj(self.gate_proj(x) * self.up_proj(x))  # Example GLU
        else:
            raise ValueError(f"Unknown ffn_type: {self.ffn_type}")

# -------------------- Transformer Layer --------------------
class LlamaGlooDecoderLayer(nn.Module):
    def __init__(self, config: LlamaGlooConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.self_attn = LlamaGlooAttention(config=config)
        self.mlp = LlamaGlooMLP(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size)

    def forward(self, hidden_states, attention_mask=None, past_key_value=None, output_attentions=False, use_cache=True):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(
            hidden_states,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        attn_output = attn_outputs[0]
        attn_weights = attn_outputs[1] if output_attentions else None
        # The attention module returns the *updated* key/value cache as its last element.
        present_key_value = attn_outputs[-1] if use_cache else None

        hidden_states = residual + attn_output

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        # Return a fixed-length tuple so callers can index the cache at [1] and the
        # attention weights at [2] regardless of which flags are set.
        return (hidden_states, present_key_value, attn_weights)

# -------------------- LlamaGloo Model --------------------
class LlamaGlooModel(PreTrainedModel):
    config_class = LlamaGlooConfig

    def __init__(self, config: LlamaGlooConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([LlamaGlooDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) if config.use_rms_norm else nn.LayerNorm(config.hidden_size)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.layers))

        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.to(device)
            if attention_mask.dim() == 3:
                extended_attention_mask = attention_mask[:, None, :, :]
            elif attention_mask.dim() == 2:
                extended_attention_mask = attention_mask[:, None, None, :]
            else:
                raise ValueError(
                    f"Wrong number of dimensions of attention_mask. Expected 2 or 3, but got {attention_mask.dim()}"
                )
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min
        else:
            extended_attention_mask = None

        # A decoder-only model also needs a causal mask; combine it with any padding mask.
        seq_length = input_shape[-1]
        past_length = past_key_values[0][0].shape[-2] if past_key_values[0] is not None else 0
        causal_mask = torch.full(
            (seq_length, past_length + seq_length),
            torch.finfo(self.dtype).min,
            dtype=self.dtype,
            device=device,
        )
        causal_mask = torch.triu(causal_mask, diagonal=past_length + 1)[None, None, :, :]
        extended_attention_mask = causal_mask if extended_attention_mask is None else extended_attention_mask + causal_mask

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = inputs_embeds

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx]

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=extended_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[1],)

            if output_attentions:
                all_self_attns += (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        # The base model returns hidden states, not logits, so use BaseModelOutputWithPast.
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

# -------------------- LlamaGloo For Causal LM --------------------
class LlamaGlooForCausalLM(PreTrainedModel):
    config_class = LlamaGlooConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaGlooModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
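
A minimal usage sketch (not part of the commit): it assumes modelling_llamagloo.py is importable from the working directory, builds a deliberately tiny configuration, runs one forward pass with labels, and optionally registers the classes with the Auto* factories under the "llamagloo" model_type defined above. The small hyperparameter values below are illustrative only.

import torch
from modelling_llamagloo import LlamaGlooConfig, LlamaGlooForCausalLM

# Tiny configuration so the smoke test runs quickly on CPU (values are illustrative).
config = LlamaGlooConfig(
    vocab_size=1000,
    hidden_size=128,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
)
model = LlamaGlooForCausalLM(config)

input_ids = torch.randint(0, config.vocab_size, (1, 16))
outputs = model(input_ids=input_ids, labels=input_ids)
print(outputs.logits.shape)  # torch.Size([1, 16, 1000])
print(outputs.loss)          # scalar language-modeling loss

# Optional: register the custom classes so AutoConfig / AutoModelForCausalLM
# can resolve the "llamagloo" model_type.
from transformers import AutoConfig, AutoModelForCausalLM
AutoConfig.register("llamagloo", LlamaGlooConfig)
AutoModelForCausalLM.register(LlamaGlooConfig, LlamaGlooForCausalLM)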