make xformers an optional dependency #6
by NyxKrage - opened

model.py CHANGED
@@ -9,7 +9,11 @@ from torch.nn.functional import scaled_dot_product_attention
 from typing import Optional
 import numpy as np
 
-from xformers.ops import SwiGLU
+try:
+    from xformers.ops import SwiGLU
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    XFORMERS_AVAILABLE = False
 
 try:
     from flash_attn.flash_attn_interface import flash_attn_varlen_func
@@ -100,6 +104,21 @@ class NeoBERTConfig(PretrainedConfig):
         self.max_length = max_length
         self.kwargs = kwargs
 
+# Adapted from transformers.models.llama.modeling_llama.LlamaMLP
+class NeobertMLP(nn.Module):
+    def __init__(self, hidden_size, intermediate_size, bias=False):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.w12 = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=bias)
+        self.w3 = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+        w1, w2 = self.w12(x).chunk(2, dim=-1)
+        w3 = self.w3(self.act_fn(w1) * w2)
+        return w3
+
 
 class EncoderBlock(nn.Module):
     """Transformer encoder block."""
@@ -117,7 +136,10 @@ class EncoderBlock(nn.Module):
         multiple_of = 8
         intermediate_size = int(2 * config.intermediate_size / 3)
         intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
-        self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)
+        if XFORMERS_AVAILABLE:
+            self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)
+        else:
+            self.ffn = NeobertMLP(config.hidden_size, intermediate_size, bias=False)
 
         # Layer norms
         self.attention_norm = nn.RMSNorm(config.hidden_size, config.norm_eps)
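As a quick sanity check that the pure-PyTorch fallback really computes packed SwiGLU, the sketch below compares the packed w12 projection against an unpacked SwiGLU built by hand from the same weights with torch.nn.functional. This is a minimal illustration, not part of the PR: the standalone NeobertMLP copy, the 768/2048 sizes, and the input shape are assumptions chosen for the example.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Standalone copy of the PR's fallback MLP so this check runs outside the repo.
class NeobertMLP(nn.Module):
    def __init__(self, hidden_size, intermediate_size, bias=False):
        super().__init__()
        self.w12 = nn.Linear(hidden_size, 2 * intermediate_size, bias=bias)
        self.w3 = nn.Linear(intermediate_size, hidden_size, bias=bias)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        w1, w2 = self.w12(x).chunk(2, dim=-1)
        return self.w3(self.act_fn(w1) * w2)

hidden_size, intermediate_size = 768, 2048  # illustrative sizes, not NeoBERT's actual config
mlp = NeobertMLP(hidden_size, intermediate_size)
x = torch.randn(2, 16, hidden_size)

# Unpack w12 into separate w1/w2 projections and apply SwiGLU by hand.
w1_weight, w2_weight = mlp.w12.weight.chunk(2, dim=0)
reference = F.linear(F.silu(F.linear(x, w1_weight)) * F.linear(x, w2_weight), mlp.w3.weight)

torch.testing.assert_close(mlp(x), reference)
print("fallback MLP matches unpacked SwiGLU, output shape:", tuple(mlp(x).shape))

If the packed parameter layout matches what xformers' SwiGLU stores (which seems to be the intent of keeping the w12 and w3 layer names), checkpoints should load interchangeably across both code paths; that is worth confirming with a state_dict round-trip on a machine where xformers is installed.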