Upload model
Browse files
- config.json +6 -1
- modeling_mamba.py +5 -4
config.json
CHANGED
@@ -1,6 +1,10 @@
|
|
1 |
{
|
|
|
|
|
|
|
2 |
"auto_map": {
|
3 |
-
"AutoConfig": "configuration_mamba.MambaConfig"
|
|
|
4 |
},
|
5 |
"bias": false,
|
6 |
"conv_bias": true,
|
@@ -14,6 +18,7 @@
|
|
14 |
"model_type": "mamba",
|
15 |
"n_layer": 24,
|
16 |
"pad_vocab_size_multiple": 8,
|
|
|
17 |
"transformers_version": "4.37.2",
|
18 |
"vocab_size": 50280
|
19 |
}
|
|
|
1 |
{
|
2 |
+
"architectures": [
|
3 |
+
"MambaModelForCausalLM"
|
4 |
+
],
|
5 |
"auto_map": {
|
6 |
+
"AutoConfig": "configuration_mamba.MambaConfig",
|
7 |
+
"AutoModelForCausalLM": "modeling_mamba.MambaModelForCausalLM"
|
8 |
},
|
9 |
"bias": false,
|
10 |
"conv_bias": true,
|
|
|
18 |
"model_type": "mamba",
|
19 |
"n_layer": 24,
|
20 |
"pad_vocab_size_multiple": 8,
|
21 |
+
"torch_dtype": "float32",
|
22 |
"transformers_version": "4.37.2",
|
23 |
"vocab_size": 50280
|
24 |
}
|
modeling_mamba.py
CHANGED
@@ -76,8 +76,8 @@ class Mamba(nn.Module):
|
|
76 |
"""
|
77 |
|
78 |
(b, l, d) = x.shape
|
79 |
-
x_copy = x # There was a separate class for residual, I deleted that part and added it here.
|
80 |
-
x = self.norm(x)
|
81 |
x_and_res = self.in_proj(x) # shape (b, l, 2 * d_in)
|
82 |
(x, res) = x_and_res.split(
|
83 |
split_size=[self.config.d_inner, self.config.d_inner], dim=-1
|
@@ -93,7 +93,8 @@ class Mamba(nn.Module):
|
|
93 |
|
94 |
y = y * F.silu(res)
|
95 |
|
96 |
-
output = self.out_proj(y) + x_copy
|
|
|
97 |
|
98 |
return output
|
99 |
|
@@ -196,7 +197,7 @@ class MambaBlock(nn.Module):
|
|
196 |
self.norm = MambaRMSNorm(config.d_model)
|
197 |
|
198 |
def forward(self, x):
|
199 |
-
return self.
|
200 |
|
201 |
|
202 |
class MambaPreTrainedModel(PreTrainedModel):
|
|
|
76 |
"""
|
77 |
|
78 |
(b, l, d) = x.shape
|
79 |
+
# x_copy = x # There was a separate class for residual, I deleted that part and added it here.
|
80 |
+
# x = self.norm(x)
|
81 |
x_and_res = self.in_proj(x) # shape (b, l, 2 * d_in)
|
82 |
(x, res) = x_and_res.split(
|
83 |
split_size=[self.config.d_inner, self.config.d_inner], dim=-1
|
|
|
93 |
|
94 |
y = y * F.silu(res)
|
95 |
|
96 |
+
# output = self.out_proj(y) + x_copy
|
97 |
+
output = self.out_proj(y)
|
98 |
|
99 |
return output
|
100 |
|
|
|
197 |
self.norm = MambaRMSNorm(config.d_model)
|
198 |
|
199 |
def forward(self, x):
|
200 |
+
return self.mixer(self.norm(x)) + x
|
201 |
|
202 |
|
203 |
class MambaPreTrainedModel(PreTrainedModel):
|