mjschock committed
Commit 0912136 · verified · 1 Parent(s): cb92e10

Upload config

Files changed (2):
  1. config.json +10 -12
  2. configuration_mamba.py +47 -16
config.json CHANGED
@@ -1,21 +1,19 @@
 {
-  "architectures": [
-    "MambaModelForCausalLM"
-  ],
   "auto_map": {
-    "AutoConfig": "configuration_mamba.MambaConfig",
-    "AutoModel": "modeling_mamba.MambaModel",
-    "AutoModelForCausalLM": "modeling_mamba.MambaModelForCausalLM"
+    "AutoConfig": "configuration_mamba.MambaConfig"
   },
+  "bias": false,
+  "conv_bias": true,
+  "d_conv": 4,
+  "d_inner": 1536,
   "d_model": 768,
-  "fused_add_norm": true,
+  "d_state": 16,
+  "dt_rank": 48,
+  "expand": 2,
+  "hidden_size": 768,
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
-  "residual_in_fp32": true,
-  "rms_norm": true,
-  "ssm_cfg": {},
-  "torch_dtype": "float16",
   "transformers_version": "4.37.2",
-  "vocab_size": 50277
+  "vocab_size": 50280
 }
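
With the trimmed auto_map now registering only AutoConfig against configuration_mamba.MambaConfig, the uploaded config can be loaded through the transformers auto classes once remote code is enabled. A minimal sketch, assuming the placeholder repo id below is replaced with the actual Hub repository this commit belongs to:

from transformers import AutoConfig

# NOTE: "user/mamba-repo" is a placeholder, not the real repo id for this commit.
config = AutoConfig.from_pretrained(
    "user/mamba-repo",
    trust_remote_code=True,  # lets auto_map resolve configuration_mamba.MambaConfig
)

print(config.d_model)     # 768
print(config.d_inner)     # 1536
print(config.vocab_size)  # 50280
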
configuration_mamba.py CHANGED
@@ -1,30 +1,61 @@
-import mamba_ssm
+import math
+from typing import Union
+
 from transformers import PretrainedConfig
 
-mamba_config_defaults = mamba_ssm.models.config_mamba.MambaConfig()
 
+# Inspired by:
+# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
+# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
+# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5
 class MambaConfig(PretrainedConfig):
-    model_type = "mamba"
+    model_type: str = "mamba"
 
     def __init__(
         self,
-        d_model: int = mamba_config_defaults.d_model,
-        fused_add_norm: bool = mamba_config_defaults.fused_add_norm,
-        n_layer: int = mamba_config_defaults.n_layer,
-        pad_vocab_size_multiple: int = mamba_config_defaults.pad_vocab_size_multiple,
-        residual_in_fp32: bool = mamba_config_defaults.residual_in_fp32,
-        rms_norm: bool = mamba_config_defaults.rms_norm,
-        ssm_cfg: dict = mamba_config_defaults.ssm_cfg,
-        vocab_size: int = mamba_config_defaults.vocab_size,
+        bias: bool = False,
+        conv_bias: bool = True,
+        d_conv: int = 4,
+        d_model: int = 2560,
+        d_state: int = 16,
+        dt_rank: Union[int, str] = "auto",
+        expand: int = 2,
+        # fused_add_norm: bool = True,
+        # initializer_range: float = 0.02,
+        n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
+        pad_vocab_size_multiple: int = 8,
+        # residual_in_fp32: bool = True,
+        # rms_norm: bool = True,
+        # ssm_config: dict = {},
+        vocab_size: int = 50277,
         **kwargs,
     ):
+        self.bias = bias
+        self.conv_bias = conv_bias
+        self.d_conv = d_conv
         self.d_model = d_model
-        self.fused_add_norm = fused_add_norm
+        self.d_state = d_state
+        self.dt_rank = dt_rank
+        self.expand = expand
         self.n_layer = n_layer
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
-        self.residual_in_fp32 = residual_in_fp32
-        self.rms_norm = rms_norm
-        self.ssm_cfg = ssm_cfg
         self.vocab_size = vocab_size
 
-        super().__init__(**kwargs)
+        self.d_inner = int(self.expand * self.d_model)
+
+        if self.dt_rank == "auto":
+            self.dt_rank = math.ceil(self.d_model / 16)  # TODO: 16 is self.d_state?
+
+        if self.vocab_size % self.pad_vocab_size_multiple != 0:
+            self.vocab_size += (
+                self.pad_vocab_size_multiple
+                - self.vocab_size % self.pad_vocab_size_multiple
+            )
+
+        # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
+        # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers and vocab_size attributes in common."
+        self.hidden_size = self.d_model
+
+        super().__init__(
+            **kwargs,
+        )
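
The rewritten __init__ derives d_inner, dt_rank, the padded vocab_size, and hidden_size from the primary settings, which is where the new values in config.json above come from. A small sketch, assuming configuration_mamba.py from this commit is saved locally and importable:

from configuration_mamba import MambaConfig

# 130M-scale settings matching the uploaded config.json.
config = MambaConfig(d_model=768, n_layer=24, vocab_size=50277)

print(config.d_inner)      # 1536  = int(expand * d_model) = int(2 * 768)
print(config.dt_rank)      # 48    = math.ceil(d_model / 16)
print(config.vocab_size)   # 50280 = 50277 padded up to a multiple of 8
print(config.hidden_size)  # 768   = aliased to d_model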