mjschock committed on
Commit 606b88d · verified · 1 Parent(s): 843c0d6

Upload config

Files changed (2)
  1. config.json +2 -6
  2. configuration_mamba.py +26 -87
config.json CHANGED
@@ -1,10 +1,6 @@
 {
-  "architectures": [
-    "MambaLMHeadModel"
-  ],
   "auto_map": {
-    "AutoConfig": "configuration_mamba.MambaConfig",
-    "AutoModelForCausalLM": "modeling_mamba.MambaLMHeadModel"
+    "AutoConfig": "configuration_mamba.MambaConfig"
   },
   "bias": false,
   "conv_bias": true,
@@ -14,10 +10,10 @@
   "d_state": 16,
   "dt_rank": 48,
   "expand": 2,
+  "initializer_range": 0.02,
   "model_type": "mamba",
   "n_layer": 24,
   "pad_vocab_size_multiple": 8,
-  "torch_dtype": "float32",
   "transformers_version": "4.37.2",
   "vocab_size": 50280
 }
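
With the trimmed auto_map (only the AutoConfig entry remains; the architectures list and the AutoModelForCausalLM mapping were dropped), the configuration can still be loaded through the Hub's custom-code path. A minimal sketch, assuming a hypothetical repo id and that you are willing to pass trust_remote_code=True:

from transformers import AutoConfig

# Hypothetical repository id; substitute the repo this commit belongs to.
repo_id = "mjschock/mamba-130m"

# trust_remote_code=True lets AutoConfig follow the "auto_map" entry and load
# MambaConfig from the configuration_mamba.py stored in the repository.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)

print(config.model_type)         # "mamba"
print(config.n_layer)            # 24
print(config.initializer_range)  # 0.02, added in this commit

With the AutoModelForCausalLM entry gone from auto_map, only the config resolves through remote code here; the model class itself would need its own mapping (or an architectures entry) to load via the auto classes.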
configuration_mamba.py CHANGED
@@ -4,106 +4,45 @@ from typing import Union
 from transformers import PretrainedConfig
 
 
-# Inspired by:
-# - https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
-# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
-# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
-# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5
 class MambaConfig(PretrainedConfig):
-    model_type: str = "mamba"
+    model_type = "mamba"
 
     def __init__(
         self,
-        # bias: bool = False,
-        # conv_bias: bool = True,
-        # d_conv: int = 4,
-        # d_model: int = 2560,
-        # d_state: int = 16,
-        # dt_rank: Union[int, str] = "auto",
-        # expand: int = 2,
-        # fused_add_norm: bool = True,
-        # # initializer_range: float = 0.02,
-        # n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
-        # norm_epsilon: float = 1e-5,
-        # pad_vocab_size_multiple: int = 8,
-        # residual_in_fp32: bool = True,
-        # rms_norm: bool = True,
-        # ssm_config: dict = {},
-        # vocab_size: int = 50277,
-
-        d_model: int = 2560,
-        n_layer: int = 64,
-        vocab_size: int = 50277,
-        d_state: int = 16,
-        expand: int = 2,
-        dt_rank: Union[int, str] = 'auto',
-        d_conv: int = 4,
-        pad_vocab_size_multiple: int = 8,
-        conv_bias: bool = True,
-        bias: bool = False,
+        vocab_size=50277,
+        d_state=16,
+        d_model=2560,
+        d_conv=4,
+        expand=2,
+        conv_bias=True,
+        bias=False,
+        n_layer=64,
+        dt_rank: Union[int, str] = "auto",
+        pad_vocab_size_multiple=8,
+        initializer_range=0.02,
         **kwargs,
     ):
-        # self.bias = bias
-        # self.conv_bias = conv_bias
-        # self.d_conv = d_conv
-        # self.d_model = d_model
-        # self.d_state = d_state
-        # self.dt_rank = dt_rank
-        # self.expand = expand
-        # self.fused_add_norm = fused_add_norm
-        # self.n_layer = n_layer
-        # self.norm_epsilon = norm_epsilon
-        # self.pad_vocab_size_multiple = pad_vocab_size_multiple
-        # self.residual_in_fp32 = residual_in_fp32
-        # self.rms_norm = rms_norm
-        # self.ssm_config = ssm_config
-        # self.vocab_size = vocab_size
-
-        # d_model: int
-        # n_layer: int
-        # vocab_size: int
-        # d_state: int = 16
-        # expand: int = 2
-        # dt_rank: Union[int, str] = 'auto'
-        # d_conv: int = 4
-        # pad_vocab_size_multiple: int = 8
-        # conv_bias: bool = True
-        # bias: bool = False
-
-        self.d_model = d_model
-        self.n_layer = n_layer
         self.vocab_size = vocab_size
-        self.d_state = d_state
+        self.n_layer = n_layer
+        self.conv_bias = conv_bias
         self.expand = expand
-        self.dt_rank = dt_rank
-        self.d_conv = d_conv
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
-        self.conv_bias = conv_bias
-        self.bias = bias
-
+        self.d_conv = d_conv
+        self.d_model = d_model
+        self.d_state = d_state
         self.d_inner = int(self.expand * self.d_model)
+        self.dt_rank = dt_rank
+        self.initializer_range = initializer_range
+        self.bias = bias
 
-        self.d_inner = int(self.expand * self.d_model)
-
-        if self.dt_rank == 'auto':
+        if self.dt_rank == "auto":
             self.dt_rank = math.ceil(self.d_model / 16)
-
-        if self.vocab_size % self.pad_vocab_size_multiple != 0:
-            self.vocab_size += (self.pad_vocab_size_multiple
-                                - self.vocab_size % self.pad_vocab_size_multiple)
-
-        # if self.dt_rank == "auto":
-        #     self.dt_rank = math.ceil(self.d_model / 16)  # TODO: 16 is self.d_state?
 
-        # if self.vocab_size % self.pad_vocab_size_multiple != 0:
-        #     self.vocab_size += (
-        #         self.pad_vocab_size_multiple
-        #         - self.vocab_size % self.pad_vocab_size_multiple
-        #     )
-
-        # # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
-        # # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers and vocab_size attributes in common."
-        # self.hidden_size = self.d_model
+        if self.vocab_size % self.pad_vocab_size_multiple != 0:
+            self.vocab_size += (
+                self.pad_vocab_size_multiple
+                - self.vocab_size % self.pad_vocab_size_multiple
+            )
 
         super().__init__(
             **kwargs,
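
The rewritten __init__ derives several values rather than taking them verbatim: d_inner is expand * d_model, a dt_rank of "auto" resolves to ceil(d_model / 16), and vocab_size is rounded up to the next multiple of pad_vocab_size_multiple. A minimal sketch of those defaults, assuming configuration_mamba.py is importable from the working directory:

from configuration_mamba import MambaConfig

# All defaults: d_model=2560, expand=2, dt_rank="auto",
# vocab_size=50277, pad_vocab_size_multiple=8.
config = MambaConfig()

print(config.d_inner)     # 5120  -> 2 * 2560
print(config.dt_rank)     # 160   -> math.ceil(2560 / 16), resolved from "auto"
print(config.vocab_size)  # 50280 -> 50277 padded up to a multiple of 8

Passing dt_rank explicitly, as this repo's config.json does with "dt_rank": 48, skips the "auto" branch, and an already-padded vocab_size such as 50280 is left unchanged.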