# mamba-130m / configuration_mamba.py
import math
from typing import Union

from transformers import PretrainedConfig
# Inspired by:
# - https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5
class MambaConfig(PretrainedConfig):
    model_type: str = "mamba"

    def __init__(
        self,
        bias: bool = False,
        conv_bias: bool = True,
        d_conv: int = 4,
        d_model: int = 2560,
        d_state: int = 16,
        dt_rank: Union[int, str] = "auto",
        expand: int = 2,
        # fused_add_norm: bool = True,
        # initializer_range: float = 0.02,
        n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
        pad_vocab_size_multiple: int = 8,
        # residual_in_fp32: bool = True,
        # rms_norm: bool = True,
        # ssm_config: dict = {},
        vocab_size: int = 50277,
        **kwargs,
    ):
        self.bias = bias
        self.conv_bias = conv_bias
        self.d_conv = d_conv
        self.d_model = d_model
        self.d_state = d_state
        self.dt_rank = dt_rank
        self.expand = expand
        self.n_layer = n_layer
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.vocab_size = vocab_size
        self.d_inner = int(self.expand * self.d_model)
        if self.dt_rank == "auto":
            # The reference implementations (mamba-minimal and state-spaces/mamba)
            # hard-code the divisor 16 here; it happens to equal the default d_state.
            self.dt_rank = math.ceil(self.d_model / 16)
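            # e.g. with the file's default d_model=2560: dt_rank = ceil(2560 / 16) = 160.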
        if self.vocab_size % self.pad_vocab_size_multiple != 0:
            self.vocab_size += (
                self.pad_vocab_size_multiple
                - self.vocab_size % self.pad_vocab_size_multiple
            )
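        # e.g. the default vocab_size=50277 is padded up to 50280, the next multiple of 8.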
        # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
        # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers and vocab_size attributes in common."
        self.hidden_size = self.d_model
        super().__init__(**kwargs)
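

# A minimal usage sketch: construct the config and check the derived attributes.
# The dimensions below (d_model=768, n_layer=24) are illustrative values, chosen
# to roughly match a ~130M-parameter Mamba; only this module and transformers
# are assumed.
if __name__ == "__main__":
    config = MambaConfig(d_model=768, n_layer=24)
    # d_inner = expand * d_model = 2 * 768 = 1536
    assert config.d_inner == 1536
    # dt_rank = "auto" -> ceil(768 / 16) = 48
    assert config.dt_rank == 48
    # vocab_size 50277 is padded to the next multiple of 8 -> 50280
    assert config.vocab_size == 50280
    # hidden_size aliases d_model for compatibility with transformers tooling
    assert config.hidden_size == 768
    print(config)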