import math
from typing import Union

from transformers import PretrainedConfig

# Inspired by:
# - https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5


class MambaConfig(PretrainedConfig):
    """Configuration for a Mamba model.

    Args:
        bias: Use bias terms in the input/output linear projections.
        conv_bias: Use a bias term in the depthwise 1D convolution.
        d_conv: Kernel size of the depthwise 1D convolution.
        d_model: Model (residual stream) dimension.
        d_state: SSM state dimension (N in the Mamba paper).
        dt_rank: Rank of the delta projection; "auto" resolves to ceil(d_model / 16).
        expand: Block expansion factor (E in the Mamba paper); d_inner = expand * d_model.
        n_layer: Number of Mamba blocks.
        pad_vocab_size_multiple: Pad vocab_size up to a multiple of this value.
        vocab_size: Vocabulary size (padded up if not already a multiple of
            pad_vocab_size_multiple).
    """

    model_type: str = "mamba"

    def __init__(
        self,
        bias: bool = False,
        conv_bias: bool = True,
        d_conv: int = 4,
        d_model: int = 2560,
        d_state: int = 16,
        dt_rank: Union[int, str] = "auto",
        expand: int = 2,
        # fused_add_norm: bool = True,
        # initializer_range: float = 0.02,
        n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
        pad_vocab_size_multiple: int = 8,
        # residual_in_fp32: bool = True,
        # rms_norm: bool = True,
        # ssm_config: dict = {},
        vocab_size: int = 50277,
        **kwargs,
    ):
        self.bias = bias
        self.conv_bias = conv_bias
        self.d_conv = d_conv
        self.d_model = d_model
        self.d_state = d_state
        self.dt_rank = dt_rank
        self.expand = expand
        self.n_layer = n_layer
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.vocab_size = vocab_size

        # Inner (expanded) dimension used inside each Mamba block.
        self.d_inner = int(self.expand * self.d_model)

        if self.dt_rank == "auto":
            self.dt_rank = math.ceil(self.d_model / 16)  # TODO: 16 is self.d_state?

        # Round the vocabulary size up to the nearest multiple of
        # pad_vocab_size_multiple (for hardware-friendly embedding shapes).
        if self.vocab_size % self.pad_vocab_size_multiple != 0:
            self.vocab_size += (
                self.pad_vocab_size_multiple
                - self.vocab_size % self.pad_vocab_size_multiple
            )

        # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
        # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers
        # and vocab_size attributes in common."
        self.hidden_size = self.d_model

        super().__init__(
            **kwargs,
        )
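

# --- Usage sketch (illustrative; not part of the module's public surface) ---
# A minimal example of how the derived attributes resolve for the defaults
# above: with d_model=2560 and dt_rank="auto", dt_rank becomes
# ceil(2560 / 16) = 160; d_inner becomes 2 * 2560 = 5120; and
# vocab_size=50277 is padded up to the next multiple of 8, i.e. 50280.
# The "./mamba-config" directory below is a hypothetical path chosen for
# this sketch.
if __name__ == "__main__":
    config = MambaConfig()
    assert config.d_inner == 5120      # expand * d_model
    assert config.dt_rank == 160       # ceil(d_model / 16)
    assert config.vocab_size == 50280  # 50277 padded to a multiple of 8
    assert config.hidden_size == config.d_model

    # Configs round-trip through the standard transformers serialization:
    config.save_pretrained("./mamba-config")  # writes config.json
    reloaded = MambaConfig.from_pretrained("./mamba-config")
    assert reloaded.d_model == config.d_model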