import math
from typing import Union

from transformers import PretrainedConfig

# Inspired by:
# - https://huggingface.co/docs/transformers/custom_models#writing-a-custom-configuration
# - https://huggingface.co/Q-bert/Mamba-130M/blob/9fad7fb5fb9c9416fab4f70ecd62498478be2074/configuration_mamba.py#L5
# - https://github.com/johnma2006/mamba-minimal/blob/03de542a36d873f6e6c4057ad687278cc6ae944d/model.py#L33
# - https://github.com/state-spaces/mamba/blob/009bec5ee37f586844a3fc89c040a9c1a9d8badf/mamba_ssm/models/config_mamba.py#L5


class MambaConfig(PretrainedConfig):
    """Configuration for a Mamba model.

    Args:
        bias: Use bias terms in the input/output linear projections.
        conv_bias: Use a bias term in the depthwise 1D convolution.
        d_conv: Kernel size of the depthwise 1D convolution.
        d_model: Model (residual stream) dimension.
        d_state: SSM state dimension (N in the Mamba paper).
        dt_rank: Rank of the delta projection; "auto" resolves to ceil(d_model / 16).
        expand: Block expansion factor (E in the Mamba paper); d_inner = expand * d_model.
        n_layer: Number of Mamba blocks.
        pad_vocab_size_multiple: Pad vocab_size up to a multiple of this value.
        vocab_size: Vocabulary size (padded up if not already a multiple of
            pad_vocab_size_multiple).
    """

    model_type: str = "mamba"

    def __init__(
        self,
        bias: bool = False,
        conv_bias: bool = True,
        d_conv: int = 4,
        d_model: int = 2560,
        d_state: int = 16,
        dt_rank: Union[int, str] = "auto",
        expand: int = 2,
        # fused_add_norm: bool = True,
        # initializer_range: float = 0.02,
        n_layer: int = 64,  # TODO: Rename to num_hidden_layers?
        pad_vocab_size_multiple: int = 8,
        # residual_in_fp32: bool = True,
        # rms_norm: bool = True,
        # ssm_config: dict = {},
        vocab_size: int = 50277,
        **kwargs,
    ):
        self.bias = bias
        self.conv_bias = conv_bias
        self.d_conv = d_conv
        self.d_model = d_model
        self.d_state = d_state
        self.dt_rank = dt_rank
        self.expand = expand
        self.n_layer = n_layer
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.vocab_size = vocab_size

        # Inner (expanded) dimension used inside each Mamba block.
        self.d_inner = int(self.expand * self.d_model)

        if self.dt_rank == "auto":
            self.dt_rank = math.ceil(self.d_model / 16)  # TODO: 16 is self.d_state?

        # Round the vocabulary size up to the nearest multiple of
        # pad_vocab_size_multiple (for hardware-friendly embedding shapes).
        if self.vocab_size % self.pad_vocab_size_multiple != 0:
            self.vocab_size += (
                self.pad_vocab_size_multiple
                - self.vocab_size % self.pad_vocab_size_multiple
            )

        # TODO: According to https://huggingface.co/docs/transformers/create_a_model#configuration,
        # "all NLP models have the hidden_size, num_attention_heads, num_hidden_layers
        # and vocab_size attributes in common."
        self.hidden_size = self.d_model

        super().__init__(
            **kwargs,
        )
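

# --- Usage sketch (illustrative; not part of the module's public surface) ---
# A minimal example of how the derived attributes resolve for the defaults
# above: with d_model=2560 and dt_rank="auto", dt_rank becomes
# ceil(2560 / 16) = 160; d_inner becomes 2 * 2560 = 5120; and
# vocab_size=50277 is padded up to the next multiple of 8, i.e. 50280.
# The "./mamba-config" directory below is a hypothetical path chosen for
# this sketch.
if __name__ == "__main__":
    config = MambaConfig()
    assert config.d_inner == 5120      # expand * d_model
    assert config.dt_rank == 160       # ceil(d_model / 16)
    assert config.vocab_size == 50280  # 50277 padded to a multiple of 8
    assert config.hidden_size == config.d_model

    # Configs round-trip through the standard transformers serialization:
    config.save_pretrained("./mamba-config")  # writes config.json
    reloaded = MambaConfig.from_pretrained("./mamba-config")
    assert reloaded.d_model == config.d_model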