# coding=utf-8
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class BaichuanM1Config(PretrainedConfig):
    r"""
    Configuration objects inherit from [`PretrainedConfig`] and control the behavior of model outputs. For more
    details, refer to the documentation of [`PretrainedConfig`].

    Args:
        vocab_size (`int`, *optional*, defaults to 133120):
            The size of the vocabulary used by the model.
        hidden_size (`int`, *optional*, defaults to 5120):
            The dimensionality of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 17408):
            The dimensionality of the intermediate (MLP) representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            The number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 40):
            The number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            The number of key-value heads used to implement Grouped Query Attention (GQA).

            - If `num_key_value_heads == num_attention_heads`, the model uses Multi-Head Attention (MHA).
            - If `num_key_value_heads == 1`, the model uses Multi-Query Attention (MQA).
            - Otherwise, the model uses Grouped Query Attention (GQA).

            When converting a multi-head checkpoint to a GQA checkpoint, each group's key and value heads are
            constructed by mean-pooling the original heads within that group. For more details, refer to
            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If set to `None`, this falls back to
            `num_attention_heads`.
        num_swa_attention_heads (`int`, *optional*, defaults to 20):
            The number of attention heads used in the layers that apply sliding window attention.
        num_swa_key_value_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads used in the layers that apply sliding window attention.
        sliding_window_layers (`list`, *optional*):
            The indices of the layers that use sliding window attention.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (either a string or a callable function) used in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length the model can handle.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon value used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/value attentions. This is only relevant if
            `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the model's input and output word embeddings.
        rope_theta (`float`, *optional*, defaults to 100000.0):
            The base period of the Rotary Position Embeddings (RoPE).
        sliding_window (`int`, *optional*, defaults to 2048):
            The size of the sliding window for sliding window attention (SWA).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio applied to the attention probabilities.
        conv_window (`int`, *optional*, defaults to 2):
            The convolution window size used by the attention module.
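
    Example (a minimal sketch; the module name `configuration_baichuan` is assumed here and may differ
    depending on how this file is packaged):

    ```python
    >>> from configuration_baichuan import BaichuanM1Config

    >>> # Initializing a configuration with the default arguments
    >>> configuration = BaichuanM1Config()

    >>> # Inspecting a few of the resulting fields
    >>> configuration.num_attention_heads, configuration.num_key_value_heads
    (40, 2)
    ```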
""" model_type = "baichuan" keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=133120, hidden_size=5120, intermediate_size=17408, num_hidden_layers=40, num_attention_heads=40, num_key_value_heads=2, num_swa_attention_heads: int = 20, num_swa_key_value_heads=8, sliding_window_layers: list = None, hidden_act="silu", max_position_embeddings=32768, initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, rope_theta=100000.0, sliding_window=2048, attention_dropout=0.0, conv_window = 2, **kwargs, ): self.sliding_window_layers = sliding_window_layers self.num_swa_key_value_heads = num_swa_key_value_heads self.num_swa_attention_heads = num_swa_attention_heads self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.sliding_window = sliding_window # for backward compatibility if num_key_value_heads is None: num_key_value_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.conv_window = conv_window super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, )