msr2000 committed on
Commit
44f7caf
·
1 Parent(s): 3d445e7

Update model names

Browse files
Files changed (3) hide show
  1. config.json +5 -5
  2. configuration_deepseek.py +8 -8
  3. modeling_deepseek.py +69 -93
config.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "architectures": [
3
- "DeepseekForCausalLM"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
- "AutoConfig": "configuration_deepseek.DeepseekConfig",
9
- "AutoModel": "modeling_deepseek.DeepseekModel",
10
- "AutoModelForCausalLM": "modeling_deepseek.DeepseekForCausalLM"
11
  },
12
  "aux_loss_alpha": 0.001,
13
  "bos_token_id": 100000,
@@ -19,7 +19,7 @@
19
  "intermediate_size": 12288,
20
  "kv_lora_rank": 512,
21
  "max_position_embeddings": 163840,
22
- "model_type": "deepseek",
23
  "moe_intermediate_size": 1536,
24
  "moe_layer_freq": 1,
25
  "n_group": 8,
 
1
  {
2
  "architectures": [
3
+ "DeepseekV2ForCausalLM"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.DeepseekV2Config",
9
+ "AutoModel": "modeling_deepseek.DeepseekV2Model",
10
+ "AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
11
  },
12
  "aux_loss_alpha": 0.001,
13
  "bos_token_id": 100000,
 
19
  "intermediate_size": 12288,
20
  "kv_lora_rank": 512,
21
  "max_position_embeddings": 163840,
22
+ "model_type": "deepseek_v2",
23
  "moe_intermediate_size": 1536,
24
  "moe_layer_freq": 1,
25
  "n_group": 8,
configuration_deepseek.py CHANGED
@@ -4,11 +4,11 @@ from transformers.utils import logging
4
  logger = logging.get_logger(__name__)
5
 
6
  DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
- class DeepseekConfig(PretrainedConfig):
8
  r"""
9
- This is the configuration class to store the configuration of a [`DeepseekModel`]. It is used to instantiate an DeepSeek
10
  model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
11
- defaults will yield a similar configuration to that of the DeepSeek-7B.
12
 
13
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
14
  documentation from [`PretrainedConfig`] for more information.
@@ -17,7 +17,7 @@ class DeepseekConfig(PretrainedConfig):
17
  Args:
18
  vocab_size (`int`, *optional*, defaults to 102400):
19
  Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
20
- `inputs_ids` passed when calling [`DeepseekModel`]
21
  hidden_size (`int`, *optional*, defaults to 4096):
22
  Dimension of the hidden representations.
23
  intermediate_size (`int`, *optional*, defaults to 11008):
@@ -100,16 +100,16 @@ class DeepseekConfig(PretrainedConfig):
100
  The dropout ratio for the attention probabilities.
101
 
102
  ```python
103
- >>> from transformers import DeepseekModel, DeepseekConfig
104
 
105
- >>> # Initializing a Deepseek deepseek-7b style configuration
106
- >>> configuration = DeepseekConfig()
107
 
108
  >>> # Accessing the model configuration
109
  >>> configuration = model.config
110
  ```"""
111
 
112
- model_type = "deepseek"
113
  keys_to_ignore_at_inference = ["past_key_values"]
114
 
115
  def __init__(
 
4
  logger = logging.get_logger(__name__)
5
 
6
  DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
+ class DeepseekV2Config(PretrainedConfig):
8
  r"""
9
+ This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek
10
  model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
11
+ defaults will yield a similar configuration to that of the DeepSeek-V2.
12
 
13
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
14
  documentation from [`PretrainedConfig`] for more information.
 
17
  Args:
18
  vocab_size (`int`, *optional*, defaults to 102400):
19
  Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
20
+ `input_ids` passed when calling [`DeepseekV2Model`]
21
  hidden_size (`int`, *optional*, defaults to 4096):
22
  Dimension of the hidden representations.
23
  intermediate_size (`int`, *optional*, defaults to 11008):
 
100
  The dropout ratio for the attention probabilities.
101
 
102
  ```python
103
+ >>> from transformers import DeepseekV2Model, DeepseekV2Config
104
 
105
+ >>> # Initializing a Deepseek-V2 style configuration
106
+ >>> configuration = DeepseekV2Config()
107
 
108
  >>> # Accessing the model configuration
109
  >>> configuration = model.config
110
  ```"""
111
 
112
+ model_type = "deepseek_v2"
113
  keys_to_ignore_at_inference = ["past_key_values"]
114
 
115
  def __init__(
modeling_deepseek.py CHANGED
@@ -55,7 +55,7 @@ from transformers.utils import (
55
  replace_return_docstrings,
56
  )
57
  from transformers.utils.import_utils import is_torch_fx_available
58
- from .configuration_deepseek import DeepseekConfig
59
  import torch.distributed as dist
60
  import numpy as np
61
 
@@ -75,7 +75,7 @@ if is_torch_fx_available():
75
 
76
  logger = logging.get_logger(__name__)
77
 
78
- _CONFIG_FOR_DOC = "DeepseekConfig"
79
 
80
 
81
  def _get_unpad_data(attention_mask):
@@ -92,34 +92,10 @@ def _get_unpad_data(attention_mask):
92
  )
93
 
94
 
95
- def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
96
- warnings.warn(
97
- "Calling `transformers.models.Deepseek.modeling_Deepseek._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
98
- )
99
- return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
100
-
101
-
102
- def _make_causal_mask(
103
- input_ids_shape: torch.Size,
104
- dtype: torch.dtype,
105
- device: torch.device,
106
- past_key_values_length: int = 0,
107
- ):
108
- warnings.warn(
109
- "Calling `transformers.models.Deepseek.modeling_Deepseek._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.Deepseek.modeling_Deepseek.AttentionMaskConverter._make_causal_mask"
110
- )
111
- return AttentionMaskConverter._make_causal_mask(
112
- input_ids_shape=input_ids_shape,
113
- dtype=dtype,
114
- device=device,
115
- past_key_values_length=past_key_values_length,
116
- )
117
-
118
-
119
- class DeepseekRMSNorm(nn.Module):
120
  def __init__(self, hidden_size, eps=1e-6):
121
  """
122
- DeepseekRMSNorm is equivalent to T5LayerNorm
123
  """
124
  super().__init__()
125
  self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -133,10 +109,10 @@ class DeepseekRMSNorm(nn.Module):
133
  return self.weight * hidden_states.to(input_dtype)
134
 
135
 
136
- ALL_LAYERNORM_LAYERS.append(DeepseekRMSNorm)
137
 
138
 
139
- class DeepseekRotaryEmbedding(nn.Module):
140
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
141
  super().__init__()
142
 
@@ -179,9 +155,9 @@ class DeepseekRotaryEmbedding(nn.Module):
179
  )
180
 
181
 
182
- # Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Deepseek
183
- class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
184
- """DeepseekRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
185
 
186
  def __init__(
187
  self,
@@ -208,9 +184,9 @@ class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
208
  self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
209
 
210
 
211
- # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
212
- class DeepseekDynamicNTKScalingRotaryEmbedding(DeepseekRotaryEmbedding):
213
- """DeepseekRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
214
 
215
  def __init__(
216
  self,
@@ -284,7 +260,7 @@ def yarn_linear_ramp_mask(min, max, dim):
284
  return ramp_func
285
 
286
 
287
- class DeepseekYarnRotaryEmbedding(DeepseekRotaryEmbedding):
288
 
289
  def __init__(
290
  self,
@@ -396,7 +372,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
396
  return q_embed, k_embed
397
 
398
 
399
- class DeepseekMLP(nn.Module):
400
  def __init__(self, config, hidden_size=None, intermediate_size=None):
401
  super().__init__()
402
  self.config = config
@@ -543,7 +519,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
543
  return grad_output, grad_loss
544
 
545
 
546
- class DeepseekMoE(nn.Module):
547
  """
548
  A mixed expert module containing shared experts.
549
  """
@@ -561,7 +537,7 @@ class DeepseekMoE(nn.Module):
561
  self.experts = nn.ModuleList(
562
  [
563
  (
564
- DeepseekMLP(
565
  config, intermediate_size=config.moe_intermediate_size
566
  )
567
  if i >= self.ep_rank * self.experts_per_rank
@@ -577,14 +553,14 @@ class DeepseekMoE(nn.Module):
577
  self.ep_rank = 0
578
  self.experts = nn.ModuleList(
579
  [
580
- DeepseekMLP(config, intermediate_size=config.moe_intermediate_size)
581
  for i in range(config.n_routed_experts)
582
  ]
583
  )
584
  self.gate = MoEGate(config)
585
  if config.n_shared_experts is not None:
586
  intermediate_size = config.moe_intermediate_size * config.n_shared_experts
587
- self.shared_experts = DeepseekMLP(
588
  config=config, intermediate_size=intermediate_size
589
  )
590
 
@@ -702,11 +678,11 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
702
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
703
 
704
 
705
- # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Deepseek
706
- class DeepseekAttention(nn.Module):
707
  """Multi-headed attention from 'Attention Is All You Need' paper"""
708
 
709
- def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
710
  super().__init__()
711
  self.config = config
712
  self.layer_idx = layer_idx
@@ -735,7 +711,7 @@ class DeepseekAttention(nn.Module):
735
  self.q_a_proj = nn.Linear(
736
  self.hidden_size, config.q_lora_rank, bias=config.attention_bias
737
  )
738
- self.q_a_layernorm = DeepseekRMSNorm(config.q_lora_rank)
739
  self.q_b_proj = nn.Linear(
740
  config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
741
  )
@@ -745,7 +721,7 @@ class DeepseekAttention(nn.Module):
745
  config.kv_lora_rank + config.qk_rope_head_dim,
746
  bias=config.attention_bias,
747
  )
748
- self.kv_a_layernorm = DeepseekRMSNorm(config.kv_lora_rank)
749
  self.kv_b_proj = nn.Linear(
750
  config.kv_lora_rank,
751
  self.num_heads
@@ -770,7 +746,7 @@ class DeepseekAttention(nn.Module):
770
 
771
  def _init_rope(self):
772
  if self.config.rope_scaling is None:
773
- self.rotary_emb = DeepseekRotaryEmbedding(
774
  self.qk_rope_head_dim,
775
  max_position_embeddings=self.max_position_embeddings,
776
  base=self.rope_theta,
@@ -779,14 +755,14 @@ class DeepseekAttention(nn.Module):
779
  scaling_type = self.config.rope_scaling["type"]
780
  scaling_factor = self.config.rope_scaling["factor"]
781
  if scaling_type == "linear":
782
- self.rotary_emb = DeepseekLinearScalingRotaryEmbedding(
783
  self.qk_rope_head_dim,
784
  max_position_embeddings=self.max_position_embeddings,
785
  scaling_factor=scaling_factor,
786
  base=self.rope_theta,
787
  )
788
  elif scaling_type == "dynamic":
789
- self.rotary_emb = DeepseekDynamicNTKScalingRotaryEmbedding(
790
  self.qk_rope_head_dim,
791
  max_position_embeddings=self.max_position_embeddings,
792
  scaling_factor=scaling_factor,
@@ -804,7 +780,7 @@ class DeepseekAttention(nn.Module):
804
  ]
805
  if key in self.config.rope_scaling
806
  }
807
- self.rotary_emb = DeepseekYarnRotaryEmbedding(
808
  self.qk_rope_head_dim,
809
  max_position_embeddings=self.max_position_embeddings,
810
  scaling_factor=scaling_factor,
@@ -927,10 +903,10 @@ class DeepseekAttention(nn.Module):
927
  return attn_output, attn_weights, past_key_value
928
 
929
 
930
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Deepseek
931
- class DeepseekFlashAttention2(DeepseekAttention):
932
  """
933
- Deepseek flash attention module. This module inherits from `DeepseekAttention` as the weights of the module stays
934
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
935
  flash attention and deal with padding tokens in case the input contains any of them.
936
  """
@@ -953,7 +929,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
953
  use_cache: bool = False,
954
  **kwargs,
955
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
956
- # DeepseekFlashAttention2 attention does not support output_attentions
957
  if "padding_mask" in kwargs:
958
  warnings.warn(
959
  "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -1027,7 +1003,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
1027
  # therefore the input hidden states gets silently casted in float32. Hence, we need
1028
  # cast them back in the correct dtype just to be sure everything works as expected.
1029
  # This might slowdown training & inference so it is recommended to not cast the LayerNorms
1030
- # in fp32. (DeepseekRMSNorm handles it correctly)
1031
 
1032
  input_dtype = query_states.dtype
1033
  if input_dtype == torch.float32:
@@ -1103,7 +1079,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
1103
  if not self._flash_attn_uses_top_left_mask:
1104
  causal = self.is_causal
1105
  else:
1106
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekFlashAttention2 __init__.
1107
  causal = self.is_causal and query_length != 1
1108
 
1109
  # Contains at least one padding token in the sequence
@@ -1198,13 +1174,13 @@ class DeepseekFlashAttention2(DeepseekAttention):
1198
 
1199
 
1200
  ATTENTION_CLASSES = {
1201
- "eager": DeepseekAttention,
1202
- "flash_attention_2": DeepseekFlashAttention2,
1203
  }
1204
 
1205
 
1206
- class DeepseekDecoderLayer(nn.Module):
1207
- def __init__(self, config: DeepseekConfig, layer_idx: int):
1208
  super().__init__()
1209
  self.hidden_size = config.hidden_size
1210
 
@@ -1213,18 +1189,18 @@ class DeepseekDecoderLayer(nn.Module):
1213
  )
1214
 
1215
  self.mlp = (
1216
- DeepseekMoE(config)
1217
  if (
1218
  config.n_routed_experts is not None
1219
  and layer_idx >= config.first_k_dense_replace
1220
  and layer_idx % config.moe_layer_freq == 0
1221
  )
1222
- else DeepseekMLP(config)
1223
  )
1224
- self.input_layernorm = DeepseekRMSNorm(
1225
  config.hidden_size, eps=config.rms_norm_eps
1226
  )
1227
- self.post_attention_layernorm = DeepseekRMSNorm(
1228
  config.hidden_size, eps=config.rms_norm_eps
1229
  )
1230
 
@@ -1291,7 +1267,7 @@ class DeepseekDecoderLayer(nn.Module):
1291
  return outputs
1292
 
1293
 
1294
- Deepseek_START_DOCSTRING = r"""
1295
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1296
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1297
  etc.)
@@ -1301,7 +1277,7 @@ Deepseek_START_DOCSTRING = r"""
1301
  and behavior.
1302
 
1303
  Parameters:
1304
- config ([`DeepseekConfig`]):
1305
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1306
  load the weights associated with the model, only the configuration. Check out the
1307
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1309,14 +1285,14 @@ Deepseek_START_DOCSTRING = r"""
1309
 
1310
 
1311
  @add_start_docstrings(
1312
- "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
1313
- Deepseek_START_DOCSTRING,
1314
  )
1315
- class DeepseekPreTrainedModel(PreTrainedModel):
1316
- config_class = DeepseekConfig
1317
  base_model_prefix = "model"
1318
  supports_gradient_checkpointing = True
1319
- _no_split_modules = ["DeepseekDecoderLayer"]
1320
  _skip_keys_device_placement = "past_key_values"
1321
  _supports_flash_attn_2 = True
1322
  _supports_sdpa = True
@@ -1334,7 +1310,7 @@ class DeepseekPreTrainedModel(PreTrainedModel):
1334
  module.weight.data[module.padding_idx].zero_()
1335
 
1336
 
1337
- Deepseek_INPUTS_DOCSTRING = r"""
1338
  Args:
1339
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1340
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1405,18 +1381,18 @@ Deepseek_INPUTS_DOCSTRING = r"""
1405
 
1406
 
1407
  @add_start_docstrings(
1408
- "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
1409
- Deepseek_START_DOCSTRING,
1410
  )
1411
- class DeepseekModel(DeepseekPreTrainedModel):
1412
  """
1413
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekDecoderLayer`]
1414
 
1415
  Args:
1416
- config: DeepseekConfig
1417
  """
1418
 
1419
- def __init__(self, config: DeepseekConfig):
1420
  super().__init__(config)
1421
  self.padding_idx = config.pad_token_id
1422
  self.vocab_size = config.vocab_size
@@ -1426,13 +1402,13 @@ class DeepseekModel(DeepseekPreTrainedModel):
1426
  )
1427
  self.layers = nn.ModuleList(
1428
  [
1429
- DeepseekDecoderLayer(config, layer_idx)
1430
  for layer_idx in range(config.num_hidden_layers)
1431
  ]
1432
  )
1433
  self._use_sdpa = config._attn_implementation == "sdpa"
1434
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1435
- self.norm = DeepseekRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1436
 
1437
  self.gradient_checkpointing = False
1438
  # Initialize weights and apply final processing
@@ -1444,7 +1420,7 @@ class DeepseekModel(DeepseekPreTrainedModel):
1444
  def set_input_embeddings(self, value):
1445
  self.embed_tokens = value
1446
 
1447
- @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
1448
  def forward(
1449
  self,
1450
  input_ids: torch.LongTensor = None,
@@ -1604,12 +1580,12 @@ class DeepseekModel(DeepseekPreTrainedModel):
1604
  )
1605
 
1606
 
1607
- class DeepseekForCausalLM(DeepseekPreTrainedModel):
1608
  _tied_weights_keys = ["lm_head.weight"]
1609
 
1610
  def __init__(self, config):
1611
  super().__init__(config)
1612
- self.model = DeepseekModel(config)
1613
  self.vocab_size = config.vocab_size
1614
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1615
 
@@ -1634,7 +1610,7 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
1634
  def get_decoder(self):
1635
  return self.model
1636
 
1637
- @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
1638
  @replace_return_docstrings(
1639
  output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
1640
  )
@@ -1663,9 +1639,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
1663
  Example:
1664
 
1665
  ```python
1666
- >>> from transformers import AutoTokenizer, DeepseekForCausalLM
1667
 
1668
- >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1669
  >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1670
 
1671
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1811,9 +1787,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
1811
 
1812
  @add_start_docstrings(
1813
  """
1814
- The Deepseek Model transformer with a sequence classification head on top (linear layer).
1815
 
1816
- [`DeepseekForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1817
  (e.g. GPT-2) do.
1818
 
1819
  Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1822,13 +1798,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
1822
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1823
  each row of the batch).
1824
  """,
1825
- Deepseek_START_DOCSTRING,
1826
  )
1827
- class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
1828
  def __init__(self, config):
1829
  super().__init__(config)
1830
  self.num_labels = config.num_labels
1831
- self.model = DeepseekModel(config)
1832
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1833
 
1834
  # Initialize weights and apply final processing
@@ -1840,7 +1816,7 @@ class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
1840
  def set_input_embeddings(self, value):
1841
  self.model.embed_tokens = value
1842
 
1843
- @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
1844
  def forward(
1845
  self,
1846
  input_ids: torch.LongTensor = None,
 
55
  replace_return_docstrings,
56
  )
57
  from transformers.utils.import_utils import is_torch_fx_available
58
+ from .configuration_deepseek import DeepseekV2Config
59
  import torch.distributed as dist
60
  import numpy as np
61
 
 
75
 
76
  logger = logging.get_logger(__name__)
77
 
78
+ _CONFIG_FOR_DOC = "DeepseekV2Config"
79
 
80
 
81
  def _get_unpad_data(attention_mask):
 
92
  )
93
 
94
 
95
+ class DeepseekV2RMSNorm(nn.Module):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def __init__(self, hidden_size, eps=1e-6):
97
  """
98
+ DeepseekV2RMSNorm is equivalent to T5LayerNorm
99
  """
100
  super().__init__()
101
  self.weight = nn.Parameter(torch.ones(hidden_size))
 
109
  return self.weight * hidden_states.to(input_dtype)
110
 
111
 
112
+ ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)
113
 
114
 
115
+ class DeepseekV2RotaryEmbedding(nn.Module):
116
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
117
  super().__init__()
118
 
 
155
  )
156
 
157
 
158
+ # Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
159
+ class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
160
+ """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
161
 
162
  def __init__(
163
  self,
 
184
  self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
185
 
186
 
187
+ # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
188
+ class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
189
+ """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
190
 
191
  def __init__(
192
  self,
 
260
  return ramp_func
261
 
262
 
263
+ class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
264
 
265
  def __init__(
266
  self,
 
372
  return q_embed, k_embed
373
 
374
 
375
+ class DeepseekV2MLP(nn.Module):
376
  def __init__(self, config, hidden_size=None, intermediate_size=None):
377
  super().__init__()
378
  self.config = config
 
519
  return grad_output, grad_loss
520
 
521
 
522
+ class DeepseekV2MoE(nn.Module):
523
  """
524
  A mixed expert module containing shared experts.
525
  """
 
537
  self.experts = nn.ModuleList(
538
  [
539
  (
540
+ DeepseekV2MLP(
541
  config, intermediate_size=config.moe_intermediate_size
542
  )
543
  if i >= self.ep_rank * self.experts_per_rank
 
553
  self.ep_rank = 0
554
  self.experts = nn.ModuleList(
555
  [
556
+ DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
557
  for i in range(config.n_routed_experts)
558
  ]
559
  )
560
  self.gate = MoEGate(config)
561
  if config.n_shared_experts is not None:
562
  intermediate_size = config.moe_intermediate_size * config.n_shared_experts
563
+ self.shared_experts = DeepseekV2MLP(
564
  config=config, intermediate_size=intermediate_size
565
  )
566
 
 
678
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
679
 
680
 
681
+ # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
682
+ class DeepseekV2Attention(nn.Module):
683
  """Multi-headed attention from 'Attention Is All You Need' paper"""
684
 
685
+ def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
686
  super().__init__()
687
  self.config = config
688
  self.layer_idx = layer_idx
 
711
  self.q_a_proj = nn.Linear(
712
  self.hidden_size, config.q_lora_rank, bias=config.attention_bias
713
  )
714
+ self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
715
  self.q_b_proj = nn.Linear(
716
  config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
717
  )
 
721
  config.kv_lora_rank + config.qk_rope_head_dim,
722
  bias=config.attention_bias,
723
  )
724
+ self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
725
  self.kv_b_proj = nn.Linear(
726
  config.kv_lora_rank,
727
  self.num_heads
 
746
 
747
  def _init_rope(self):
748
  if self.config.rope_scaling is None:
749
+ self.rotary_emb = DeepseekV2RotaryEmbedding(
750
  self.qk_rope_head_dim,
751
  max_position_embeddings=self.max_position_embeddings,
752
  base=self.rope_theta,
 
755
  scaling_type = self.config.rope_scaling["type"]
756
  scaling_factor = self.config.rope_scaling["factor"]
757
  if scaling_type == "linear":
758
+ self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
759
  self.qk_rope_head_dim,
760
  max_position_embeddings=self.max_position_embeddings,
761
  scaling_factor=scaling_factor,
762
  base=self.rope_theta,
763
  )
764
  elif scaling_type == "dynamic":
765
+ self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
766
  self.qk_rope_head_dim,
767
  max_position_embeddings=self.max_position_embeddings,
768
  scaling_factor=scaling_factor,
 
780
  ]
781
  if key in self.config.rope_scaling
782
  }
783
+ self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
784
  self.qk_rope_head_dim,
785
  max_position_embeddings=self.max_position_embeddings,
786
  scaling_factor=scaling_factor,
 
903
  return attn_output, attn_weights, past_key_value
904
 
905
 
906
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
907
+ class DeepseekV2FlashAttention2(DeepseekV2Attention):
908
  """
909
+ DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
910
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
911
  flash attention and deal with padding tokens in case the input contains any of them.
912
  """
 
929
  use_cache: bool = False,
930
  **kwargs,
931
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
932
+ # DeepseekV2FlashAttention2 attention does not support output_attentions
933
  if "padding_mask" in kwargs:
934
  warnings.warn(
935
  "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
 
1003
  # therefore the input hidden states gets silently casted in float32. Hence, we need
1004
  # cast them back in the correct dtype just to be sure everything works as expected.
1005
  # This might slowdown training & inference so it is recommended to not cast the LayerNorms
1006
+ # in fp32. (DeepseekV2RMSNorm handles it correctly)
1007
 
1008
  input_dtype = query_states.dtype
1009
  if input_dtype == torch.float32:
 
1079
  if not self._flash_attn_uses_top_left_mask:
1080
  causal = self.is_causal
1081
  else:
1082
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
1083
  causal = self.is_causal and query_length != 1
1084
 
1085
  # Contains at least one padding token in the sequence
 
1174
 
1175
 
1176
  ATTENTION_CLASSES = {
1177
+ "eager": DeepseekV2Attention,
1178
+ "flash_attention_2": DeepseekV2FlashAttention2,
1179
  }
1180
 
1181
 
1182
+ class DeepseekV2DecoderLayer(nn.Module):
1183
+ def __init__(self, config: DeepseekV2Config, layer_idx: int):
1184
  super().__init__()
1185
  self.hidden_size = config.hidden_size
1186
 
 
1189
  )
1190
 
1191
  self.mlp = (
1192
+ DeepseekV2MoE(config)
1193
  if (
1194
  config.n_routed_experts is not None
1195
  and layer_idx >= config.first_k_dense_replace
1196
  and layer_idx % config.moe_layer_freq == 0
1197
  )
1198
+ else DeepseekV2MLP(config)
1199
  )
1200
+ self.input_layernorm = DeepseekV2RMSNorm(
1201
  config.hidden_size, eps=config.rms_norm_eps
1202
  )
1203
+ self.post_attention_layernorm = DeepseekV2RMSNorm(
1204
  config.hidden_size, eps=config.rms_norm_eps
1205
  )
1206
 
 
1267
  return outputs
1268
 
1269
 
1270
+ DeepseekV2_START_DOCSTRING = r"""
1271
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1272
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1273
  etc.)
 
1277
  and behavior.
1278
 
1279
  Parameters:
1280
+ config ([`DeepseekV2Config`]):
1281
  Model configuration class with all the parameters of the model. Initializing with a config file does not
1282
  load the weights associated with the model, only the configuration. Check out the
1283
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 
1285
 
1286
 
1287
  @add_start_docstrings(
1288
+ "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
1289
+ DeepseekV2_START_DOCSTRING,
1290
  )
1291
+ class DeepseekV2PreTrainedModel(PreTrainedModel):
1292
+ config_class = DeepseekV2Config
1293
  base_model_prefix = "model"
1294
  supports_gradient_checkpointing = True
1295
+ _no_split_modules = ["DeepseekV2DecoderLayer"]
1296
  _skip_keys_device_placement = "past_key_values"
1297
  _supports_flash_attn_2 = True
1298
  _supports_sdpa = True
 
1310
  module.weight.data[module.padding_idx].zero_()
1311
 
1312
 
1313
+ DeepseekV2_INPUTS_DOCSTRING = r"""
1314
  Args:
1315
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1316
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
 
1381
 
1382
 
1383
  @add_start_docstrings(
1384
+ "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
1385
+ DeepseekV2_START_DOCSTRING,
1386
  )
1387
+ class DeepseekV2Model(DeepseekV2PreTrainedModel):
1388
  """
1389
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
1390
 
1391
  Args:
1392
+ config: DeepseekV2Config
1393
  """
1394
 
1395
+ def __init__(self, config: DeepseekV2Config):
1396
  super().__init__(config)
1397
  self.padding_idx = config.pad_token_id
1398
  self.vocab_size = config.vocab_size
 
1402
  )
1403
  self.layers = nn.ModuleList(
1404
  [
1405
+ DeepseekV2DecoderLayer(config, layer_idx)
1406
  for layer_idx in range(config.num_hidden_layers)
1407
  ]
1408
  )
1409
  self._use_sdpa = config._attn_implementation == "sdpa"
1410
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1411
+ self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1412
 
1413
  self.gradient_checkpointing = False
1414
  # Initialize weights and apply final processing
 
1420
  def set_input_embeddings(self, value):
1421
  self.embed_tokens = value
1422
 
1423
+ @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
1424
  def forward(
1425
  self,
1426
  input_ids: torch.LongTensor = None,
 
1580
  )
1581
 
1582
 
1583
+ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
1584
  _tied_weights_keys = ["lm_head.weight"]
1585
 
1586
  def __init__(self, config):
1587
  super().__init__(config)
1588
+ self.model = DeepseekV2Model(config)
1589
  self.vocab_size = config.vocab_size
1590
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1591
 
 
1610
  def get_decoder(self):
1611
  return self.model
1612
 
1613
+ @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
1614
  @replace_return_docstrings(
1615
  output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
1616
  )
 
1639
  Example:
1640
 
1641
  ```python
1642
+ >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
1643
 
1644
+ >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1645
  >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1646
 
1647
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
 
1787
 
1788
  @add_start_docstrings(
1789
  """
1790
+ The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
1791
 
1792
+ [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1793
  (e.g. GPT-2) do.
1794
 
1795
  Since it does classification on the last token, it requires to know the position of the last token. If a
 
1798
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1799
  each row of the batch).
1800
  """,
1801
+ DeepseekV2_START_DOCSTRING,
1802
  )
1803
+ class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
1804
  def __init__(self, config):
1805
  super().__init__(config)
1806
  self.num_labels = config.num_labels
1807
+ self.model = DeepseekV2Model(config)
1808
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1809
 
1810
  # Initialize weights and apply final processing
 
1816
  def set_input_embeddings(self, value):
1817
  self.model.embed_tokens = value
1818
 
1819
+ @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
1820
  def forward(
1821
  self,
1822
  input_ids: torch.LongTensor = None,