The config class and config.json use DeepseekConfig, not DeepseekV2Config

#5
Files changed (1)
  1. modeling_deepseek.py +7 -7
modeling_deepseek.py CHANGED
@@ -54,7 +54,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekV2Config
+from .configuration_deepseek import DeepseekConfig
 import torch.distributed as dist
 import numpy as np
 
@@ -681,7 +681,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -1190,7 +1190,7 @@ ATTENTION_CLASSES = {
 
 
 class DeepseekV2DecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekV2Config, layer_idx: int):
+    def __init__(self, config: DeepseekConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1287,7 +1287,7 @@ DeepseekV2_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekV2Config`]):
+        config ([`DeepseekConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1299,7 +1299,7 @@ DeepseekV2_START_DOCSTRING = r"""
     DeepseekV2_START_DOCSTRING,
 )
 class DeepseekV2PreTrainedModel(PreTrainedModel):
-    config_class = DeepseekV2Config
+    config_class = DeepseekConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["DeepseekV2DecoderLayer"]
@@ -1398,10 +1398,10 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekV2Config
+        config: DeepseekConfig
     """
 
-    def __init__(self, config: DeepseekV2Config):
+    def __init__(self, config: DeepseekConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
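
For context on why the rename matters: configuration_deepseek.py in this repo defines DeepseekConfig, and config.json references that class, so the old DeepseekV2Config import fails as soon as modeling_deepseek.py is pulled in via remote code. Below is a minimal sketch of the expected behaviour once this change is applied, assuming the usual trust_remote_code loading path; the repo id is a placeholder, not the actual checkpoint name.

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder repo id; substitute the checkpoint that ships this remote code.
model_id = "your-org/your-deepseek-checkpoint"

# Before this change, loading aborted while importing modeling_deepseek.py with
# an ImportError along the lines of:
#   cannot import name 'DeepseekV2Config' from '...configuration_deepseek'
# because configuration_deepseek.py only defines DeepseekConfig.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

print(type(config).__name__)  # expected: DeepseekConfig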