x54-729 committed on
Commit
5370161
1 Parent(s): a80bd54

Update configuration_internlm2.py

Browse files
Files changed (1) hide show
  1. configuration_internlm2.py +42 -13
configuration_internlm2.py CHANGED
@@ -44,9 +44,9 @@ class InternLM2Config(PretrainedConfig):
44
  intermediate_size (`int`, *optional*, defaults to 11008):
45
  Dimension of the MLP representations.
46
  num_hidden_layers (`int`, *optional*, defaults to 32):
47
- Number of hidden layers in the Transformer encoder.
48
  num_attention_heads (`int`, *optional*, defaults to 32):
49
- Number of attention heads for each attention layer in the Transformer encoder.
50
  num_key_value_heads (`int`, *optional*):
51
  This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
  `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
@@ -58,22 +58,42 @@ class InternLM2Config(PretrainedConfig):
58
  hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
  The non-linear activation function (function or string) in the decoder.
60
  max_position_embeddings (`int`, *optional*, defaults to 2048):
61
- The maximum sequence length that this model might ever be used with. Typically set this to something large
62
- just in case (e.g., 512 or 1024 or 2048).
63
  initializer_range (`float`, *optional*, defaults to 0.02):
64
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
- rms_norm_eps (`float`, *optional*, defaults to 1e-12):
66
  The epsilon used by the rms normalization layers.
67
  use_cache (`bool`, *optional*, defaults to `True`):
68
  Whether or not the model should return the last key/values attentions (not used by all models). Only
69
  relevant if `config.is_decoder=True`.
70
- tie_word_embeddings(`bool`, *optional*, defaults to `False`):
 
 
 
 
 
 
 
 
 
 
 
 
71
  Whether to tie weight embeddings
72
- Example:
73
-
 
 
 
 
 
 
 
 
74
  """
75
- model_type = "internlm2"
76
  _auto_class = "AutoConfig"
 
 
77
 
78
  def __init__( # pylint: disable=W0102
79
  self,
@@ -91,11 +111,12 @@ class InternLM2Config(PretrainedConfig):
91
  pad_token_id=0,
92
  bos_token_id=1,
93
  eos_token_id=2,
 
94
  tie_word_embeddings=False,
95
  bias=True,
96
  rope_theta=10000,
97
  rope_scaling=None,
98
- attn_implementation="eager",
99
  **kwargs,
100
  ):
101
  self.vocab_size = vocab_size
@@ -113,14 +134,15 @@ class InternLM2Config(PretrainedConfig):
113
  self.hidden_act = hidden_act
114
  self.initializer_range = initializer_range
115
  self.rms_norm_eps = rms_norm_eps
 
116
  self.use_cache = use_cache
117
  self.rope_theta = rope_theta
118
  self.rope_scaling = rope_scaling
119
  self._rope_scaling_validation()
120
-
121
  self.attn_implementation = attn_implementation
122
  if self.attn_implementation is None:
123
  self.attn_implementation = "eager"
 
124
  super().__init__(
125
  pad_token_id=pad_token_id,
126
  bos_token_id=bos_token_id,
@@ -147,5 +169,12 @@ class InternLM2Config(PretrainedConfig):
147
  raise ValueError(
148
  f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
149
  )
150
- if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
151
- raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
 
 
 
 
 
 
 
 
44
  intermediate_size (`int`, *optional*, defaults to 11008):
45
  Dimension of the MLP representations.
46
  num_hidden_layers (`int`, *optional*, defaults to 32):
47
+ Number of hidden layers in the Transformer decoder.
48
  num_attention_heads (`int`, *optional*, defaults to 32):
49
+ Number of attention heads for each attention layer in the Transformer decoder.
50
  num_key_value_heads (`int`, *optional*):
51
  This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
  `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
 
58
  hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
  The non-linear activation function (function or string) in the decoder.
60
  max_position_embeddings (`int`, *optional*, defaults to 2048):
61
+ The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768 tokens.
 
62
  initializer_range (`float`, *optional*, defaults to 0.02):
63
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
65
  The epsilon used by the rms normalization layers.
66
  use_cache (`bool`, *optional*, defaults to `True`):
67
  Whether or not the model should return the last key/values attentions (not used by all models). Only
68
  relevant if `config.is_decoder=True`.
69
+ pad_token_id (`int`, *optional*, defaults to 0):
70
+ Padding token id.
71
+ bos_token_id (`int`, *optional*, defaults to 1):
72
+ Beginning of stream token id.
73
+ eos_token_id (`int`, *optional*, defaults to 2):
74
+ End of stream token id.
75
+ pretraining_tp (`int`, *optional*, defaults to 1):
76
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
77
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
78
+ to understand more about it. This value is necessary to ensure exact reproducibility
79
+ of the pretraining results. Please refer to [this
80
+ issue](https://github.com/pytorch/pytorch/issues/76232).
81
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
  Whether to tie weight embeddings.
83
+ rope_theta (`float`, *optional*, defaults to 10000.0):
84
+ The base period of the RoPE embeddings.
85
+ rope_scaling (`Dict`, *optional*):
86
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
87
+ strategies: linear and dynamic. Their scaling factor must be a number (`int` or `float`) greater than or equal to 1. The expected format is
88
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
89
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
90
+ these scaling strategies behave:
91
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
92
+ experimental feature, subject to breaking API changes in future versions.
93
  """
 
94
  _auto_class = "AutoConfig"
95
+ model_type = "internlm2"
96
+ keys_to_ignore_at_inference = ["past_key_values"]
97
 
98
  def __init__( # pylint: disable=W0102
99
  self,
 
111
  pad_token_id=0,
112
  bos_token_id=1,
113
  eos_token_id=2,
114
+ pretraining_tp=1,
115
  tie_word_embeddings=False,
116
  bias=True,
117
  rope_theta=10000,
118
  rope_scaling=None,
119
+ attn_implementation=None,
120
  **kwargs,
121
  ):
122
  self.vocab_size = vocab_size
 
134
  self.hidden_act = hidden_act
135
  self.initializer_range = initializer_range
136
  self.rms_norm_eps = rms_norm_eps
137
+ self.pretraining_tp = pretraining_tp
138
  self.use_cache = use_cache
139
  self.rope_theta = rope_theta
140
  self.rope_scaling = rope_scaling
141
  self._rope_scaling_validation()
 
142
  self.attn_implementation = attn_implementation
143
  if self.attn_implementation is None:
144
  self.attn_implementation = "eager"
145
+
146
  super().__init__(
147
  pad_token_id=pad_token_id,
148
  bos_token_id=bos_token_id,
 
169
  raise ValueError(
170
  f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
171
  )
172
+ if (
173
+ rope_scaling_factor is None
174
+ or not isinstance(rope_scaling_factor, (float, int))
175
+ or rope_scaling_factor < 1.0
176
+ ):
177
+ raise ValueError(
178
+ f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
179
+ f"of type {type(rope_scaling_factor)}"
180
+ )