d-Matrix committed on
Commit 3c69922
1 Parent(s): a26e7df

Update modeling_llama.py

Files changed (1)
  1. modeling_llama.py +26 -2
modeling_llama.py CHANGED
@@ -167,6 +167,7 @@ def rotate_half(x):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
+
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -488,6 +489,7 @@ class LlamaFlashAttention2(LlamaAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
+
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -753,9 +755,11 @@ LLAMA_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
+
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
+
     Parameters:
         config ([`LlamaConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -816,38 +820,50 @@ LLAMA_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
+
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
+
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
+
             [What are attention masks?](../glossary#attention-mask)
+
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
+
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
+
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
+
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
+
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
             shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
             cache format.
+
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
+
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -880,6 +896,7 @@ LLAMA_INPUTS_DOCSTRING = r"""
 class LlamaModel(LlamaPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
     Args:
         config: LlamaConfig
     """
@@ -942,7 +959,7 @@ class LlamaModel(LlamaPreTrainedModel):
 
         past_seen_tokens = 0
         if use_cache:  # kept for BC (cache positions)
-            if not isinstance(past_key_values, StaticCache):
+            if past_key_values and not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 past_seen_tokens = past_key_values.get_seq_length()
 
@@ -1146,14 +1163,20 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
         Returns:
+
         Example:
+
         ```python
         >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
         >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
         >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
+
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1308,8 +1331,10 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
 @add_start_docstrings(
     """
     The LLaMa Model transformer with a sequence classification head on top (linear layer).
+
     [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
+
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
@@ -1523,4 +1548,3 @@ class LlamaForQuestionAnswering(LlamaPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
-
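The only functional change in this commit is the extra `past_key_values and` guard in `LlamaModel.forward`; the remaining hunks restore blank lines inside docstrings. Below is a minimal sketch of how that guard behaves, assuming a transformers release that ships `DynamicCache` and `StaticCache` in `transformers.cache_utils` (4.38 or later); `normalize_cache` is a hypothetical stand-in for the patched branch, not code from this file.

```python
# Minimal sketch, not d-Matrix's code: `normalize_cache` mirrors the patched branch above,
# assuming transformers >= 4.38 (DynamicCache and StaticCache in transformers.cache_utils).
import torch
from transformers.cache_utils import DynamicCache, StaticCache


def normalize_cache(past_key_values, use_cache=True):
    """Wrap a non-empty legacy cache in a DynamicCache; leave a None/empty cache untouched."""
    past_seen_tokens = 0
    if use_cache:  # kept for BC (cache positions)
        # Patched condition: the added `past_key_values and` guard skips conversion on the
        # first forward pass, when no cache has been produced yet.
        if past_key_values and not isinstance(past_key_values, StaticCache):
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_seen_tokens = past_key_values.get_seq_length()
    return past_key_values, past_seen_tokens


# First decoding step: no cache yet, so nothing is wrapped and zero tokens are counted.
assert normalize_cache(None) == (None, 0)

# Later steps: a legacy tuple-of-tuples cache (one (key, value) pair per layer, each of
# shape (batch, num_heads, seq_len, head_dim)) is converted and its length read back.
legacy = tuple((torch.zeros(1, 8, 4, 64), torch.zeros(1, 8, 4, 64)) for _ in range(2))
cache, seen = normalize_cache(legacy)
assert isinstance(cache, DynamicCache) and seen == 4
```

With the original condition, a `None` cache was immediately replaced by an empty `DynamicCache`; with the guard, it stays `None` and only a genuinely populated legacy cache is converted.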