d-Matrix committed
Commit 3c69922 (1 parent: a26e7df)
Update modeling_llama.py

modeling_llama.py CHANGED (+26 -2)
@@ -167,6 +167,7 @@ def rotate_half(x):
 
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
+
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -488,6 +489,7 @@ class LlamaFlashAttention2(LlamaAttention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
+
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -753,9 +755,11 @@ LLAMA_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
+
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
+
     Parameters:
         config ([`LlamaConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -816,38 +820,50 @@ LLAMA_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
+
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
+
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
+
             [What are attention masks?](../glossary#attention-mask)
+
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
+
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
+
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
+
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
+
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
              shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
              cache format.
+
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
+
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -880,6 +896,7 @@ LLAMA_INPUTS_DOCSTRING = r"""
 class LlamaModel(LlamaPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
     Args:
         config: LlamaConfig
     """
@@ -942,7 +959,7 @@ class LlamaModel(LlamaPreTrainedModel):
 
         past_seen_tokens = 0
         if use_cache:  # kept for BC (cache positions)
-            if not isinstance(past_key_values, StaticCache):
+            if past_key_values and not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 past_seen_tokens = past_key_values.get_seq_length()
 
@@ -1146,14 +1163,20 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
        Returns:
+
        Example:
+
        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+
        >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1308,8 +1331,10 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
 @add_start_docstrings(
     """
     The LLaMa Model transformer with a sequence classification head on top (linear layer).
+
     [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
+
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
@@ -1523,4 +1548,3 @@ class LlamaForQuestionAnswering(LlamaPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
-
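Note: the only behavioral change in this commit is the added `past_key_values and` guard in `LlamaModel.forward` (the hunk around new line 962); every other hunk restores blank lines inside docstrings. The snippet below is a minimal sketch of what that guard changes, not the model code itself: `convert_cache` is a hypothetical standalone helper, and the imports assume a transformers release where `DynamicCache` and `StaticCache` are importable from `transformers.cache_utils`.

```python
# Minimal sketch of the guarded cache conversion from this commit.
# Assumes transformers >= ~4.38 (DynamicCache/StaticCache in cache_utils);
# `convert_cache` is illustrative and not part of the model source.
import torch
from transformers.cache_utils import DynamicCache, StaticCache


def convert_cache(past_key_values, use_cache=True):
    past_seen_tokens = 0
    if use_cache:  # kept for BC (cache positions)
        # New guard: only convert when a non-empty legacy cache is passed in.
        # The previous code called DynamicCache.from_legacy_cache even when
        # past_key_values was None (the first forward pass), producing an
        # empty DynamicCache.
        if past_key_values and not isinstance(past_key_values, StaticCache):
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_seen_tokens = past_key_values.get_seq_length()
    return past_key_values, past_seen_tokens


# First decoding step: no cache yet, so nothing is converted and
# past_seen_tokens stays 0.
cache, seen = convert_cache(None)
assert cache is None and seen == 0

# Later steps: a legacy tuple-of-tuples cache, one (key, value) pair per
# layer with tensors of shape (batch_size, num_heads, sequence_length,
# embed_size_per_head), is still converted to a DynamicCache as before.
k = v = torch.zeros(1, 32, 5, 128)
cache, seen = convert_cache(((k, v),))
assert isinstance(cache, DynamicCache) and seen == 5
```

Under this reading, the guard avoids wrapping an absent cache into an empty `DynamicCache` on the first forward pass, while leaving the legacy-tuple conversion path described in the `past_key_values` docstring unchanged.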