Use try-except for flash_attn import (#4)

Files changed (1): modeling_deepseek.py (+3 -27)

modeling_deepseek.py CHANGED
@@ -58,10 +58,11 @@ from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
-if is_flash_attn_2_available():
+try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
+except ImportError:
+    pass
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
@@ -338,7 +339,6 @@ def rotate_half(x):
 # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -1076,7 +1076,6 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1287,11 +1286,9 @@ DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1330,50 +1327,38 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
             shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
             cache format.
-
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
-
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -1402,7 +1387,6 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
 class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
-
     Args:
         config: DeepseekV2Config
     """
@@ -1638,20 +1622,14 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
-
         >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1793,10 +1771,8 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
 @add_start_docstrings(
     """
     The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
-
     [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
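For context, the substantive change in this diff (the rest is whitespace-only docstring cleanup) swaps the `is_flash_attn_2_available()` gate for a plain optional-import guard: the `flash_attn` symbols are imported when the package can be loaded, and the `ImportError` is swallowed otherwise. A minimal, self-contained sketch of that pattern is below; the `_flash_attn_available` flag and `flash_attn_supported()` helper are illustrative names for this sketch, not part of modeling_deepseek.py.

```python
# Sketch of the optional-dependency import guard adopted in the diff above.
# `_flash_attn_available` and `flash_attn_supported` are illustrative only.
try:
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    _flash_attn_available = True
except ImportError:
    # flash-attn is optional; downstream code must not assume these names exist.
    flash_attn_func = None
    flash_attn_varlen_func = None
    _flash_attn_available = False


def flash_attn_supported() -> bool:
    """Report whether the fused FlashAttention kernels can actually be called."""
    return _flash_attn_available
```

Presumably the point of the try/except form is that the module stays importable even when the environment check and the locally installed package disagree (for example, the check passes but the import itself fails); the trade-off is that any failure is deferred to the first call site that actually uses the FlashAttention code path.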