LiangliangMa committed
Commit 75f2232
1 Parent(s): 604d566

Use try-except for flash_attn import


This PR avoids a hard failure in transformers' import-dependency check for this remote modeling file.
If a user has not installed flash_attn, the import check for flash_attn fails; the same problem affects non-CUDA users.

Wrapping the import in a try-except block solves this, because transformers filters out imports that appear inside try-except blocks when it collects a module's dependencies: https://github.com/huggingface/transformers/blob/main/src/transformers/dynamic_module_utils.py#L155.
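A minimal sketch of that behavior (the scratch file names below are made up for the example): `get_imports` from `transformers.dynamic_module_utils` should report `flash_attn` as a required dependency for a bare import, but not for the guarded one.

```python
# Sketch showing why the try/except wrapper avoids the hard dependency check:
# get_imports() ignores imports inside try/except blocks when it collects a
# remote module's dependencies. File names here are hypothetical scratch files.
from pathlib import Path

from transformers.dynamic_module_utils import get_imports

guarded = (
    "try:\n"
    "    from flash_attn import flash_attn_func\n"
    "except ImportError:\n"
    "    pass\n"
)
unguarded = "from flash_attn import flash_attn_func\n"

Path("guarded_example.py").write_text(guarded)
Path("unguarded_example.py").write_text(unguarded)

print(get_imports("guarded_example.py"))    # expected: [] -- flash_attn is filtered out
print(get_imports("unguarded_example.py"))  # expected: ["flash_attn"]
```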

Files changed (1)
  1. modeling_deepseek.py  +3 -28
modeling_deepseek.py CHANGED
@@ -48,7 +48,6 @@ from transformers.pytorch_utils import (
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
@@ -58,10 +57,11 @@ from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
-if is_flash_attn_2_available():
+try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
+except ImportError:
+    pass
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
@@ -338,7 +338,6 @@ def rotate_half(x):
 # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -1076,7 +1075,6 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1287,11 +1285,9 @@ DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1330,50 +1326,38 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
             shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
             cache format.
-
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
-
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -1402,7 +1386,6 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
 class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
-
     Args:
         config: DeepseekV2Config
     """
@@ -1638,20 +1621,14 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
-
         >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1793,10 +1770,8 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
 @add_start_docstrings(
     """
     The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
-
     [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
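With the guarded import in place, a host without flash_attn (or without CUDA) should still be able to load this remote modeling code, as long as it does not select the flash-attention path. A hedged usage sketch, not part of this commit; the repo id and the choice of `attn_implementation="eager"` are assumptions made for illustration:

```python
# Hypothetical usage on a machine without flash_attn/CUDA. The repo id is an
# assumption; substitute whichever repository ships this modeling_deepseek.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/DeepSeek-V2"  # assumed example repo

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,       # downloads and imports modeling_deepseek.py
    attn_implementation="eager",  # avoid the flash-attention code path entirely
    torch_dtype=torch.bfloat16,
)
```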