LiangliangMa committed
Commit 75f2232
1 Parent(s): 604d566

Use try-except for flash_attn import


This PR avoids a hard failure in transformers' import-dependency check for this remote modeling file.
If a user has not installed flash_attn, the import check for flash_attn fails; the same problem affects non-CUDA users.

Wrapping the import in a try-except block solves this, because transformers filters out imports that appear inside try-except blocks when it collects a module's dependencies: https://github.com/huggingface/transformers/blob/main/src/transformers/dynamic_module_utils.py#L155.
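A minimal sketch of that behavior (the scratch file names below are made up for the example): `get_imports` from `transformers.dynamic_module_utils` should report `flash_attn` as a required dependency for a bare import, but not for the guarded one.

```python
# Sketch showing why the try/except wrapper avoids the hard dependency check:
# get_imports() ignores imports inside try/except blocks when it collects a
# remote module's dependencies. File names here are hypothetical scratch files.
from pathlib import Path

from transformers.dynamic_module_utils import get_imports

guarded = (
    "try:\n"
    "    from flash_attn import flash_attn_func\n"
    "except ImportError:\n"
    "    pass\n"
)
unguarded = "from flash_attn import flash_attn_func\n"

Path("guarded_example.py").write_text(guarded)
Path("unguarded_example.py").write_text(unguarded)

print(get_imports("guarded_example.py"))    # expected: [] -- flash_attn is filtered out
print(get_imports("unguarded_example.py"))  # expected: ["flash_attn"]
```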

Files changed (1)
  1. modeling_deepseek.py  +3 -28
modeling_deepseek.py CHANGED
@@ -48,7 +48,6 @@ from transformers.pytorch_utils import (
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
@@ -58,10 +57,11 @@ from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
-if is_flash_attn_2_available():
+try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
+except ImportError:
+    pass
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
@@ -338,7 +338,6 @@ def rotate_half(x):
 # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
-
     Args:
         q (`torch.Tensor`): The query tensor.
         k (`torch.Tensor`): The key tensor.
@@ -1076,7 +1075,6 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-
         Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
@@ -1287,11 +1285,9 @@ DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1330,50 +1326,38 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance;
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
             shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
             cache format.
-
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
-
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
@@ -1402,7 +1386,6 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
 class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
-
     Args:
         config: DeepseekV2Config
     """
@@ -1638,20 +1621,14 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
-
         >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1793,10 +1770,8 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
 @add_start_docstrings(
     """
     The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
-
     [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
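With the guarded import in place, a host without flash_attn (or without CUDA) should still be able to load this remote modeling code, as long as it does not select the flash-attention path. A hedged usage sketch, not part of this commit; the repo id and the choice of `attn_implementation="eager"` are assumptions made for illustration:

```python
# Hypothetical usage on a machine without flash_attn/CUDA. The repo id is an
# assumption; substitute whichever repository ships this modeling_deepseek.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "deepseek-ai/DeepSeek-V2"  # assumed example repo

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,       # downloads and imports modeling_deepseek.py
    attn_implementation="eager",  # avoid the flash-attention code path entirely
    torch_dtype=torch.bfloat16,
)
```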