Keep getting AssertionError: Flash Attention is not available when loading the model
#7 · opened by Complete-your-profile
I followed the instructions in the model card and confirmed I'm using transformers 4.40.2.
Any ideas?
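For context, this is roughly the load snippet that triggers the traceback below. It is a minimal sketch in the style of the model card; the torch_dtype and device_map arguments are my assumptions, but trust_remote_code=True is needed because the model ships its own modeling_phi3_small.py:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-small-8k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # assumption: bf16 on a CUDA device
    device_map="cuda",           # assumption: single-GPU placement
    trust_remote_code=True,      # required: Phi-3-small uses remote custom code
)
# The AssertionError is raised while the decoder layers are being constructed.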
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:558, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
556 else:
557 cls.register(config.__class__, model_class, exist_ok=True)
--> 558 return model_class.from_pretrained(
559 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
560 )
561 elif type(config) in cls._model_mapping.keys():
562 model_class = _get_model_class(config, cls._model_mapping)
File ~/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/transformers/modeling_utils.py:3550, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3544 config = cls._autoset_attn_implementation(
3545 config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map
3546 )
3548 with ContextManagers(init_contexts):
3549 # Let's make sure we don't run the init function of buffer modules
-> 3550 model = cls(config, *model_args, **model_kwargs)
3552 # make sure we use the model's config since the __init__ call might have copied it
3553 config = model.config
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/1adb635233ffce9e13385862a4111606d4382762/modeling_phi3_small.py:903, in Phi3SmallForCausalLM.__init__(self, config)
901 def __init__(self, config):
902 super().__init__(config)
--> 903 self.model = Phi3SmallModel(config)
904 self.vocab_size = config.vocab_size
905 self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/1adb635233ffce9e13385862a4111606d4382762/modeling_phi3_small.py:745, in Phi3SmallModel.__init__(self, config)
742 # MuP Embedding scaling
743 self.mup_embedding_multiplier = config.mup_embedding_multiplier
--> 745 self.layers = nn.ModuleList([Phi3SmallDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
747 self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
749 self.gradient_checkpointing = False
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/1adb635233ffce9e13385862a4111606d4382762/modeling_phi3_small.py:745, in <listcomp>(.0)
742 # MuP Embedding scaling
743 self.mup_embedding_multiplier = config.mup_embedding_multiplier
--> 745 self.layers = nn.ModuleList([Phi3SmallDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
747 self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
749 self.gradient_checkpointing = False
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/1adb635233ffce9e13385862a4111606d4382762/modeling_phi3_small.py:651, in Phi3SmallDecoderLayer.__init__(self, config, layer_idx)
649 super().__init__()
650 self.hidden_size = config.hidden_size
--> 651 self.self_attn = Phi3SmallSelfAttention(config, layer_idx)
652 self.mlp = Phi3SmallMLP(config)
654 self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-small-8k-instruct/1adb635233ffce9e13385862a4111606d4382762/modeling_phi3_small.py:218, in Phi3SmallSelfAttention.__init__(self, config, layer_idx)
213 if self.config.dense_attention_every_n_layers and ((self.layer_idx + 1) % self.config.dense_attention_every_n_layers == 0):
214 logger.info(
215 f"Layer {layer_idx + 1} is using dense attention since it is divisible by "
216 f"{self.config.dense_attention_every_n_layers}"
217 )
--> 218 assert is_flash_attention_available, "Flash Attention is not available, but is needed for dense attention"
219 else:
220 # BlockSparse related Parameters
221 self.blocksparse_params = BlockSparseParams.from_config(config)
AssertionError: Flash Attention is not available, but is needed for dense attention
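In case it helps anyone hitting the same error: the assertion fires when a dense-attention layer is built and the flash_attn package cannot be imported. A quick environment check (a sketch only; the actual detection logic lives in modeling_phi3_small.py and may differ) looks like this:

import importlib.util
import torch

# The guard in the traceback effectively requires the flash_attn package.
print("CUDA available:   ", torch.cuda.is_available())
print("flash_attn found: ", importlib.util.find_spec("flash_attn") is not None)

# Typical fix, assuming an Ampere-or-newer GPU and a matching CUDA toolchain:
#   pip install flash-attn --no-build-isolation
# then retry loading the model.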
bapatra changed discussion status to closed