vaibhavad committed on
Commit
33d5ca9
·
verified ·
1 Parent(s): 4f499ad

Create modeling_llama_encoder.py

Browse files
Files changed (1) hide show
  1. modeling_llama_encoder.py +200 -0
modeling_llama_encoder.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import torch
3
+ from transformers import LlamaModel, LlamaPreTrainedModel
4
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm, LlamaConfig, LlamaMLP, LlamaAttention, LlamaFlashAttention2, LlamaSdpaAttention
5
+ from transformers.utils import logging
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+ from transformers.modeling_outputs import BaseModelOutputWithPast
9
+ from transformers.cache_utils import Cache, DynamicCache
10
+ from .attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_attention_mask
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
class ModifiedLlamaAttention(LlamaAttention):
    """Eager Llama attention layer with its ``is_causal`` flag forced to False.

    Construction is delegated unchanged to ``LlamaAttention``; the only
    difference is the flag override applied afterwards.

    NOTE(review): how ``is_causal`` is consumed is decided by the parent class
    in the installed ``transformers`` version -- confirm it is honoured there.
    """

    def __init__(self, *args, **kwargs):
        # Let the stock layer build itself first, then override the flag so the
        # parent constructor cannot reset it.
        super().__init__(*args, **kwargs)
        self.is_causal = False
19
+
20
+
21
class ModifiedLlamaFlashAttention2(LlamaFlashAttention2):
    """Flash-Attention-2 Llama attention layer with ``is_causal`` forced to False.

    All constructor arguments are forwarded verbatim to
    ``LlamaFlashAttention2``; only the ``is_causal`` attribute is changed.

    NOTE(review): how ``is_causal`` is consumed is decided by the parent class
    in the installed ``transformers`` version -- confirm it is honoured there.
    """

    def __init__(self, *args, **kwargs):
        # Override after the parent constructor so its own assignment of the
        # flag does not win.
        super().__init__(*args, **kwargs)
        self.is_causal = False
26
+
27
+
28
class ModifiedLlamaSdpaAttention(LlamaSdpaAttention):
    """SDPA-backed Llama attention layer with ``is_causal`` forced to False.

    All constructor arguments are forwarded verbatim to
    ``LlamaSdpaAttention``; only the ``is_causal`` attribute is changed.

    NOTE(review): how ``is_causal`` is consumed is decided by the parent class
    in the installed ``transformers`` version -- confirm it is honoured there.
    """

    def __init__(self, *args, **kwargs):
        # Override after the parent constructor so its own assignment of the
        # flag does not win.
        super().__init__(*args, **kwargs)
        self.is_causal = False
33
+
34
+
35
# Lookup table from an attention-implementation name (the value stored in
# ``config._attn_implementation``) to the non-causal attention class defined
# in this module for that backend.
LLAMA_ATTENTION_CLASSES = {
    "eager": ModifiedLlamaAttention,
    "flash_attention_2": ModifiedLlamaFlashAttention2,
    "sdpa": ModifiedLlamaSdpaAttention,
}
40
+
41
+
42
class ModifiedLlamaDecoderLayer(LlamaDecoderLayer):
    """Llama decoder layer assembled around one of the modified attention classes.

    The submodules are created here by hand, picking the attention class from
    ``LLAMA_ATTENTION_CLASSES``. No ``forward`` is defined in this class, so the
    one inherited from ``LlamaDecoderLayer`` is used.

    Args:
        config: Llama configuration supplying ``hidden_size``,
            ``rms_norm_eps`` and ``_attn_implementation``.
        layer_idx: Index of this layer, forwarded to the attention module.

    Raises:
        KeyError: If ``config._attn_implementation`` is not a key of
            ``LLAMA_ATTENTION_CLASSES``.
    """

    def __init__(self, config: LlamaConfig, layer_idx: int):
        # Only nn.Module's constructor runs; LlamaDecoderLayer.__init__ is
        # skipped on purpose (presumably so it does not instantiate the stock
        # attention module -- confirm against the installed transformers).
        nn.Module.__init__(self)

        width = config.hidden_size
        norm_eps = config.rms_norm_eps
        self.hidden_size = width

        # Submodules are registered in the same order as the original code so
        # parameter ordering and initialisation order are unchanged.
        attention_cls = LLAMA_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(width, eps=norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(width, eps=norm_eps)
52
+
53
+
54
class LlamaEncoderModel(LlamaModel):
    """Llama backbone built from ``ModifiedLlamaDecoderLayer`` blocks.

    The layer stack uses the attention classes of this module (which set
    ``is_causal = False``), and ``forward`` builds its attention mask with the
    ``_prepare_4d_attention_mask*`` helpers imported from ``attn_mask_utils``.
    """

    def __init__(self, config):
        """Create the embedding table, the layer stack and the final norm.

        Args:
            config: Llama configuration. Reads ``pad_token_id``,
                ``vocab_size``, ``hidden_size``, ``num_hidden_layers``,
                ``rms_norm_eps`` and ``_attn_implementation``.
        """
        # LlamaModel.__init__ is skipped on purpose; only the pretrained-model
        # base initialiser runs, and the submodules are created below.
        LlamaPreTrainedModel.__init__(self, config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)

        blocks = [ModifiedLlamaDecoderLayer(config, idx) for idx in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(blocks)

        # Remember which attention backend is active; forward() picks the
        # attention-mask format from these two flags.
        attn_impl = config._attn_implementation
        self._use_sdpa = attn_impl == "sdpa"
        self._use_flash_attention_2 = attn_impl == "flash_attention_2"

        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing.
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the layer stack over token ids or precomputed embeddings.

        Args:
            input_ids: Token ids; the first two dimensions are taken as
                ``(batch_size, seq_length)``. Mutually exclusive with
                ``inputs_embeds``.
            attention_mask: Optional mask handed to the mask-preparation
                helpers (presumably a 2D padding mask -- confirm against
                ``attn_mask_utils``).
            position_ids: Optional position indices. When omitted they are
                generated as a contiguous range starting at the cached length.
            past_key_values: Optional key/value cache, either a ``Cache``
                instance or the legacy format accepted by
                ``DynamicCache.from_legacy_cache``.
            inputs_embeds: Precomputed embeddings used instead of looking up
                ``input_ids``.
            use_cache: Whether to return the key/value cache. Defaults to
                ``config.use_cache``; forced to False while gradient
                checkpointing is active in training mode.
            output_attentions: Whether to collect per-layer attention outputs.
                Defaults to ``config.output_attentions``.
            output_hidden_states: Whether to collect the hidden states fed to
                every layer plus the final normalised state. Defaults to
                ``config.output_hidden_states``.
            return_dict: Whether to return a ``BaseModelOutputWithPast``
                instead of a tuple. Defaults to ``config.use_return_dict``.

        Returns:
            A ``BaseModelOutputWithPast``, or, when ``return_dict`` is falsy,
            a tuple of the same fields with ``None`` entries dropped.

        Raises:
            ValueError: If both or neither of ``input_ids`` and
                ``inputs_embeds`` are given.
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if use_cache is None:
            use_cache = cfg.use_cache
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Exactly one of input_ids / inputs_embeds must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Whichever input was given provides the shape and, later, the device.
        primary_input = input_ids if input_ids is not None else inputs_embeds
        batch_size, seq_length = primary_input.shape[:2]

        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        past_key_values_length = 0
        if use_cache:
            # Normalise a legacy cache to a Cache object; the flag is kept so
            # the result can be converted back before returning.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            # Default positions continue from the cached length.
            first_position = past_key_values_length
            position_ids = torch.arange(
                first_position,
                seq_length + first_position,
                dtype=torch.long,
                device=primary_input.device,
            ).unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # Flash attention takes the 2D mask as-is, and only when it
            # actually contains a zero entry; otherwise no mask is passed.
            if attention_mask is None or 0 not in attention_mask:
                attention_mask = None
        elif self._use_sdpa and not output_attentions:
            # SDPA-specific mask preparation. With output_attentions=True this
            # branch is skipped and the generic 4D mask below is used instead.
            attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # Generic 4D mask, passed through every layer.
            attention_mask = _prepare_4d_attention_mask(
                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
            )

        hidden_states = inputs_embeds

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None
        # The cache sits after the attention output in a layer's result tuple
        # when attention outputs are requested, otherwise right after the
        # hidden states.
        cache_position = 2 if output_attentions else 1

        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[cache_position]

            if output_attentions:
                all_self_attns = all_self_attns + (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # The normalised output of the last layer closes the hidden-state list.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = None
        if use_cache:
            # Return the cache in the same format the caller passed in.
            if use_legacy_cache:
                next_cache = next_decoder_cache.to_legacy_cache()
            else:
                next_cache = next_decoder_cache

        if not return_dict:
            fields = [hidden_states, next_cache, all_hidden_states, all_self_attns]
            return tuple(field for field in fields if field is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )