yihongLiu committed
Commit 5ecbec1 (verified)
1 Parent(s): e181e70

Upload 2 files

Files changed (2)
  1. full_dicts.pkl +3 -0
  2. modeling_xlmr_decoupled.py +1008 -0
full_dicts.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3553ee9bb39b51124ca2e3b0b5227f6b765cde8f7f77d5832e7be731830029b0
3
+ size 9608
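The file above is stored as a Git LFS pointer; the actual ~9.6 kB pickle is fetched with `git lfs pull`. Below is a minimal inspection sketch. The contents of `full_dicts.pkl` are an assumption (the name suggests lookup dictionaries, e.g. language/script to id mappings used to build `token_lang_ids` and `token_script_ids` for the model below), so the snippet only prints what is actually there.

import pickle

# Hypothetical inspection of full_dicts.pkl after `git lfs pull`; the exact
# structure of the pickled object is not documented here, so nothing is assumed
# beyond it being loadable with the standard pickle module.
with open("full_dicts.pkl", "rb") as f:
    full_dicts = pickle.load(f)

print(type(full_dicts))
if isinstance(full_dicts, dict):
    for key in list(full_dicts)[:5]:
        print(key, type(full_dicts[key]))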
modeling_xlmr_decoupled.py ADDED
@@ -0,0 +1,1008 @@
1
+ # coding=utf-8
2
+ # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch XLM-RoBERTa model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN, gelu
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPastAndCrossAttentions,
29
+ BaseModelOutputWithPoolingAndCrossAttentions,
30
+ CausalLMOutputWithCrossAttentions,
31
+ MaskedLMOutput,
32
+ MultipleChoiceModelOutput,
33
+ QuestionAnsweringModelOutput,
34
+ SequenceClassifierOutput,
35
+ TokenClassifierOutput,
36
+ )
37
+ from transformers.modeling_utils import PreTrainedModel
38
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
39
+ from transformers.utils import (
40
+ add_code_sample_docstrings,
41
+ add_start_docstrings,
42
+ add_start_docstrings_to_model_forward,
43
+ logging,
44
+ replace_return_docstrings,
45
+ )
46
+
47
+ from transformers.models.xlm_roberta.modeling_xlm_roberta import (
48
+ XLMRobertaEncoder,
49
+ XLMRobertaPooler,
50
+ XLMRobertaPreTrainedModel,
51
+ XLMRobertaClassificationHead
52
+ )
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+ _CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
57
+ _CONFIG_FOR_DOC = "XLMRobertaConfig"
58
+
59
+
60
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->XLMRoberta
61
+ class XLMRobertaDecoupledEmbeddings(nn.Module):
62
+ """
63
+ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing, plus optional language and script embeddings.
64
+ """
65
+
66
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
67
+ def __init__(self, config):
68
+ super().__init__()
69
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
70
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
71
+ # self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
72
+
73
+ # this is used as the language embedding
74
+ if config.use_lang_embedding is True:
75
+ self.lang_type_embeddings = nn.Embedding(config.lang_size, config.hidden_size)
76
+
77
+ # this is used as the script embedding
78
+ if config.use_script_embedding is True:
79
+ self.script_type_embeddings = nn.Embedding(config.script_size, config.hidden_size)
80
+
81
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
82
+ # any TensorFlow checkpoint file
83
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
84
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
85
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
86
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
87
+ self.register_buffer(
88
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
89
+ )
90
+ self.register_buffer(
91
+ "token_lang_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
92
+ )
93
+ self.register_buffer(
94
+ "token_script_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
95
+ )
96
+
97
+ # End copy
98
+ self.padding_idx = config.pad_token_id
99
+ self.position_embeddings = nn.Embedding(
100
+ config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
101
+ )
102
+ self.config = config
103
+
104
+ def forward(
105
+ self, token_lang_ids=None, token_script_ids=None,
106
+ input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
107
+ ):
108
+ if position_ids is None:
109
+ if input_ids is not None:
110
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
111
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
112
+ else:
113
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
114
+
115
+ if inputs_embeds is None:
116
+ inputs_embeds = self.word_embeddings(input_ids)
117
+
118
+ if self.config.decouple_at_input_embeddings:
119
+ if token_lang_ids is None and self.config.use_lang_embedding is True:
120
+ raise ValueError("token_lang_ids cannot be None if use_lang_embed is True")
121
+ else:
122
+ if self.config.use_lang_embedding is True:
123
+ token_lang_embeddings = self.lang_type_embeddings(token_lang_ids)
124
+ inputs_embeds = inputs_embeds + token_lang_embeddings
125
+ if token_script_ids is None and self.config.use_script_embedding is True:
126
+ raise ValueError("token_script_ids cannot be None if use_script_embedding is True")
127
+ else:
128
+ if self.config.use_script_embedding:
129
+ token_script_embeddings = self.script_type_embeddings(token_script_ids)
130
+ inputs_embeds = inputs_embeds + token_script_embeddings
131
+
132
+ embeddings = inputs_embeds
133
+
134
+ if self.position_embedding_type == "absolute":
135
+ position_embeddings = self.position_embeddings(position_ids)
136
+ embeddings += position_embeddings
137
+ embeddings = self.LayerNorm(embeddings)
138
+ embeddings = self.dropout(embeddings)
139
+ return embeddings
140
+
141
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
142
+ """
143
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
144
+
145
+ Args:
146
+ inputs_embeds: torch.Tensor
147
+
148
+ Returns: torch.Tensor
149
+ """
150
+ input_shape = inputs_embeds.size()[:-1]
151
+ sequence_length = input_shape[1]
152
+
153
+ position_ids = torch.arange(
154
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
155
+ )
156
+ return position_ids.unsqueeze(0).expand(input_shape)
157
+
158
+
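# Illustrative sketch (not part of the uploaded module): a self-contained mirror of the
# composition implemented by XLMRobertaDecoupledEmbeddings above, i.e. word embeddings
# plus optional language and script embeddings, plus absolute position embeddings,
# followed by LayerNorm and dropout. Sizes are toy values, not XLM-R's real dimensions.
import torch
from torch import nn

vocab_size, lang_size, script_size, hidden_size, pad_idx = 100, 5, 3, 16, 1

word_emb = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_idx)
lang_emb = nn.Embedding(lang_size, hidden_size)
script_emb = nn.Embedding(script_size, hidden_size)
pos_emb = nn.Embedding(32, hidden_size, padding_idx=pad_idx)
layer_norm = nn.LayerNorm(hidden_size, eps=1e-5)
dropout = nn.Dropout(0.1)

input_ids = torch.tensor([[0, 52, 31, 2, 1, 1]])     # last two tokens are padding
token_lang_ids = torch.full_like(input_ids, 2)       # every token tagged with language id 2
token_script_ids = torch.full_like(input_ids, 1)     # every token tagged with script id 1

# Position ids as in create_position_ids_from_input_ids at the bottom of this file:
mask = input_ids.ne(pad_idx).int()
position_ids = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + pad_idx

embeddings = word_emb(input_ids) + lang_emb(token_lang_ids) + script_emb(token_script_ids)
embeddings = embeddings + pos_emb(position_ids)
embeddings = dropout(layer_norm(embeddings))
print(embeddings.shape)                              # torch.Size([1, 6, 16])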
159
+ XLM_ROBERTA_START_DOCSTRING = r"""
160
+
161
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
162
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
163
+ etc.)
164
+
165
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
166
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
167
+ and behavior.
168
+
169
+ Parameters:
170
+ config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
171
+ model. Initializing with a config file does not load the weights associated with the model, only the
172
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
173
+ """
174
+
175
+ XLM_ROBERTA_INPUTS_DOCSTRING = r"""
176
+ Args:
177
+ input_ids (`torch.LongTensor` of shape `({0})`):
178
+ Indices of input sequence tokens in the vocabulary.
179
+
180
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
181
+ [`PreTrainedTokenizer.__call__`] for details.
182
+
183
+ [What are input IDs?](../glossary#input-ids)
184
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
185
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
186
+
187
+ - 1 for tokens that are **not masked**,
188
+ - 0 for tokens that are **masked**.
189
+
190
+ [What are attention masks?](../glossary#attention-mask)
191
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
192
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
193
+ 1]`:
194
+
195
+ - 0 corresponds to a *sentence A* token,
196
+ - 1 corresponds to a *sentence B* token.
197
+
198
+ [What are token type IDs?](../glossary#token-type-ids)
199
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
200
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
201
+ config.max_position_embeddings - 1]`.
202
+
203
+ [What are position IDs?](../glossary#position-ids)
204
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
205
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
206
+
207
+ - 1 indicates the head is **not masked**,
208
+ - 0 indicates the head is **masked**.
209
+
210
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
211
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
212
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
213
+ model's internal embedding lookup matrix.
214
+ output_attentions (`bool`, *optional*):
215
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
216
+ tensors for more detail.
217
+ output_hidden_states (`bool`, *optional*):
218
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
219
+ more detail.
220
+ return_dict (`bool`, *optional*):
221
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
222
+ """
223
+
224
+
225
+ @add_start_docstrings(
226
+ "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
227
+ XLM_ROBERTA_START_DOCSTRING,
228
+ )
229
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
230
+ class XLMRobertaDecoupledModel(XLMRobertaPreTrainedModel):
231
+ """
232
+
233
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
234
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
235
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
236
+ Kaiser and Illia Polosukhin.
237
+
238
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
239
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
240
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
241
+
242
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
243
+
244
+ """
245
+
246
+ # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->XLMRoberta
247
+ def __init__(self, config, add_pooling_layer=True):
248
+ super().__init__(config)
249
+ self.config = config
250
+
251
+ self.embeddings = XLMRobertaDecoupledEmbeddings(config)
252
+ self.encoder = XLMRobertaEncoder(config)
253
+
254
+ self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
255
+
256
+ # Initialize weights and apply final processing
257
+ self.post_init()
258
+
259
+ def get_input_embeddings(self):
260
+ return self.embeddings.word_embeddings
261
+
262
+ def set_input_embeddings(self, value):
263
+ self.embeddings.word_embeddings = value
264
+
265
+ def _prune_heads(self, heads_to_prune):
266
+ """
267
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
268
+ class PreTrainedModel
269
+ """
270
+ for layer, heads in heads_to_prune.items():
271
+ self.encoder.layer[layer].attention.prune_heads(heads)
272
+
273
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
274
+ @add_code_sample_docstrings(
275
+ checkpoint=_CHECKPOINT_FOR_DOC,
276
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
277
+ config_class=_CONFIG_FOR_DOC,
278
+ )
279
+ # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward
280
+ def forward(
281
+ self,
282
+ input_ids: Optional[torch.Tensor] = None,
283
+ attention_mask: Optional[torch.Tensor] = None,
284
+ token_lang_ids: Optional[torch.Tensor] = None,
285
+ token_script_ids: Optional[torch.Tensor] = None,
286
+ position_ids: Optional[torch.Tensor] = None,
287
+ head_mask: Optional[torch.Tensor] = None,
288
+ inputs_embeds: Optional[torch.Tensor] = None,
289
+ encoder_hidden_states: Optional[torch.Tensor] = None,
290
+ encoder_attention_mask: Optional[torch.Tensor] = None,
291
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
292
+ use_cache: Optional[bool] = None,
293
+ output_attentions: Optional[bool] = None,
294
+ output_hidden_states: Optional[bool] = None,
295
+ return_dict: Optional[bool] = None,
296
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
297
+ r"""
298
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
299
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
300
+ the model is configured as a decoder.
301
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
302
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
303
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
304
+
305
+ - 1 for tokens that are **not masked**,
306
+ - 0 for tokens that are **masked**.
307
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
308
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
309
+
310
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
311
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
312
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
313
+ use_cache (`bool`, *optional*):
314
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
315
+ `past_key_values`).
316
+ """
317
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
318
+ output_hidden_states = (
319
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
320
+ )
321
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
322
+
323
+ if self.config.is_decoder:
324
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
325
+ else:
326
+ use_cache = False
327
+
328
+ if input_ids is not None and inputs_embeds is not None:
329
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
330
+ elif input_ids is not None:
331
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
332
+ input_shape = input_ids.size()
333
+ elif inputs_embeds is not None:
334
+ input_shape = inputs_embeds.size()[:-1]
335
+ else:
336
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
337
+
338
+ batch_size, seq_length = input_shape
339
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
340
+
341
+ # past_key_values_length
342
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
343
+
344
+ if attention_mask is None:
345
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
346
+
347
+ if self.config.decouple_at_input_embeddings:
348
+ if token_lang_ids is None and self.config.use_lang_embedding is True:
349
+ raise ValueError("token_lang_ids cannot be None if use_lang_embed is True")
350
+ if token_script_ids is None and self.config.use_script_embedding is True:
351
+ raise ValueError("token_script_ids cannot be None if use_script_embedding is True")
352
+
353
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
354
+ # ourselves in which case we just need to make it broadcastable to all heads.
355
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
356
+
357
+ # If a 2D or 3D attention mask is provided for the cross-attention
358
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
359
+ if self.config.is_decoder and encoder_hidden_states is not None:
360
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
361
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
362
+ if encoder_attention_mask is None:
363
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
364
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
365
+ else:
366
+ encoder_extended_attention_mask = None
367
+
368
+ # Prepare head mask if needed
369
+ # 1.0 in head_mask indicate we keep the head
370
+ # attention_probs has shape bsz x n_heads x N x N
371
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
372
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
373
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
374
+
375
+ embedding_output = self.embeddings(
376
+ input_ids=input_ids,
377
+ position_ids=position_ids,
378
+ token_lang_ids=token_lang_ids,
379
+ token_script_ids=token_script_ids,
380
+ inputs_embeds=inputs_embeds,
381
+ past_key_values_length=past_key_values_length,
382
+ )
383
+ encoder_outputs = self.encoder(
384
+ embedding_output,
385
+ attention_mask=extended_attention_mask,
386
+ head_mask=head_mask,
387
+ encoder_hidden_states=encoder_hidden_states,
388
+ encoder_attention_mask=encoder_extended_attention_mask,
389
+ past_key_values=past_key_values,
390
+ use_cache=use_cache,
391
+ output_attentions=output_attentions,
392
+ output_hidden_states=output_hidden_states,
393
+ return_dict=return_dict,
394
+ )
395
+ sequence_output = encoder_outputs[0]
396
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
397
+
398
+ if not return_dict:
399
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
400
+
401
+ return BaseModelOutputWithPoolingAndCrossAttentions(
402
+ last_hidden_state=sequence_output,
403
+ pooler_output=pooled_output,
404
+ past_key_values=encoder_outputs.past_key_values,
405
+ hidden_states=encoder_outputs.hidden_states,
406
+ attentions=encoder_outputs.attentions,
407
+ cross_attentions=encoder_outputs.cross_attentions,
408
+ )
409
+
410
+
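# Illustrative usage sketch (not part of the uploaded module). It assumes this file is
# importable as `modeling_xlmr_decoupled` and a transformers version that still exposes
# the classes imported at the top. The extra config fields (use_lang_embedding, lang_size,
# use_script_embedding, script_size, decouple_at_input_embeddings) are not standard
# XLMRobertaConfig attributes; they are the ones this module reads from `config`, passed
# here as keyword arguments.
import torch
from transformers import XLMRobertaConfig
from modeling_xlmr_decoupled import XLMRobertaDecoupledModel

config = XLMRobertaConfig(
    vocab_size=1000, hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
    intermediate_size=64, max_position_embeddings=64,
    use_lang_embedding=True, lang_size=10,
    use_script_embedding=True, script_size=4,
    decouple_at_input_embeddings=True,
)
model = XLMRobertaDecoupledModel(config)            # randomly initialised, for shape checking only

input_ids = torch.randint(4, config.vocab_size, (2, 8))
token_lang_ids = torch.zeros_like(input_ids)        # language id 0 for every token
token_script_ids = torch.zeros_like(input_ids)      # script id 0 for every token
outputs = model(input_ids=input_ids,
                token_lang_ids=token_lang_ids,
                token_script_ids=token_script_ids)
print(outputs.last_hidden_state.shape)              # torch.Size([2, 8, 32])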
411
+ @add_start_docstrings(
412
+ """XLM-RoBERTa Model with a `language modeling` head on top.""",
413
+ XLM_ROBERTA_START_DOCSTRING,
414
+ )
415
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
416
+ class XLMRobertaDecoupledForMaskedLM(XLMRobertaPreTrainedModel):
417
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
418
+
419
+ def __init__(self, config):
420
+ super().__init__(config)
421
+
422
+ if config.is_decoder:
423
+ logger.warning(
424
+ "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
425
+ "bi-directional self-attention."
426
+ )
427
+
428
+ self.roberta = XLMRobertaDecoupledModel(config, add_pooling_layer=False)
429
+ self.lm_head = XLMRobertaDecoupledLMHead(config)
430
+ self.config = config
431
+ # Initialize weights and apply final processing
432
+ self.post_init()
433
+
434
+ def get_output_embeddings(self):
435
+ return self.lm_head.decoder
436
+
437
+ def set_output_embeddings(self, new_embeddings):
438
+ self.lm_head.decoder = new_embeddings
439
+
440
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
441
+ @add_code_sample_docstrings(
442
+ checkpoint=_CHECKPOINT_FOR_DOC,
443
+ output_type=MaskedLMOutput,
444
+ config_class=_CONFIG_FOR_DOC,
445
+ mask="<mask>",
446
+ expected_output="' Paris'",
447
+ expected_loss=0.1,
448
+ )
449
+ def forward(
450
+ self,
451
+ input_ids: Optional[torch.LongTensor] = None,
452
+ attention_mask: Optional[torch.FloatTensor] = None,
453
+ token_lang_ids: Optional[torch.Tensor] = None,
454
+ token_script_ids: Optional[torch.Tensor] = None,
455
+ position_ids: Optional[torch.LongTensor] = None,
456
+ head_mask: Optional[torch.FloatTensor] = None,
457
+ inputs_embeds: Optional[torch.FloatTensor] = None,
458
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
459
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
460
+ labels: Optional[torch.LongTensor] = None,
461
+ output_attentions: Optional[bool] = None,
462
+ output_hidden_states: Optional[bool] = None,
463
+ return_dict: Optional[bool] = None,
464
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
465
+ r"""
466
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
467
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
468
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
469
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
470
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
471
+ Used to hide legacy arguments that have been deprecated.
472
+ """
473
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
474
+
475
+ outputs = self.roberta(
476
+ input_ids,
477
+ attention_mask=attention_mask,
478
+ token_lang_ids=token_lang_ids,
479
+ token_script_ids=token_script_ids,
480
+ position_ids=position_ids,
481
+ head_mask=head_mask,
482
+ inputs_embeds=inputs_embeds,
483
+ encoder_hidden_states=encoder_hidden_states,
484
+ encoder_attention_mask=encoder_attention_mask,
485
+ output_attentions=output_attentions,
486
+ output_hidden_states=output_hidden_states,
487
+ return_dict=return_dict,
488
+ )
489
+ sequence_output = outputs[0]
490
+
491
+ if self.config.use_lang_embedding:
492
+ if token_lang_ids is None:
493
+ raise ValueError("token_lang_ids cannot be None if use_lang_embedding is True")
494
+ token_lang_embeddings = self.roberta.embeddings.lang_type_embeddings(token_lang_ids)
495
+ else:
496
+ token_lang_embeddings = None
497
+
498
+ if self.config.use_script_embedding:
499
+ if token_script_ids is None:
500
+ raise ValueError("token_script_ids cannot be None if use_script_embedding is True")
501
+ token_script_embeddings = self.roberta.embeddings.script_type_embeddings(token_script_ids)
502
+ else:
503
+ token_script_embeddings = None
504
+
505
+ # select only the outputs and labels at masked positions (labels != -100)
506
+ if labels is not None:
507
+ valid_tokens = labels != -100
508
+ valid_tokens = valid_tokens.unsqueeze(-1).expand_as(sequence_output)
509
+ filtered_output = torch.masked_select(sequence_output, valid_tokens).view(-1, sequence_output.size(-1))
510
+ if token_lang_embeddings is not None:
511
+ token_lang_embeddings = \
512
+ torch.masked_select(token_lang_embeddings, valid_tokens).view(-1, token_lang_embeddings.size(-1))
513
+ if token_script_embeddings is not None:
514
+ token_script_embeddings = \
515
+ torch.masked_select(token_script_embeddings, valid_tokens).view(-1, token_script_embeddings.size(-1))
516
+ filtered_labels = torch.masked_select(labels, labels != -100)
517
+ sequence_output = filtered_output
518
+ labels = filtered_labels
519
+
520
+ prediction_scores = self.lm_head(sequence_output, token_lang_embeddings, token_script_embeddings)
521
+ masked_lm_loss = None
522
+ if labels is not None:
523
+ # move labels to correct device to enable model parallelism
524
+ labels = labels.to(prediction_scores.device)
525
+ loss_fct = CrossEntropyLoss()
526
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
527
+
528
+ if not return_dict:
529
+ output = (prediction_scores,) + outputs[2:]
530
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
531
+
532
+ return MaskedLMOutput(
533
+ loss=masked_lm_loss,
534
+ logits=prediction_scores,
535
+ hidden_states=outputs.hidden_states,
536
+ attentions=outputs.attentions,
537
+ )
538
+
539
+
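# Illustrative sketch (not part of the uploaded module) of the filtering step in
# XLMRobertaDecoupledForMaskedLM.forward above: only positions whose label is not -100
# (i.e. the masked tokens) are kept before the LM head and the cross-entropy loss.
import torch

hidden_size = 4
sequence_output = torch.randn(2, 5, hidden_size)          # (batch, seq_len, hidden)
labels = torch.full((2, 5), -100)
labels[0, 2] = 7                                           # real labels only at masked positions
labels[1, 4] = 3

valid_tokens = (labels != -100)
filtered_output = torch.masked_select(
    sequence_output, valid_tokens.unsqueeze(-1).expand_as(sequence_output)
).view(-1, hidden_size)                                    # (num_masked, hidden)
filtered_labels = torch.masked_select(labels, valid_tokens)
print(filtered_output.shape, filtered_labels)              # torch.Size([2, 4]) tensor([7, 3])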
540
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
541
+ class XLMRobertaDecoupledLMHead(nn.Module):
542
+ """Roberta Head for masked language modeling."""
543
+
544
+ def __init__(self, config):
545
+ super().__init__()
546
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
547
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
548
+
549
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
550
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
551
+ self.decoder.bias = self.bias
552
+ self.config = config
553
+
554
+ def forward(self, features, language_embeddings, script_embeddings, **kwargs):
555
+ if language_embeddings is not None:
556
+ features = features + language_embeddings
557
+ if script_embeddings is not None:
558
+ features = features + script_embeddings
559
+ x = self.dense(features)
560
+ x = gelu(x)
561
+ x = self.layer_norm(x)
562
+
563
+ # project back to size of vocabulary with bias
564
+ x = self.decoder(x)
565
+
566
+ return x
567
+
568
+ def _tie_weights(self):
569
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
570
+ # For accelerate compatibility and to not break backward compatibility
571
+ if self.decoder.bias.device.type == "meta":
572
+ self.decoder.bias = self.bias
573
+ else:
574
+ self.bias = self.decoder.bias
575
+
576
+
577
+ @add_start_docstrings(
578
+ """
579
+ XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
580
+ pooled output) e.g. for GLUE tasks.
581
+ """,
582
+ XLM_ROBERTA_START_DOCSTRING,
583
+ )
584
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
585
+ class XLMRobertaDecoupledForSequenceClassification(XLMRobertaPreTrainedModel):
586
+ def __init__(self, config):
587
+ super().__init__(config)
588
+ self.num_labels = config.num_labels
589
+ self.config = config
590
+
591
+ self.roberta = XLMRobertaDecoupledModel(config, add_pooling_layer=False)
592
+ self.classifier = XLMRobertaClassificationHead(config)
593
+
594
+ # Initialize weights and apply final processing
595
+ self.post_init()
596
+
597
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
598
+ @add_code_sample_docstrings(
599
+ checkpoint="cardiffnlp/twitter-roberta-base-emotion",
600
+ output_type=SequenceClassifierOutput,
601
+ config_class=_CONFIG_FOR_DOC,
602
+ expected_output="'optimism'",
603
+ expected_loss=0.08,
604
+ )
605
+ def forward(
606
+ self,
607
+ input_ids: Optional[torch.LongTensor] = None,
608
+ attention_mask: Optional[torch.FloatTensor] = None,
609
+ token_lang_ids: Optional[torch.Tensor] = None,
610
+ token_script_ids: Optional[torch.Tensor] = None,
611
+ position_ids: Optional[torch.LongTensor] = None,
612
+ head_mask: Optional[torch.FloatTensor] = None,
613
+ inputs_embeds: Optional[torch.FloatTensor] = None,
614
+ labels: Optional[torch.LongTensor] = None,
615
+ output_attentions: Optional[bool] = None,
616
+ output_hidden_states: Optional[bool] = None,
617
+ return_dict: Optional[bool] = None,
618
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
619
+ r"""
620
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
621
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
622
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
623
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
624
+ """
625
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
626
+
627
+ outputs = self.roberta(
628
+ input_ids,
629
+ attention_mask=attention_mask,
630
+ token_lang_ids=token_lang_ids,
631
+ token_script_ids=token_script_ids,
632
+ position_ids=position_ids,
633
+ head_mask=head_mask,
634
+ inputs_embeds=inputs_embeds,
635
+ output_attentions=output_attentions,
636
+ output_hidden_states=output_hidden_states,
637
+ return_dict=return_dict,
638
+ )
639
+ sequence_output = outputs[0]
640
+ logits = self.classifier(sequence_output)
641
+
642
+ loss = None
643
+ if labels is not None:
644
+ # move labels to correct device to enable model parallelism
645
+ labels = labels.to(logits.device)
646
+ if self.config.problem_type is None:
647
+ if self.num_labels == 1:
648
+ self.config.problem_type = "regression"
649
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
650
+ self.config.problem_type = "single_label_classification"
651
+ else:
652
+ self.config.problem_type = "multi_label_classification"
653
+
654
+ if self.config.problem_type == "regression":
655
+ loss_fct = MSELoss()
656
+ if self.num_labels == 1:
657
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
658
+ else:
659
+ loss = loss_fct(logits, labels)
660
+ elif self.config.problem_type == "single_label_classification":
661
+ loss_fct = CrossEntropyLoss()
662
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
663
+ elif self.config.problem_type == "multi_label_classification":
664
+ loss_fct = BCEWithLogitsLoss()
665
+ loss = loss_fct(logits, labels)
666
+
667
+ if not return_dict:
668
+ output = (logits,) + outputs[2:]
669
+ return ((loss,) + output) if loss is not None else output
670
+
671
+ return SequenceClassifierOutput(
672
+ loss=loss,
673
+ logits=logits,
674
+ hidden_states=outputs.hidden_states,
675
+ attentions=outputs.attentions,
676
+ )
677
+
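# Illustrative sketch (not part of the uploaded module) of how the sequence classification
# loss above is chosen when config.problem_type is unset: a single label means regression,
# integer targets mean single-label classification, anything else is multi-label.
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(4, num_labels)
labels = torch.tensor([0, 2, 1, 1])                        # integer class ids

if num_labels == 1:
    problem_type = "regression"
elif labels.dtype in (torch.long, torch.int):
    problem_type = "single_label_classification"
else:
    problem_type = "multi_label_classification"

if problem_type == "regression":
    loss = MSELoss()(logits.squeeze(), labels.float())
elif problem_type == "single_label_classification":
    loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
else:
    loss = BCEWithLogitsLoss()(logits, labels.float())
print(problem_type, float(loss))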
678
+
679
+ @add_start_docstrings(
680
+ """
681
+ XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
682
+ a softmax) e.g. for RocStories/SWAG tasks.
683
+ """,
684
+ XLM_ROBERTA_START_DOCSTRING,
685
+ )
686
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
687
+ class XLMRobertaDecoupledForMultipleChoice(XLMRobertaPreTrainedModel):
688
+ def __init__(self, config):
689
+ super().__init__(config)
690
+
691
+ self.roberta = XLMRobertaDecoupledModel(config)
692
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
693
+ self.classifier = nn.Linear(config.hidden_size, 1)
694
+
695
+ # Initialize weights and apply final processing
696
+ self.post_init()
697
+
698
+ @add_start_docstrings_to_model_forward(
699
+ XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
700
+ )
701
+ @add_code_sample_docstrings(
702
+ checkpoint=_CHECKPOINT_FOR_DOC,
703
+ output_type=MultipleChoiceModelOutput,
704
+ config_class=_CONFIG_FOR_DOC,
705
+ )
706
+ def forward(
707
+ self,
708
+ input_ids: Optional[torch.LongTensor] = None,
709
+ token_lang_ids: Optional[torch.Tensor] = None,
710
+ token_script_ids: Optional[torch.Tensor] = None,
711
+ attention_mask: Optional[torch.FloatTensor] = None,
712
+ labels: Optional[torch.LongTensor] = None,
713
+ position_ids: Optional[torch.LongTensor] = None,
714
+ head_mask: Optional[torch.FloatTensor] = None,
715
+ inputs_embeds: Optional[torch.FloatTensor] = None,
716
+ output_attentions: Optional[bool] = None,
717
+ output_hidden_states: Optional[bool] = None,
718
+ return_dict: Optional[bool] = None,
719
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
720
+ r"""
721
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
722
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
723
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
724
+ `input_ids` above)
725
+ """
726
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
727
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
728
+
729
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
730
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
731
+ flat_token_lang_ids = token_lang_ids.view(-1, token_lang_ids.size(-1)) if token_lang_ids is not None else None
732
+ flat_token_script_ids = token_script_ids.view(-1, token_script_ids.size(
733
+ -1)) if token_script_ids is not None else None
734
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
735
+ flat_inputs_embeds = (
736
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
737
+ if inputs_embeds is not None
738
+ else None
739
+ )
740
+
741
+ outputs = self.roberta(
742
+ flat_input_ids,
743
+ position_ids=flat_position_ids,
744
+ token_lang_ids=flat_token_lang_ids,
745
+ token_script_ids=flat_token_script_ids,
746
+ attention_mask=flat_attention_mask,
747
+ head_mask=head_mask,
748
+ inputs_embeds=flat_inputs_embeds,
749
+ output_attentions=output_attentions,
750
+ output_hidden_states=output_hidden_states,
751
+ return_dict=return_dict,
752
+ )
753
+ pooled_output = outputs[1]
754
+
755
+ pooled_output = self.dropout(pooled_output)
756
+ logits = self.classifier(pooled_output)
757
+ reshaped_logits = logits.view(-1, num_choices)
758
+
759
+ loss = None
760
+ if labels is not None:
761
+ # move labels to correct device to enable model parallelism
762
+ labels = labels.to(reshaped_logits.device)
763
+ loss_fct = CrossEntropyLoss()
764
+ loss = loss_fct(reshaped_logits, labels)
765
+
766
+ if not return_dict:
767
+ output = (reshaped_logits,) + outputs[2:]
768
+ return ((loss,) + output) if loss is not None else output
769
+
770
+ return MultipleChoiceModelOutput(
771
+ loss=loss,
772
+ logits=reshaped_logits,
773
+ hidden_states=outputs.hidden_states,
774
+ attentions=outputs.attentions,
775
+ )
776
+
777
+
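# Illustrative sketch (not part of the uploaded module) of the reshaping done by
# XLMRobertaDecoupledForMultipleChoice above: choices are flattened into the batch
# dimension for the encoder, and the per-choice scores are folded back afterwards.
import torch

batch_size, num_choices, seq_len = 2, 4, 6
input_ids = torch.randint(5, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))        # (8, 6): one row per choice
per_choice_scores = torch.randn(batch_size * num_choices, 1)   # stand-in for classifier(pooled_output)
reshaped_logits = per_choice_scores.view(-1, num_choices)      # (2, 4): one score per choice
labels = torch.tensor([1, 3])                                  # index of the correct choice
loss = torch.nn.CrossEntropyLoss()(reshaped_logits, labels)
print(flat_input_ids.shape, reshaped_logits.shape, float(loss))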
778
+ @add_start_docstrings(
779
+ """
780
+ XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
781
+ for Named-Entity-Recognition (NER) tasks.
782
+ """,
783
+ XLM_ROBERTA_START_DOCSTRING,
784
+ )
785
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
786
+ class XLMRobertaDecoupledForTokenClassification(XLMRobertaPreTrainedModel):
787
+ def __init__(self, config):
788
+ super().__init__(config)
789
+ self.num_labels = config.num_labels
790
+
791
+ self.roberta = XLMRobertaDecoupledModel(config, add_pooling_layer=False)
792
+ classifier_dropout = (
793
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
794
+ )
795
+ self.dropout = nn.Dropout(classifier_dropout)
796
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
797
+
798
+ # Initialize weights and apply final processing
799
+ self.post_init()
800
+
801
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
802
+ @add_code_sample_docstrings(
803
+ checkpoint="Jean-Baptiste/roberta-large-ner-english",
804
+ output_type=TokenClassifierOutput,
805
+ config_class=_CONFIG_FOR_DOC,
806
+ expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
807
+ expected_loss=0.01,
808
+ )
809
+ def forward(
810
+ self,
811
+ input_ids: Optional[torch.LongTensor] = None,
812
+ attention_mask: Optional[torch.FloatTensor] = None,
813
+ token_lang_ids: Optional[torch.Tensor] = None,
814
+ token_script_ids: Optional[torch.Tensor] = None,
815
+ position_ids: Optional[torch.LongTensor] = None,
816
+ head_mask: Optional[torch.FloatTensor] = None,
817
+ inputs_embeds: Optional[torch.FloatTensor] = None,
818
+ labels: Optional[torch.LongTensor] = None,
819
+ output_attentions: Optional[bool] = None,
820
+ output_hidden_states: Optional[bool] = None,
821
+ return_dict: Optional[bool] = None,
822
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
823
+ r"""
824
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
825
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
826
+ """
827
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
828
+
829
+ outputs = self.roberta(
830
+ input_ids,
831
+ attention_mask=attention_mask,
832
+ token_lang_ids=token_lang_ids,
833
+ token_script_ids=token_script_ids,
834
+ position_ids=position_ids,
835
+ head_mask=head_mask,
836
+ inputs_embeds=inputs_embeds,
837
+ output_attentions=output_attentions,
838
+ output_hidden_states=output_hidden_states,
839
+ return_dict=return_dict,
840
+ )
841
+
842
+ sequence_output = outputs[0]
843
+
844
+ sequence_output = self.dropout(sequence_output)
845
+ logits = self.classifier(sequence_output)
846
+
847
+ loss = None
848
+ if labels is not None:
849
+ # move labels to correct device to enable model parallelism
850
+ labels = labels.to(logits.device)
851
+ loss_fct = CrossEntropyLoss()
852
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
853
+
854
+ if not return_dict:
855
+ output = (logits,) + outputs[2:]
856
+ return ((loss,) + output) if loss is not None else output
857
+
858
+ return TokenClassifierOutput(
859
+ loss=loss,
860
+ logits=logits,
861
+ hidden_states=outputs.hidden_states,
862
+ attentions=outputs.attentions,
863
+ )
864
+
865
+
866
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->XLMRoberta
867
+ class XLMRobertaClassificationHead(nn.Module):
868
+ """Head for sentence-level classification tasks."""
869
+
870
+ def __init__(self, config):
871
+ super().__init__()
872
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
873
+ classifier_dropout = (
874
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
875
+ )
876
+ self.dropout = nn.Dropout(classifier_dropout)
877
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
878
+
879
+ def forward(self, features, **kwargs):
880
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
881
+ x = self.dropout(x)
882
+ x = self.dense(x)
883
+ x = torch.tanh(x)
884
+ x = self.dropout(x)
885
+ x = self.out_proj(x)
886
+ return x
887
+
888
+
889
+ @add_start_docstrings(
890
+ """
891
+ XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
892
+ linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
893
+ """,
894
+ XLM_ROBERTA_START_DOCSTRING,
895
+ )
896
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
897
+ class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel):
898
+ def __init__(self, config):
899
+ super().__init__(config)
900
+ self.num_labels = config.num_labels
901
+
902
+ self.roberta = XLMRobertaDecoupledModel(config, add_pooling_layer=False)
903
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
904
+
905
+ # Initialize weights and apply final processing
906
+ self.post_init()
907
+
908
+ @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
909
+ @add_code_sample_docstrings(
910
+ checkpoint="deepset/roberta-base-squad2",
911
+ output_type=QuestionAnsweringModelOutput,
912
+ config_class=_CONFIG_FOR_DOC,
913
+ expected_output="' puppet'",
914
+ expected_loss=0.86,
915
+ )
916
+ def forward(
917
+ self,
918
+ input_ids: Optional[torch.LongTensor] = None,
919
+ attention_mask: Optional[torch.FloatTensor] = None,
920
+ token_lang_ids: Optional[torch.Tensor] = None,
921
+ token_script_ids: Optional[torch.Tensor] = None,
922
+ position_ids: Optional[torch.LongTensor] = None,
923
+ head_mask: Optional[torch.FloatTensor] = None,
924
+ inputs_embeds: Optional[torch.FloatTensor] = None,
925
+ start_positions: Optional[torch.LongTensor] = None,
926
+ end_positions: Optional[torch.LongTensor] = None,
927
+ output_attentions: Optional[bool] = None,
928
+ output_hidden_states: Optional[bool] = None,
929
+ return_dict: Optional[bool] = None,
930
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
931
+ r"""
932
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
933
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
934
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
935
+ are not taken into account for computing the loss.
936
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
937
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
938
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
939
+ are not taken into account for computing the loss.
940
+ """
941
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
942
+
943
+ outputs = self.roberta(
944
+ input_ids,
945
+ attention_mask=attention_mask,
946
+ token_lang_ids=token_lang_ids,
947
+ token_script_ids=token_script_ids,
948
+ position_ids=position_ids,
949
+ head_mask=head_mask,
950
+ inputs_embeds=inputs_embeds,
951
+ output_attentions=output_attentions,
952
+ output_hidden_states=output_hidden_states,
953
+ return_dict=return_dict,
954
+ )
955
+
956
+ sequence_output = outputs[0]
957
+
958
+ logits = self.qa_outputs(sequence_output)
959
+ start_logits, end_logits = logits.split(1, dim=-1)
960
+ start_logits = start_logits.squeeze(-1).contiguous()
961
+ end_logits = end_logits.squeeze(-1).contiguous()
962
+
963
+ total_loss = None
964
+ if start_positions is not None and end_positions is not None:
965
+ # If we are on multi-GPU, the start/end positions may carry an extra dimension; squeeze it
966
+ if len(start_positions.size()) > 1:
967
+ start_positions = start_positions.squeeze(-1)
968
+ if len(end_positions.size()) > 1:
969
+ end_positions = end_positions.squeeze(-1)
970
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
971
+ ignored_index = start_logits.size(1)
972
+ start_positions = start_positions.clamp(0, ignored_index)
973
+ end_positions = end_positions.clamp(0, ignored_index)
974
+
975
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
976
+ start_loss = loss_fct(start_logits, start_positions)
977
+ end_loss = loss_fct(end_logits, end_positions)
978
+ total_loss = (start_loss + end_loss) / 2
979
+
980
+ if not return_dict:
981
+ output = (start_logits, end_logits) + outputs[2:]
982
+ return ((total_loss,) + output) if total_loss is not None else output
983
+
984
+ return QuestionAnsweringModelOutput(
985
+ loss=total_loss,
986
+ start_logits=start_logits,
987
+ end_logits=end_logits,
988
+ hidden_states=outputs.hidden_states,
989
+ attentions=outputs.attentions,
990
+ )
991
+
992
+
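# Illustrative sketch (not part of the uploaded module) of the span loss computed by the
# question-answering head above: the 2-channel logits are split into start/end scores and
# out-of-range gold positions are clamped to an index that the loss then ignores.
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len = 2, 8
logits = torch.randn(batch_size, seq_len, 2)                # stand-in for qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()         # (batch, seq_len)
end_logits = end_logits.squeeze(-1).contiguous()

start_positions = torch.tensor([3, 20])                      # 20 is outside the sequence on purpose
end_positions = torch.tensor([5, 20])
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)    # clamped to 8, then ignored by the loss
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(float(total_loss))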
993
+ # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
994
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
995
+ """
996
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
997
+ are ignored. This is modified from fairseq's `utils.make_positions`.
998
+
999
+ Args:
1000
+ input_ids: torch.Tensor
1001
+
1002
+ Returns: torch.Tensor
1003
+ """
1004
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
1005
+ mask = input_ids.ne(padding_idx).int()
1006
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
1007
+ return incremental_indices.long() + padding_idx
1008
+
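# Illustrative worked example (not part of the uploaded module) for the function above,
# with XLM-R's padding_idx = 1 and past_key_values_length = 0: non-padding tokens get
# positions padding_idx+1, padding_idx+2, ... while padding tokens stay at padding_idx.
import torch

input_ids = torch.tensor([[0, 52, 31, 2, 1, 1]])             # last two tokens are <pad>
padding_idx = 1
mask = input_ids.ne(padding_idx).int()
incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
position_ids = incremental_indices.long() + padding_idx
print(position_ids)                                          # tensor([[2, 3, 4, 5, 1, 1]])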