crumb committed on
Commit
02b8d72
·
1 Parent(s): 4c8451e

Upload 2 files

Browse files
Files changed (2) hide show
  1. configuration_gpt2l.py +273 -0
  2. modeling_gpt2l.py +974 -0
configuration_gpt2l.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ OpenAI GPT-2 configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Any, List, Mapping, Optional
19
+
20
+ from transformers import PreTrainedTokenizer, TensorType, is_torch_available
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from transformers.onnx import OnnxConfigWithPast, PatchingSpec
23
+ from transformers.utils import logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
# Maps each stock GPT-2 checkpoint name to the URL of its hosted `config.json`.
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json",
    "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json",
    "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json",
    "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json",
    "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json",
}
35
+
36
+
37
class GPT2LConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a `GPT2LModel`. It is used to instantiate a GPT-2L
    model according to the specified arguments, defining the model architecture. The default values mirror those of
    the GPT-2 [gpt2](https://huggingface.co/gpt2) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling the model.
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability applied to the attention output projection and the feed-forward output.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the normal distribution used to initialize all weight matrices.
        summary_type (`string`, *optional*, defaults to `"cls_index"`):
            Argument used when doing sequence summary.

            Has to be one of the following options:

                - `"last"`: Take the last token hidden state (like XLNet).
                - `"first"`: Take the first token hidden state (like BERT).
                - `"mean"`: Take the mean of all tokens hidden states.
                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
                - `"attn"`: Not implemented now, use multi-head attention.
        summary_use_proj (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary.

            Whether or not to add a projection after the vector extraction.
        summary_activation (`str`, *optional*):
            Argument used when doing sequence summary. Used for the multiple choice head.

            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
            Argument used when doing sequence summary.

            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
        summary_first_dropout (`float`, *optional*, defaults to 0.1):
            Argument used when doing sequence summary.

            The dropout ratio to be used after the projection and activation.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by the square root of the per-head feature size.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.
        n_ctx (`int`, *optional*, defaults to `None`):
            Side length of the causal attention mask; `GPT2LModel` reads it as `config.n_ctx`. `None` falls back to
            `n_positions`.

    Example:

    ```python
    >>> # Initializing a GPT2L configuration
    >>> configuration = GPT2LConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = GPT2LModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gpt2l"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets generic code use the library-wide attribute names for the GPT-2 style fields.
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        summary_type="cls_index",
        summary_use_proj=True,
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        n_ctx=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        # The model builds its causal mask from `config.n_ctx`; without this attribute a
        # default-constructed config cannot instantiate the model.
        self.n_ctx = n_positions if n_ctx is None else n_ctx
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
193
+
194
+
195
class GPT2LOnnxConfig(OnnxConfigWithPast):
    """ONNX export description for the GPT-2L model: input axes, dummy inputs and opset."""

    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
        use_past: bool = False,
    ):
        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
        # Fall back to token id 0 when the config carries no (truthy) padding id.
        # TODO: how to do that better?
        pad_id = getattr(self._config, "pad_token_id", None)
        if not pad_id:
            self._config.pad_token_id = 0

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to its dynamic-axis names."""
        axes = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
        mask_sequence_axis = "sequence"
        if self.use_past:
            # Past key/value entries are registered before the attention mask.
            self.fill_with_past_key_values_(axes, direction="inputs")
            mask_sequence_axis = "past_sequence + sequence"
        axes["attention_mask"] = {0: "batch", 1: mask_sequence_axis}
        return axes

    @property
    def num_layers(self) -> int:
        """Number of layers, read from the config's `n_layer`."""
        return self._config.n_layer

    @property
    def num_attention_heads(self) -> int:
        """Number of attention heads, read from the config's `n_head`."""
        return self._config.n_head

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs ordered as `input_ids`, optional `past_key_values`, `attention_mask`.

        Raises:
            ValueError: if `use_past` is set and PyTorch is not installed.
        """
        # Deliberately skips OnnxConfigWithPast's own implementation and uses its parent's.
        base_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        # We need to order the inputs in the way they appear in the forward()
        result = OrderedDict({"input_ids": base_inputs["input_ids"]})
        attention_mask = base_inputs["attention_mask"]

        if not self.use_past:
            result["attention_mask"] = attention_mask
            return result

        if not is_torch_available():
            raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
        import torch

        batch, seqlen = base_inputs["input_ids"].shape
        # Not using the same length for past_key_values
        past_len = seqlen + 2
        head_dim = self._config.hidden_size // self.num_attention_heads
        kv_shape = (batch, self.num_attention_heads, past_len, head_dim)
        result["past_key_values"] = [
            (torch.zeros(kv_shape), torch.zeros(kv_shape)) for _ in range(self.num_layers)
        ]

        # Extend the mask with ones covering the dummy past positions.
        past_mask = torch.ones(batch, past_len, dtype=attention_mask.dtype)
        result["attention_mask"] = torch.cat([attention_mask, past_mask], dim=1)
        return result

    @property
    def default_onnx_opset(self) -> int:
        """ONNX opset version used for export."""
        return 13
modeling_gpt2l.py ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
2
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch OpenAI GPT-2 model."""
16
+
17
+ import os
18
+ import warnings
19
+ from dataclasses import dataclass
20
+ from typing import List, Optional, Tuple
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ from torch.nn import CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN
27
+ from .configuration_gpt2l import GPT2LConfig
28
+ from transformers.file_utils import (
29
+ ModelOutput,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ )
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutputWithPastAndCrossAttentions,
35
+ # CausalLMOutputWithPastAndCrossAttentions,
36
+ CausalLMOutputWithPast,
37
+ SequenceClassifierOutputWithPast,
38
+ )
39
+ from transformers.modeling_utils import (
40
+ Conv1D,
41
+ PreTrainedModel,
42
+ SequenceSummary,
43
+ find_pruneable_heads_and_indices,
44
+ prune_conv1d_layer,
45
+ )
46
+ from transformers.utils import logging
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+ _CONFIG_FOR_DOC = "GPT2LConfig"
52
+ _TOKENIZER_FOR_DOC = "GPT2Tokenizer"
53
+
54
# Names of the stock GPT-2 checkpoints.
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
]
62
+
63
+
64
+
65
class Attention(nn.Module):
    """
    Multi-head attention whose projections are `nn.Linear` layers.

    In self-attention mode a single `c_attn` projection produces query, key and value and a causal mask is applied.
    In cross-attention mode (`is_cross_attention=True`) the query comes from `q_attn`, key/value come from `c_attn`
    applied to the encoder states, and no causal mask is applied.

    Args:
        nx: feature size of the input hidden states (also used as the attention state size).
        n_ctx: side length of the square causal mask buffer.
        config: object providing `n_head`, `attn_pdrop` and `resid_pdrop`.
        scale: when true, attention scores are divided by the square root of the per-head feature size.
        is_cross_attention: build the layer for cross-attention instead of causal self-attention.
    """

    def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
        super().__init__()

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        # Lower-triangular mask, shaped (1, 1, n_ctx, n_ctx) so it broadcasts over batch and heads.
        self.register_buffer(
            "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
        )
        # Score written into positions hidden by the causal mask, before the softmax.
        self.register_buffer("masked_bias", torch.tensor(-1e4))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.is_cross_attention = is_cross_attention
        if self.is_cross_attention:
            self.c_attn = nn.Linear(nx, 2 * n_state)
            self.q_attn = nn.Linear(nx, n_state)
        else:
            self.c_attn = nn.Linear(nx, 3 * n_state)
        # Consumes the merged heads (n_state features) and maps back to nx; the two sizes are equal here.
        self.c_proj = nn.Linear(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the projection layers and update the bookkeeping."""
        if len(heads) == 0:
            return
        # The projections are nn.Linear (weight is (out, in)), so the Conv1D pruning helper would
        # cut the wrong axis and replace the layer with a Conv1D. Use the Linear variant instead.
        from transformers.modeling_utils import prune_linear_layer

        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
        )
        # c_attn packs (query, key, value) for self-attention but only (key, value) for cross-attention.
        n_chunks = 2 if self.is_cross_attention else 3
        index_attn = torch.cat([index + chunk * self.split_size for chunk in range(n_chunks)])

        # Prune linear layers: output features of the input projections, input features of c_proj.
        self.c_attn = prune_linear_layer(self.c_attn, index_attn, dim=0)
        if self.is_cross_attention:
            self.q_attn = prune_linear_layer(self.q_attn, index, dim=0)
        self.c_proj = prune_linear_layer(self.c_proj, index, dim=1)

        # Update hyper params
        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
        self.n_head = self.n_head - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
        """
        Compute attention from already-split heads.

        `k` is expected in the transposed layout produced by `split_heads(..., k=True)`, so `q @ k` yields the
        score matrix directly. Returns `[context]`, or `[context, weights]` when `output_attentions` is true.
        """
        w = torch.matmul(q, k)
        if self.scale:
            w = w / (float(v.size(-1)) ** 0.5)
        nd, ns = w.size(-2), w.size(-1)

        if not self.is_cross_attention:
            # Only the "normal" (self) attention layer applies the causal mask. The slice keeps the
            # last `nd` query rows so that cached keys from earlier steps stay visible.
            mask = self.bias[:, :, ns - nd : ns, :ns]
            w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))

        if attention_mask is not None:
            # The mask is additive: it is summed into the raw scores.
            w = w + attention_mask

        w = torch.softmax(w, dim=-1)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [torch.matmul(w, v)]
        if output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        """Inverse of `split_heads`: (batch, head, seq, head_features) -> (batch, seq, head * head_features)."""
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        """Split the last dimension into `n_head` heads; keys (`k=True`) are returned pre-transposed."""
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Run attention over `hidden_states`.

        When `encoder_hidden_states` is given the layer acts as cross-attention and `encoder_attention_mask`
        replaces `attention_mask`. `layer_past` is a previously returned `present` whose keys/values are
        prepended to the current ones.

        Returns:
            `[a, present]` plus the attention weights when `output_attentions` is true. `present` is the stacked
            (key, value) tensor when `use_cache` is `True`, otherwise `(None,)`.
        """
        if encoder_hidden_states is not None:
            assert hasattr(
                self, "q_attn"
            ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            # Cached keys are stored un-transposed (see below), so transpose them back before concatenating.
            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            # Transpose the key so key and value have the same shape and can be stacked.
            present = torch.stack((key.transpose(-2, -1), value))
        else:
            present = (None,)

        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)
193
+
194
+
195
class MLP(nn.Module):
    """
    Position-wise feed-forward block: linear -> activation -> linear -> dropout.

    Args:
        n_state: width of the inner (expanded) layer.
        config: object providing `n_embd`, `activation_function` and `resid_pdrop`.
    """

    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        embed_dim = config.n_embd
        self.c_fc = nn.Linear(embed_dim, n_state)
        self.c_proj = nn.Linear(n_state, embed_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        """Expand `x` to the inner width, apply the activation, project back and apply dropout."""
        expanded = self.c_fc(x)
        projected = self.c_proj(self.act(expanded))
        return self.dropout(projected)
210
+
211
+
212
class Block(nn.Module):
    """
    One transformer layer: layer-normed self-attention, optional layer-normed cross-attention and a layer-normed MLP,
    each wrapped in a residual connection.

    Args:
        n_ctx: side length of the causal mask handed to the attention layers.
        config: model configuration; `add_cross_attention` decides whether the cross-attention sub-layer exists.
        scale: forwarded to the attention layers.
    """

    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        embed_dim = config.n_embd
        # The feed-forward width defaults to four times the hidden size.
        mlp_width = 4 * embed_dim if config.n_inner is None else config.n_inner
        self.ln_1 = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
        self.attn = Attention(embed_dim, n_ctx, config, scale)
        self.ln_2 = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
        if config.add_cross_attention:
            self.crossattention = Attention(embed_dim, n_ctx, config, scale, is_cross_attention=True)
            self.ln_cross_attn = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
        self.mlp = MLP(mlp_width, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Apply the layer to `hidden_states`.

        Returns:
            A list `[hidden_states, present, (attentions, cross_attentions)]`; the attention entries are present
            only when the attention layers return them.
        """
        # Self-attention returns: a, present, (attentions)
        self_attn_result = self.attn(
            self.ln_1(hidden_states),
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        extras = self_attn_result[1:]
        # First residual connection.
        hidden_states = self_attn_result[0] + hidden_states

        if encoder_hidden_states is not None:
            # Cross-attention sub-layer over the encoder states.
            assert hasattr(
                self, "crossattention"
            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
            cross_result = self.crossattention(
                self.ln_cross_attn(hidden_states),
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            # Second residual connection.
            hidden_states = hidden_states + cross_result[0]
            # Skip the cross-attention `present`; keep its weights if they were requested.
            extras = extras + cross_result[2:]

        # Feed-forward sub-layer with its residual connection.
        hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))

        return [hidden_states] + extras  # hidden_states, present, (attentions, cross_attentions)
273
+
274
+
275
class GPT2LPreTrainedModel(PreTrainedModel):
    """
    Abstract base for the GPT-2L models: supplies weight initialization and the hooks `PreTrainedModel` uses for
    downloading and loading pretrained checkpoints.
    """

    config_class = GPT2LConfig
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module."""
        if isinstance(module, nn.LayerNorm):
            # Layer norm starts as the identity transform.
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
            return

        if not isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            return

        # Slightly different from the TF version which uses truncated_normal for initialization
        # cf https://github.com/pytorch/pytorch/pull/5617
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        has_bias = isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None
        if has_bias:
            module.bias.data.zero_()
298
+
299
+
300
# ModelOutput subclasses declare their fields as a dataclass; without the decorator the
# annotated attributes below are plain class attributes and are not registered as fields.
@dataclass
class GPT2LDoubleHeadsModelOutput(ModelOutput):
    """
    Output container for a model with both a language modeling head and a multiple choice classification head.

    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
            Multiple choice classification loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            :obj:`past_key_values` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    mc_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    mc_logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
339
+
340
+
341
+
342
+ GPT2L_START_DOCSTRING = r"""
343
+
344
+ This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
345
+ methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
346
+ pruning heads etc.)
347
+
348
+ This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
349
+ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
350
+ general usage and behavior.
351
+
352
+ Parameters:
353
+ config (:class:`~transformers.GPT2LConfig`): Model configuration class with all the parameters of the model.
354
+ Initializing with a config file does not load the weights associated with the model, only the
355
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
356
+ weights.
357
+ """
358
+
359
+ GPT2_INPUTS_DOCSTRING = r"""
360
+ Args:
361
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
362
+ :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
363
+ ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
364
+ sequence tokens in the vocabulary.
365
+
366
+ If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
367
+ passed as ``input_ids``.
368
+
369
+ Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
370
+ :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
371
+ details.
372
+
373
+ `What are input IDs? <../glossary.html#input-ids>`__
374
+ past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
375
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
376
+ :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
377
+ have their past given to this model should not be passed as ``input_ids`` as they have already been
378
+ computed.
379
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
380
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
381
+
382
+ - 1 for tokens that are **not masked**,
383
+ - 0 for tokens that are **masked**.
384
+
385
+ `What are attention masks? <../glossary.html#attention-mask>`__
386
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
387
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
388
+ 1]``:
389
+
390
+ - 0 corresponds to a `sentence A` token,
391
+ - 1 corresponds to a `sentence B` token.
392
+
393
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
394
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
395
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
396
+ config.max_position_embeddings - 1]``.
397
+
398
+ `What are position IDs? <../glossary.html#position-ids>`_
399
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
400
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
401
+
402
+ - 1 indicates the head is **not masked**,
403
+ - 0 indicates the head is **masked**.
404
+
405
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
406
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
407
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
408
+ vectors than the model's internal embedding lookup matrix.
409
+
410
+ If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
411
+ :obj:`past_key_values`).
412
+ use_cache (:obj:`bool`, `optional`):
413
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
414
+ decoding (see :obj:`past_key_values`).
415
+ output_attentions (:obj:`bool`, `optional`):
416
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
417
+ tensors for more detail.
418
+ output_hidden_states (:obj:`bool`, `optional`):
419
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
420
+ more detail.
421
+ return_dict (:obj:`bool`, `optional`):
422
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
423
+ """
424
+
425
+
426
class GPT2LModel(GPT2LPreTrainedModel):
    """Bare GPT-2L transformer.

    Sums token (``wte``) and position (``wpe``) embeddings, applies dropout, runs the result through
    ``config.n_layer`` ``Block`` modules and finishes with a LayerNorm (``ln_f``).
    """

    def __init__(self, config):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the token embedding module (``wte``)."""
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        """Replace the token embedding module (``wte``) with ``new_embeddings``."""
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        """Embed the inputs, run every transformer block and apply the final LayerNorm.

        Exactly one of ``input_ids`` and ``inputs_embeds`` must be provided. ``use_cache``, ``output_attentions``,
        ``output_hidden_states`` and ``return_dict`` fall back to the corresponding ``self.config`` values when
        left as ``None``. The deprecated ``past`` keyword is still accepted (with a ``FutureWarning``) as an alias
        of ``past_key_values``.

        Returns:
            When ``return_dict`` is falsy, a tuple holding the non-``None`` entries among (last hidden state,
            presents, all hidden states, all self-attentions). Otherwise a
            ``BaseModelOutputWithPastAndCrossAttentions``.

        Raises:
            ValueError: if both, or neither, of ``input_ids`` and ``inputs_embeds`` are given.
            AssertionError: if unexpected keyword arguments are passed.
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Resolve the device once, up front. It is needed both for the default position ids and for the default
        # encoder attention mask below; assigning it only in the `position_ids is None` branch left it unbound
        # whenever the caller supplied explicit position ids together with encoder hidden states.
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = [None] * len(self.h)
        else:
            # Length of the cached sequence is read from the first layer's first cached tensor.
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            # Default positions continue right after the cached prefix.
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # Attention mask.
        if attention_mask is not None:
            assert batch_size > 0, "batch_size has to be defined and > 0"
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                # No mask given: attend to every encoder position.
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            # Token type ids are embedded with the same table as the input tokens.
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

            if getattr(self.config, "gradient_checkpointing", False):

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # checkpointing only works with tuple returns, not with lists
                        return tuple(output for output in module(*inputs, use_cache, output_attentions))

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    layer_past,
                    attention_mask,
                    head_mask[i],
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states, present = outputs[:2]
            if use_cache is True:
                presents = presents + (present,)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[3],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            # NOTE: cross-attention maps are only exposed through the ModelOutput below, not through this tuple.
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
623
+
624
+
625
class GPT2LLMHeadModel(GPT2LPreTrainedModel):
    """GPT-2L transformer followed by a bias-free linear language-modeling head."""

    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2LModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        With a non-empty ``past`` only the last token (and last position) is kept. Position ids are derived from
        ``attention_mask`` when a mask is given and no ``position_ids`` were passed; in every other case
        ``position_ids`` is returned as ``None``.
        """
        # Once a cache exists, only the newest token needs to be fed in.
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        mask = kwargs.get("attention_mask", None)
        positions = None
        if mask is not None and kwargs.get("position_ids", None) is None:
            # Build position ids on the fly for batch generation.
            positions = mask.long().cumsum(-1) - 1
            positions.masked_fill_(mask == 0, 1)
            if past:
                positions = positions[:, -1].unsqueeze(-1)

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": positions,
            "attention_mask": mask,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. The labels **are shifted** inside the model, so ``labels = input_ids``
            is a valid choice. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. Labels set to
            ``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = self.lm_head(transformer_outputs[0])

        loss = None
        if labels is not None:
            # Position t predicts token t + 1: drop the last logit and the first label, then flatten.
            predictions = lm_logits[..., :-1, :].contiguous()
            targets = labels[..., 1:].contiguous()
            criterion = CrossEntropyLoss()
            loss = criterion(predictions.view(-1, predictions.size(-1)), targets.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            if loss is None:
                return output
            return (loss,) + output

        # Cross-attention maps are not forwarded to this output object.
        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
736
+
737
class GPT2LDoubleHeadsModel(GPT2LPreTrainedModel):
    """GPT-2L transformer with a language-modeling head and a multiple-choice head (``SequenceSummary``)."""

    def __init__(self, config):
        super().__init__(config)
        # The multiple-choice head is built from a config with a single label.
        config.num_labels = 1
        self.transformer = GPT2LModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        """Assemble the keyword arguments for one generation step.

        With a non-empty ``past`` only the last token (and last position) is kept. ``attention_mask`` is passed
        through so padded batches are masked during generation, and position ids are derived from it when a mask
        is given and no ``position_ids`` were passed; otherwise ``position_ids`` is ``None``.
        """
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        labels=None,
        mc_labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
            Index of the classification token in each input sequence. Selected in the range ``[0,
            input_ids.size(-1) - 1]``.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size - 1]``. All labels
            set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ...,
            config.vocab_size - 1]``.
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices - 1]`` where `num_choices` is the size of the second dimension of the input tensors. (see
            `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Return:

        Example::

            >>> import torch
            >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

            >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)

            >>> # Add a [CLS] to the vocabulary (we should train it also!)
            >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

            >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size

            >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
            >>> encoded_choices = [tokenizer.encode(s) for s in choices]
            >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

            >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
            >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

            >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
            >>> lm_logits = outputs.logits
            >>> mc_logits = outputs.mc_logits

        """
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)
        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)

        mc_loss = None
        if mc_labels is not None:
            loss_fct = CrossEntropyLoss()
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
        lm_loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            # Tuple layout: [lm_loss,] [mc_loss,] lm_logits, mc_logits, then the remaining transformer outputs.
            output = (lm_logits, mc_logits) + transformer_outputs[1:]
            if mc_loss is not None:
                output = (mc_loss,) + output
            return ((lm_loss,) + output) if lm_loss is not None else output

        return GPT2DoubleHeadsModelOutput(
            loss=lm_loss,
            mc_loss=mc_loss,
            logits=lm_logits,
            mc_logits=mc_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
881
+
882
+
883
class GPT2LForSequenceClassification(GPT2LPreTrainedModel):
    """GPT-2L transformer with a bias-free linear classification head read at one position per sequence."""

    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2LModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. With :obj:`config.num_labels == 1` a regression loss (Mean-Square loss) is
            computed; with :obj:`config.num_labels > 1` a classification loss (Cross-Entropy) is computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.score(transformer_outputs[0])

        # Batch size comes from whichever input representation was supplied.
        source = input_ids if input_ids is not None else inputs_embeds
        batch_size, _ = source.shape[:2]

        pad_token_id = self.config.pad_token_id
        assert (
            pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."

        if pad_token_id is not None and input_ids is not None:
            # Index to pool = (number of non-padding tokens) - 1, per row.
            sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
        else:
            # Fall back to the last position of every row.
            sequence_lengths = -1
            if pad_token_id is not None:
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # Single label: regression.
                criterion = MSELoss()
                loss = criterion(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            if loss is None:
                return output
            return (loss,) + output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )