sdadas commited on
Commit
d178fcb
1 Parent(s): 5731901

Upload 9 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<mask>": 128000
3
+ }
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "auto_map": {
7
+ "AutoModel": "modeling_roberta.RobertaModel",
8
+ "AutoModelForSequenceClassification": "modeling_roberta.RobertaForSequenceClassification"
9
+ },
10
+ "bos_token_id": 0,
11
+ "classifier_dropout": null,
12
+ "eos_token_id": 2,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0.1,
15
+ "hidden_size": 1024,
16
+ "id2label": {
17
+ "0": "LABEL_0"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 4096,
21
+ "label2id": {
22
+ "LABEL_0": 0
23
+ },
24
+ "layer_norm_eps": 1e-05,
25
+ "max_position_embeddings": 514,
26
+ "model_type": "roberta",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 1,
30
+ "position_embedding_type": "absolute",
31
+ "torch_dtype": "bfloat16",
32
+ "transformers_version": "4.44.2",
33
+ "type_vocab_size": 1,
34
+ "use_cache": true,
35
+ "vocab_size": 128001
36
+ }
configuration_roberta.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ RoBERTa configuration"""
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from transformers import PretrainedConfig
21
+ from transformers.onnx import OnnxConfig
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
+ class RobertaConfig(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
31
+ used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
32
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
33
+ [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) architecture.
34
+
35
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36
+ documentation from [`PretrainedConfig`] for more information.
37
+
38
+
39
+ Args:
40
+ vocab_size (`int`, *optional*, defaults to 50265):
41
+ Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the
42
+ `inputs_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
43
+ hidden_size (`int`, *optional*, defaults to 768):
44
+ Dimensionality of the encoder layers and the pooler layer.
45
+ num_hidden_layers (`int`, *optional*, defaults to 12):
46
+ Number of hidden layers in the Transformer encoder.
47
+ num_attention_heads (`int`, *optional*, defaults to 12):
48
+ Number of attention heads for each attention layer in the Transformer encoder.
49
+ intermediate_size (`int`, *optional*, defaults to 3072):
50
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
51
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
52
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
53
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
54
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
55
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
56
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
57
+ The dropout ratio for the attention probabilities.
58
+ max_position_embeddings (`int`, *optional*, defaults to 512):
59
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
60
+ just in case (e.g., 512 or 1024 or 2048).
61
+ type_vocab_size (`int`, *optional*, defaults to 2):
62
+ The vocabulary size of the `token_type_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
66
+ The epsilon used by the layer normalization layers.
67
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
68
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
69
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
70
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
71
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
72
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
73
+ is_decoder (`bool`, *optional*, defaults to `False`):
74
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
75
+ use_cache (`bool`, *optional*, defaults to `True`):
76
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
77
+ relevant if `config.is_decoder=True`.
78
+ classifier_dropout (`float`, *optional*):
79
+ The dropout ratio for the classification head.
80
+
81
+ Examples:
82
+
83
+ ```python
84
+ >>> from transformers import RobertaConfig, RobertaModel
85
+
86
+ >>> # Initializing a RoBERTa configuration
87
+ >>> configuration = RobertaConfig()
88
+
89
+ >>> # Initializing a model (with random weights) from the configuration
90
+ >>> model = RobertaModel(configuration)
91
+
92
+ >>> # Accessing the model configuration
93
+ >>> configuration = model.config
94
+ ```"""
95
+
96
+ model_type = "roberta"
97
+
98
+ def __init__(
99
+ self,
100
+ vocab_size=50265,
101
+ hidden_size=768,
102
+ num_hidden_layers=12,
103
+ num_attention_heads=12,
104
+ intermediate_size=3072,
105
+ hidden_act="gelu",
106
+ hidden_dropout_prob=0.1,
107
+ attention_probs_dropout_prob=0.1,
108
+ max_position_embeddings=512,
109
+ type_vocab_size=2,
110
+ initializer_range=0.02,
111
+ layer_norm_eps=1e-12,
112
+ pad_token_id=1,
113
+ bos_token_id=0,
114
+ eos_token_id=2,
115
+ position_embedding_type="absolute",
116
+ use_cache=True,
117
+ classifier_dropout=None,
118
+ **kwargs,
119
+ ):
120
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
121
+
122
+ self.vocab_size = vocab_size
123
+ self.hidden_size = hidden_size
124
+ self.num_hidden_layers = num_hidden_layers
125
+ self.num_attention_heads = num_attention_heads
126
+ self.hidden_act = hidden_act
127
+ self.intermediate_size = intermediate_size
128
+ self.hidden_dropout_prob = hidden_dropout_prob
129
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
130
+ self.max_position_embeddings = max_position_embeddings
131
+ self.type_vocab_size = type_vocab_size
132
+ self.initializer_range = initializer_range
133
+ self.layer_norm_eps = layer_norm_eps
134
+ self.position_embedding_type = position_embedding_type
135
+ self.use_cache = use_cache
136
+ self.classifier_dropout = classifier_dropout
137
+
138
+
139
+ class RobertaOnnxConfig(OnnxConfig):
140
+ @property
141
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
142
+ if self.task == "multiple-choice":
143
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
144
+ else:
145
+ dynamic_axis = {0: "batch", 1: "sequence"}
146
+ return OrderedDict(
147
+ [
148
+ ("input_ids", dynamic_axis),
149
+ ("attention_mask", dynamic_axis),
150
+ ]
151
+ )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69335a3fb5b5ab2a736849a1a3ddc03a09cee6d966ccded6751739bebdd5adb8
3
+ size 869973234
modeling_roberta.py ADDED
@@ -0,0 +1,1941 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch RoBERTa model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from transformers.activations import ACT2FN, gelu
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutputWithPastAndCrossAttentions,
30
+ BaseModelOutputWithPoolingAndCrossAttentions,
31
+ CausalLMOutputWithCrossAttentions,
32
+ MaskedLMOutput,
33
+ MultipleChoiceModelOutput,
34
+ QuestionAnsweringModelOutput,
35
+ SequenceClassifierOutput,
36
+ TokenClassifierOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
40
+ from transformers.utils import (
41
+ add_code_sample_docstrings,
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ is_flash_attn_2_available,
45
+ is_flash_attn_greater_or_equal_2_10,
46
+ logging,
47
+ replace_return_docstrings,
48
+ )
49
+ from .configuration_roberta import RobertaConfig
50
+
51
+
52
+ if is_flash_attn_2_available():
53
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
54
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
60
+ _CONFIG_FOR_DOC = "RobertaConfig"
61
+
62
+
63
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
64
+ def _get_unpad_data(attention_mask):
65
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
66
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
67
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
68
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
69
+ return (
70
+ indices,
71
+ cu_seqlens,
72
+ max_seqlen_in_batch,
73
+ )
74
+
75
+
76
+ class RobertaEmbeddings(nn.Module):
77
+ """
78
+ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
79
+ """
80
+
81
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
82
+ def __init__(self, config):
83
+ super().__init__()
84
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
85
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
86
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
87
+
88
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
89
+ # any TensorFlow checkpoint file
90
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
91
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
92
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
93
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
94
+ self.register_buffer(
95
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
96
+ )
97
+ self.register_buffer(
98
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
99
+ )
100
+
101
+ # End copy
102
+ self.padding_idx = config.pad_token_id
103
+ self.position_embeddings = nn.Embedding(
104
+ config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
105
+ )
106
+
107
+ def forward(
108
+ self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
109
+ ):
110
+ if position_ids is None:
111
+ if input_ids is not None:
112
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
113
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
114
+ else:
115
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
116
+
117
+ if input_ids is not None:
118
+ input_shape = input_ids.size()
119
+ else:
120
+ input_shape = inputs_embeds.size()[:-1]
121
+
122
+ seq_length = input_shape[1]
123
+
124
+ # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
125
+ # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
126
+ # issue #5664
127
+ if token_type_ids is None:
128
+ if hasattr(self, "token_type_ids"):
129
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
130
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
131
+ token_type_ids = buffered_token_type_ids_expanded
132
+ else:
133
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
134
+
135
+ if inputs_embeds is None:
136
+ inputs_embeds = self.word_embeddings(input_ids)
137
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
138
+
139
+ embeddings = inputs_embeds + token_type_embeddings
140
+ if self.position_embedding_type == "absolute":
141
+ position_embeddings = self.position_embeddings(position_ids)
142
+ embeddings += position_embeddings
143
+ embeddings = self.LayerNorm(embeddings)
144
+ embeddings = self.dropout(embeddings)
145
+ return embeddings
146
+
147
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
148
+ """
149
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
150
+
151
+ Args:
152
+ inputs_embeds: torch.Tensor
153
+
154
+ Returns: torch.Tensor
155
+ """
156
+ input_shape = inputs_embeds.size()[:-1]
157
+ sequence_length = input_shape[1]
158
+
159
+ position_ids = torch.arange(
160
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
161
+ )
162
+ return position_ids.unsqueeze(0).expand(input_shape)
163
+
164
+
165
+ # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta
166
+ class RobertaSelfAttention(nn.Module):
167
+ def __init__(self, config, position_embedding_type=None):
168
+ super().__init__()
169
+ self.config = config
170
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
171
+ raise ValueError(
172
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
173
+ f"heads ({config.num_attention_heads})"
174
+ )
175
+
176
+ self.num_attention_heads = config.num_attention_heads
177
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
178
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
179
+
180
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
181
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
182
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
183
+
184
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
185
+ self.position_embedding_type = position_embedding_type or getattr(
186
+ config, "position_embedding_type", "absolute"
187
+ )
188
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
189
+ self.max_position_embeddings = config.max_position_embeddings
190
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
191
+
192
+ self.is_decoder = config.is_decoder
193
+
194
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
195
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
196
+ x = x.view(new_x_shape)
197
+ return x.permute(0, 2, 1, 3)
198
+
199
+ def forward(
200
+ self,
201
+ hidden_states: torch.Tensor,
202
+ attention_mask: Optional[torch.FloatTensor] = None,
203
+ head_mask: Optional[torch.FloatTensor] = None,
204
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
205
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
206
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
207
+ output_attentions: Optional[bool] = False,
208
+ ) -> Tuple[torch.Tensor]:
209
+ mixed_query_layer = self.query(hidden_states)
210
+
211
+ # If this is instantiated as a cross-attention module, the keys
212
+ # and values come from an encoder; the attention mask needs to be
213
+ # such that the encoder's padding tokens are not attended to.
214
+ is_cross_attention = encoder_hidden_states is not None
215
+
216
+ if is_cross_attention and past_key_value is not None:
217
+ # reuse k,v, cross_attentions
218
+ key_layer = past_key_value[0]
219
+ value_layer = past_key_value[1]
220
+ attention_mask = encoder_attention_mask
221
+ elif is_cross_attention:
222
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
223
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
224
+ attention_mask = encoder_attention_mask
225
+ elif past_key_value is not None:
226
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
227
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
228
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
229
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
230
+ else:
231
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
232
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
233
+
234
+ query_layer = self.transpose_for_scores(mixed_query_layer)
235
+
236
+ use_cache = past_key_value is not None
237
+ if self.is_decoder:
238
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
239
+ # Further calls to cross_attention layer can then reuse all cross-attention
240
+ # key/value_states (first "if" case)
241
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
242
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
243
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
244
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
245
+ past_key_value = (key_layer, value_layer)
246
+
247
+ # Take the dot product between "query" and "key" to get the raw attention scores.
248
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
249
+
250
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
251
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
252
+ if use_cache:
253
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
254
+ -1, 1
255
+ )
256
+ else:
257
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
258
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
259
+ distance = position_ids_l - position_ids_r
260
+
261
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
262
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
263
+
264
+ if self.position_embedding_type == "relative_key":
265
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
266
+ attention_scores = attention_scores + relative_position_scores
267
+ elif self.position_embedding_type == "relative_key_query":
268
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
269
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
270
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
271
+
272
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
273
+ if attention_mask is not None:
274
+ # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
275
+ attention_scores = attention_scores + attention_mask
276
+
277
+ # Normalize the attention scores to probabilities.
278
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
279
+
280
+ # This is actually dropping out entire tokens to attend to, which might
281
+ # seem a bit unusual, but is taken from the original Transformer paper.
282
+ attention_probs = self.dropout(attention_probs)
283
+
284
+ # Mask heads if we want to
285
+ if head_mask is not None:
286
+ attention_probs = attention_probs * head_mask
287
+
288
+ context_layer = torch.matmul(attention_probs, value_layer)
289
+
290
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
291
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
292
+ context_layer = context_layer.view(new_context_layer_shape)
293
+
294
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
295
+
296
+ if self.is_decoder:
297
+ outputs = outputs + (past_key_value,)
298
+ return outputs
299
+
300
+
301
+ class RobertaFlashAttention2(RobertaSelfAttention):
302
+ def __init__(self, *args, **kwargs):
303
+ super().__init__(*args, **kwargs)
304
+
305
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
306
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
307
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
308
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
309
+
310
+ self.is_causal = False
311
+
312
+ if self.position_embedding_type != "absolute":
313
+ raise ValueError("RobertaFlashAttention2 only supports absolute position embeddings")
314
+
315
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
316
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
317
+ x = x.view(new_x_shape)
318
+ return x
319
+
320
+ def forward(
321
+ self,
322
+ hidden_states: torch.Tensor,
323
+ attention_mask: Optional[torch.FloatTensor] = None,
324
+ head_mask: Optional[torch.FloatTensor] = None,
325
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
326
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
327
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
328
+ output_attentions: Optional[bool] = False,
329
+ ) -> Tuple[torch.Tensor, ...]:
330
+ """
331
+ Parameters:
332
+ query: torch.tensor(bs, seq_length, dim)
333
+ key: torch.tensor(bs, seq_length, dim)
334
+ value: torch.tensor(bs, seq_length, dim)
335
+ mask: torch.tensor(bs, seq_length)
336
+
337
+ Returns:
338
+ weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
339
+ seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
340
+ """
341
+ if output_attentions:
342
+ raise ValueError("RobertaFlashAttention2 attention does not support output_attentions")
343
+ if head_mask is not None:
344
+ raise ValueError("RobertaFlashAttention2 attention does not support head_mask")
345
+
346
+ mixed_query_layer = self.query(hidden_states)
347
+
348
+ # If this is instantiated as a cross-attention module, the keys
349
+ # and values come from an encoder; the attention mask needs to be
350
+ # such that the encoder's padding tokens are not attended to.
351
+ is_cross_attention = encoder_hidden_states is not None
352
+
353
+ if is_cross_attention and past_key_value is not None:
354
+ # reuse k,v, cross_attentions
355
+ key_states = past_key_value[0]
356
+ value_states = past_key_value[1]
357
+ attention_mask = encoder_attention_mask
358
+ elif is_cross_attention:
359
+ key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
360
+ value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
361
+ attention_mask = encoder_attention_mask
362
+ elif past_key_value is not None:
363
+ key_states = self.transpose_for_scores(self.key(hidden_states))
364
+ value_states = self.transpose_for_scores(self.value(hidden_states))
365
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
366
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
367
+ else:
368
+ key_states = self.transpose_for_scores(self.key(hidden_states))
369
+ value_states = self.transpose_for_scores(self.value(hidden_states))
370
+
371
+ # attention_mask is of the "extended attention mask" at this stage, i.e. it's 0 for positions that need attention
372
+ # and the lowest possible value for positions that should be masked. So, an "all attention" mask sums to 0.
373
+ # In that case, we can safely set it to None to avoid unnecessary computation for variable length attention.
374
+ if attention_mask.sum().item() == 0:
375
+ attention_mask = None
376
+ else:
377
+ # Otherwise, we want to undo the "extended attention mask" format, as flash attention doesn't work with it.
378
+ attention_mask = torch.where(attention_mask[:, 0, 0, :] == 0, 1.0, 0.0)
379
+
380
+ query_states = self.transpose_for_scores(mixed_query_layer)
381
+ # At this stage, the key, value and query states all have the shape of
382
+ # batch_size x seq_len x head_dim x hidden_dim
383
+
384
+ if self.is_decoder:
385
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
386
+ # Further calls to cross_attention layer can then reuse all cross-attention
387
+ # key/value_states (first "if" case)
388
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
389
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
390
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
391
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
392
+ past_key_value = (key_states, value_states)
393
+
394
+ seq_len = query_states.shape[1]
395
+
396
+ attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0
397
+
398
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
399
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
400
+ # cast them back in the correct dtype just to be sure everything works as expected.
401
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
402
+ # in fp32.
403
+
404
+ if query_states.dtype == torch.float32:
405
+ if torch.is_autocast_enabled():
406
+ target_dtype = torch.get_autocast_gpu_dtype()
407
+ # Handle the case where the model is quantized
408
+ elif hasattr(self.config, "_pre_quantization_dtype"):
409
+ target_dtype = self.config._pre_quantization_dtype
410
+ else:
411
+ target_dtype = self.q_lin.weight.dtype
412
+
413
+ logger.warning_once(
414
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
415
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
416
+ f" {target_dtype}."
417
+ )
418
+
419
+ query_states = query_states.to(target_dtype)
420
+ key_states = key_states.to(target_dtype)
421
+ value_states = value_states.to(target_dtype)
422
+
423
+ attn_weights = self._flash_attention_forward(
424
+ query_states, key_states, value_states, attention_mask, seq_len, dropout=attn_dropout
425
+ )
426
+
427
+ new_shape = attn_weights.size()[:-2] + (self.all_head_size,)
428
+ attn_output = attn_weights.view(new_shape)
429
+
430
+ outputs = (attn_output,)
431
+
432
+ if self.is_decoder:
433
+ outputs = outputs + (past_key_value,)
434
+ return outputs
435
+
436
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
437
+ def _flash_attention_forward(
438
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
439
+ ):
440
+ """
441
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
442
+ first unpad the input, then computes the attention scores and pad the final attention scores.
443
+
444
+ Args:
445
+ query_states (`torch.Tensor`):
446
+ Input query states to be passed to Flash Attention API
447
+ key_states (`torch.Tensor`):
448
+ Input key states to be passed to Flash Attention API
449
+ value_states (`torch.Tensor`):
450
+ Input value states to be passed to Flash Attention API
451
+ attention_mask (`torch.Tensor`):
452
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
453
+ position of padding tokens and 1 for the position of non-padding tokens.
454
+ dropout (`float`):
455
+ Attention dropout
456
+ softmax_scale (`float`, *optional*):
457
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
458
+ """
459
+ if not self._flash_attn_uses_top_left_mask:
460
+ causal = self.is_causal
461
+ else:
462
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
463
+ causal = self.is_causal and query_length != 1
464
+
465
+ # Contains at least one padding token in the sequence
466
+ if attention_mask is not None:
467
+ batch_size = query_states.shape[0]
468
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
469
+ query_states, key_states, value_states, attention_mask, query_length
470
+ )
471
+
472
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
473
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
474
+
475
+ attn_output_unpad = flash_attn_varlen_func(
476
+ query_states,
477
+ key_states,
478
+ value_states,
479
+ cu_seqlens_q=cu_seqlens_q,
480
+ cu_seqlens_k=cu_seqlens_k,
481
+ max_seqlen_q=max_seqlen_in_batch_q,
482
+ max_seqlen_k=max_seqlen_in_batch_k,
483
+ dropout_p=dropout,
484
+ softmax_scale=softmax_scale,
485
+ causal=causal,
486
+ )
487
+
488
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
489
+ else:
490
+ attn_output = flash_attn_func(
491
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
492
+ )
493
+
494
+ return attn_output
495
+
496
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input with num_heads->num_attention_heads
497
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
498
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
499
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
500
+
501
+ key_layer = index_first_axis(
502
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
503
+ )
504
+ value_layer = index_first_axis(
505
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
506
+ )
507
+ if query_length == kv_seq_len:
508
+ query_layer = index_first_axis(
509
+ query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads, head_dim), indices_k
510
+ )
511
+ cu_seqlens_q = cu_seqlens_k
512
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
513
+ indices_q = indices_k
514
+ elif query_length == 1:
515
+ max_seqlen_in_batch_q = 1
516
+ cu_seqlens_q = torch.arange(
517
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
518
+ ) # There is a memcpy here, that is very bad.
519
+ indices_q = cu_seqlens_q[:-1]
520
+ query_layer = query_layer.squeeze(1)
521
+ else:
522
+ # The -q_len: slice assumes left padding.
523
+ attention_mask = attention_mask[:, -query_length:]
524
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
525
+
526
+ return (
527
+ query_layer,
528
+ key_layer,
529
+ value_layer,
530
+ indices_q,
531
+ (cu_seqlens_q, cu_seqlens_k),
532
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
533
+ )
534
+
535
+
536
+ class RobertaSdpaAttention(RobertaSelfAttention):
537
+ """
538
+ Roberta attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
539
+ `RobertaSelfAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
540
+ SDPA API.
541
+ """
542
+
543
+ def __init__(self, config, position_embedding_type=None):
544
+ super().__init__(config, position_embedding_type)
545
+
546
+ self.is_causal = False
547
+
548
+ if self.position_embedding_type != "absolute":
549
+ raise ValueError("RobertaSdpaAttention only supports absolute position embeddings")
550
+
551
+ # Adapted from LlamaAttention.forward and RobertaFlashAttention2.forward
552
+ def forward(
553
+ self,
554
+ hidden_states: torch.Tensor,
555
+ attention_mask: Optional[torch.FloatTensor] = None,
556
+ head_mask: Optional[torch.FloatTensor] = None,
557
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
558
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
559
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
560
+ output_attentions: Optional[bool] = False,
561
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
562
+ if output_attentions:
563
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
564
+ logger.warning_once(
565
+ "RobertaModel is using RobertaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
566
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
567
+ )
568
+ return super().forward(
569
+ hidden_states=hidden_states,
570
+ attention_mask=attention_mask,
571
+ head_mask=head_mask,
572
+ encoder_hidden_states=encoder_hidden_states,
573
+ encoder_attention_mask=encoder_attention_mask,
574
+ past_key_value=past_key_value,
575
+ output_attentions=output_attentions,
576
+ )
577
+
578
+ mixed_query_layer = self.query(hidden_states)
579
+
580
+ # If this is instantiated as a cross-attention module, the keys
581
+ # and values come from an encoder; the attention mask needs to be
582
+ # such that the encoder's padding tokens are not attended to.
583
+ is_cross_attention = encoder_hidden_states is not None
584
+
585
+ if is_cross_attention and past_key_value is not None:
586
+ # reuse k,v, cross_attentions
587
+ key_states = past_key_value[0]
588
+ value_states = past_key_value[1]
589
+ attention_mask = encoder_attention_mask
590
+ elif is_cross_attention:
591
+ key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
592
+ value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
593
+ attention_mask = encoder_attention_mask
594
+ elif past_key_value is not None:
595
+ key_states = self.transpose_for_scores(self.key(hidden_states))
596
+ value_states = self.transpose_for_scores(self.value(hidden_states))
597
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
598
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
599
+ else:
600
+ key_states = self.transpose_for_scores(self.key(hidden_states))
601
+ value_states = self.transpose_for_scores(self.value(hidden_states))
602
+
603
+ query_states = self.transpose_for_scores(mixed_query_layer)
604
+ # At this stage, the key, value and query states all have the shape of
605
+ # batch_size x head_dim x seq_len x hidden_dim
606
+
607
+ if self.is_decoder:
608
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
609
+ # Further calls to cross_attention layer can then reuse all cross-attention
610
+ # key/value_states (first "if" case)
611
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
612
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
613
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
614
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
615
+ past_key_value = (key_states, value_states)
616
+
617
+ batch_size, _, seq_len, _ = query_states.size()
618
+
619
+ attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0
620
+
621
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
622
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
623
+ if query_states.device.type == "cuda" and attention_mask is not None:
624
+ query_states = query_states.contiguous()
625
+ key_states = key_states.contiguous()
626
+ value_states = value_states.contiguous()
627
+
628
+ # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend, rather
629
+ # relying on the `is_causal` argument.
630
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
631
+ query_states,
632
+ key_states,
633
+ value_states,
634
+ attn_mask=attention_mask,
635
+ dropout_p=attn_dropout,
636
+ is_causal=self.is_causal and attention_mask is None and seq_len > 1,
637
+ )
638
+
639
+ if attn_output.size() != (batch_size, self.num_attention_heads, seq_len, self.attention_head_size):
640
+ raise ValueError(
641
+ f"`attn_output` should be of size {(batch_size, self.num_attention_heads, seq_len, self.attention_head_size)}, but is"
642
+ f" {attn_output.size()}"
643
+ )
644
+
645
+ attn_output = attn_output.transpose(1, 2)
646
+ attn_output = attn_output.reshape(batch_size, seq_len, self.all_head_size)
647
+
648
+ outputs = (attn_output,)
649
+
650
+ if self.is_decoder:
651
+ outputs = outputs + (past_key_value,)
652
+ return outputs
653
+
654
+
655
+ ROBERTA_ATTENTION_CLASSES = {
656
+ "eager": RobertaSelfAttention,
657
+ "sdpa": RobertaSdpaAttention,
658
+ "flash_attention_2": RobertaFlashAttention2,
659
+ }
660
+
661
+
662
+ # Copied from transformers.models.bert.modeling_bert.BertSelfOutput
663
+ class RobertaSelfOutput(nn.Module):
664
+ def __init__(self, config):
665
+ super().__init__()
666
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
667
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
668
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
669
+
670
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
671
+ hidden_states = self.dense(hidden_states)
672
+ hidden_states = self.dropout(hidden_states)
673
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
674
+ return hidden_states
675
+
676
+
677
+ class RobertaAttention(nn.Module):
678
+ def __init__(self, config, position_embedding_type=None):
679
+ super().__init__()
680
+ self.self = ROBERTA_ATTENTION_CLASSES[config._attn_implementation](
681
+ config,
682
+ position_embedding_type=position_embedding_type,
683
+ )
684
+ self.output = RobertaSelfOutput(config)
685
+ self.pruned_heads = set()
686
+
687
+ # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
688
+ def prune_heads(self, heads):
689
+ if len(heads) == 0:
690
+ return
691
+ heads, index = find_pruneable_heads_and_indices(
692
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
693
+ )
694
+
695
+ # Prune linear layers
696
+ self.self.query = prune_linear_layer(self.self.query, index)
697
+ self.self.key = prune_linear_layer(self.self.key, index)
698
+ self.self.value = prune_linear_layer(self.self.value, index)
699
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
700
+
701
+ # Update hyper params and store pruned heads
702
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
703
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
704
+ self.pruned_heads = self.pruned_heads.union(heads)
705
+
706
+ # Copied from transformers.models.bert.modeling_bert.BertAttention.forward
707
+ def forward(
708
+ self,
709
+ hidden_states: torch.Tensor,
710
+ attention_mask: Optional[torch.FloatTensor] = None,
711
+ head_mask: Optional[torch.FloatTensor] = None,
712
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
713
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
714
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
715
+ output_attentions: Optional[bool] = False,
716
+ ) -> Tuple[torch.Tensor]:
717
+ self_outputs = self.self(
718
+ hidden_states,
719
+ attention_mask,
720
+ head_mask,
721
+ encoder_hidden_states,
722
+ encoder_attention_mask,
723
+ past_key_value,
724
+ output_attentions,
725
+ )
726
+ attention_output = self.output(self_outputs[0], hidden_states)
727
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
728
+ return outputs
729
+
730
+
731
+ # Copied from transformers.models.bert.modeling_bert.BertIntermediate
732
+ class RobertaIntermediate(nn.Module):
733
+ def __init__(self, config):
734
+ super().__init__()
735
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
736
+ if isinstance(config.hidden_act, str):
737
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
738
+ else:
739
+ self.intermediate_act_fn = config.hidden_act
740
+
741
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
742
+ hidden_states = self.dense(hidden_states)
743
+ hidden_states = self.intermediate_act_fn(hidden_states)
744
+ return hidden_states
745
+
746
+
747
+ # Copied from transformers.models.bert.modeling_bert.BertOutput
748
+ class RobertaOutput(nn.Module):
749
+ def __init__(self, config):
750
+ super().__init__()
751
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
752
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
753
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
754
+
755
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
756
+ hidden_states = self.dense(hidden_states)
757
+ hidden_states = self.dropout(hidden_states)
758
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
759
+ return hidden_states
760
+
761
+
762
+ # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta
763
+ class RobertaLayer(nn.Module):
764
+ def __init__(self, config):
765
+ super().__init__()
766
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
767
+ self.seq_len_dim = 1
768
+ self.attention = RobertaAttention(config)
769
+ self.is_decoder = config.is_decoder
770
+ self.add_cross_attention = config.add_cross_attention
771
+ if self.add_cross_attention:
772
+ if not self.is_decoder:
773
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
774
+ self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
775
+ self.intermediate = RobertaIntermediate(config)
776
+ self.output = RobertaOutput(config)
777
+
778
+ def forward(
779
+ self,
780
+ hidden_states: torch.Tensor,
781
+ attention_mask: Optional[torch.FloatTensor] = None,
782
+ head_mask: Optional[torch.FloatTensor] = None,
783
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
784
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
785
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
786
+ output_attentions: Optional[bool] = False,
787
+ ) -> Tuple[torch.Tensor]:
788
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
789
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
790
+ self_attention_outputs = self.attention(
791
+ hidden_states,
792
+ attention_mask,
793
+ head_mask,
794
+ output_attentions=output_attentions,
795
+ past_key_value=self_attn_past_key_value,
796
+ )
797
+ attention_output = self_attention_outputs[0]
798
+
799
+ # if decoder, the last output is tuple of self-attn cache
800
+ if self.is_decoder:
801
+ outputs = self_attention_outputs[1:-1]
802
+ present_key_value = self_attention_outputs[-1]
803
+ else:
804
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
805
+
806
+ cross_attn_present_key_value = None
807
+ if self.is_decoder and encoder_hidden_states is not None:
808
+ if not hasattr(self, "crossattention"):
809
+ raise ValueError(
810
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
811
+ " by setting `config.add_cross_attention=True`"
812
+ )
813
+
814
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
815
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
816
+ cross_attention_outputs = self.crossattention(
817
+ attention_output,
818
+ attention_mask,
819
+ head_mask,
820
+ encoder_hidden_states,
821
+ encoder_attention_mask,
822
+ cross_attn_past_key_value,
823
+ output_attentions,
824
+ )
825
+ attention_output = cross_attention_outputs[0]
826
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
827
+
828
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
829
+ cross_attn_present_key_value = cross_attention_outputs[-1]
830
+ present_key_value = present_key_value + cross_attn_present_key_value
831
+
832
+ layer_output = apply_chunking_to_forward(
833
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
834
+ )
835
+ outputs = (layer_output,) + outputs
836
+
837
+ # if decoder, return the attn key/values as the last output
838
+ if self.is_decoder:
839
+ outputs = outputs + (present_key_value,)
840
+
841
+ return outputs
842
+
843
+ def feed_forward_chunk(self, attention_output):
844
+ intermediate_output = self.intermediate(attention_output)
845
+ layer_output = self.output(intermediate_output, attention_output)
846
+ return layer_output
847
+
848
+
849
+ # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta
850
+ class RobertaEncoder(nn.Module):
851
+ def __init__(self, config):
852
+ super().__init__()
853
+ self.config = config
854
+ self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
855
+ self.gradient_checkpointing = False
856
+
857
+ def forward(
858
+ self,
859
+ hidden_states: torch.Tensor,
860
+ attention_mask: Optional[torch.FloatTensor] = None,
861
+ head_mask: Optional[torch.FloatTensor] = None,
862
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
863
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
864
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
865
+ use_cache: Optional[bool] = None,
866
+ output_attentions: Optional[bool] = False,
867
+ output_hidden_states: Optional[bool] = False,
868
+ return_dict: Optional[bool] = True,
869
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
870
+ all_hidden_states = () if output_hidden_states else None
871
+ all_self_attentions = () if output_attentions else None
872
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
873
+
874
+ if self.gradient_checkpointing and self.training:
875
+ if use_cache:
876
+ logger.warning_once(
877
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
878
+ )
879
+ use_cache = False
880
+
881
+ next_decoder_cache = () if use_cache else None
882
+ for i, layer_module in enumerate(self.layer):
883
+ if output_hidden_states:
884
+ all_hidden_states = all_hidden_states + (hidden_states,)
885
+
886
+ layer_head_mask = head_mask[i] if head_mask is not None else None
887
+ past_key_value = past_key_values[i] if past_key_values is not None else None
888
+
889
+ if self.gradient_checkpointing and self.training:
890
+ layer_outputs = self._gradient_checkpointing_func(
891
+ layer_module.__call__,
892
+ hidden_states,
893
+ attention_mask,
894
+ layer_head_mask,
895
+ encoder_hidden_states,
896
+ encoder_attention_mask,
897
+ past_key_value,
898
+ output_attentions,
899
+ )
900
+ else:
901
+ layer_outputs = layer_module(
902
+ hidden_states,
903
+ attention_mask,
904
+ layer_head_mask,
905
+ encoder_hidden_states,
906
+ encoder_attention_mask,
907
+ past_key_value,
908
+ output_attentions,
909
+ )
910
+
911
+ hidden_states = layer_outputs[0]
912
+ if use_cache:
913
+ next_decoder_cache += (layer_outputs[-1],)
914
+ if output_attentions:
915
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
916
+ if self.config.add_cross_attention:
917
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
918
+
919
+ if output_hidden_states:
920
+ all_hidden_states = all_hidden_states + (hidden_states,)
921
+
922
+ if not return_dict:
923
+ return tuple(
924
+ v
925
+ for v in [
926
+ hidden_states,
927
+ next_decoder_cache,
928
+ all_hidden_states,
929
+ all_self_attentions,
930
+ all_cross_attentions,
931
+ ]
932
+ if v is not None
933
+ )
934
+ return BaseModelOutputWithPastAndCrossAttentions(
935
+ last_hidden_state=hidden_states,
936
+ past_key_values=next_decoder_cache,
937
+ hidden_states=all_hidden_states,
938
+ attentions=all_self_attentions,
939
+ cross_attentions=all_cross_attentions,
940
+ )
941
+
942
+
943
+ # Copied from transformers.models.bert.modeling_bert.BertPooler
944
+ class RobertaPooler(nn.Module):
945
+ def __init__(self, config):
946
+ super().__init__()
947
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
948
+ self.activation = nn.Tanh()
949
+
950
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
951
+ # We "pool" the model by simply taking the hidden state corresponding
952
+ # to the first token.
953
+ first_token_tensor = hidden_states[:, 0]
954
+ pooled_output = self.dense(first_token_tensor)
955
+ pooled_output = self.activation(pooled_output)
956
+ return pooled_output
957
+
958
+
959
+ class RobertaPreTrainedModel(PreTrainedModel):
960
+ """
961
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
962
+ models.
963
+ """
964
+
965
+ config_class = RobertaConfig
966
+ base_model_prefix = "roberta"
967
+ supports_gradient_checkpointing = True
968
+ _no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention"]
969
+ _supports_flash_attn_2 = True
970
+ _supports_sdpa = True
971
+
972
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
973
+ def _init_weights(self, module):
974
+ """Initialize the weights"""
975
+ if isinstance(module, nn.Linear):
976
+ # Slightly different from the TF version which uses truncated_normal for initialization
977
+ # cf https://github.com/pytorch/pytorch/pull/5617
978
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
979
+ if module.bias is not None:
980
+ module.bias.data.zero_()
981
+ elif isinstance(module, nn.Embedding):
982
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
983
+ if module.padding_idx is not None:
984
+ module.weight.data[module.padding_idx].zero_()
985
+ elif isinstance(module, nn.LayerNorm):
986
+ module.bias.data.zero_()
987
+ module.weight.data.fill_(1.0)
988
+
989
+
990
+ ROBERTA_START_DOCSTRING = r"""
991
+
992
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
993
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
994
+ etc.)
995
+
996
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
997
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
998
+ and behavior.
999
+
1000
+ Parameters:
1001
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the
1002
+ model. Initializing with a config file does not load the weights associated with the model, only the
1003
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1004
+ """
1005
+
1006
+ ROBERTA_INPUTS_DOCSTRING = r"""
1007
+ Args:
1008
+ input_ids (`torch.LongTensor` of shape `({0})`):
1009
+ Indices of input sequence tokens in the vocabulary.
1010
+
1011
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1012
+ [`PreTrainedTokenizer.__call__`] for details.
1013
+
1014
+ [What are input IDs?](../glossary#input-ids)
1015
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
1016
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1017
+
1018
+ - 1 for tokens that are **not masked**,
1019
+ - 0 for tokens that are **masked**.
1020
+
1021
+ [What are attention masks?](../glossary#attention-mask)
1022
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1023
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
1024
+
1025
+ - 0 corresponds to a *sentence A* token,
1026
+ - 1 corresponds to a *sentence B* token.
1027
+ This parameter can only be used when the model is initialized with a `type_vocab_size` value >= 2. All values
1028
+ in this tensor should always be < `type_vocab_size`.
1029
+
1030
+ [What are token type IDs?](../glossary#token-type-ids)
1031
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1032
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
1033
+ config.max_position_embeddings - 1]`.
1034
+
1035
+ [What are position IDs?](../glossary#position-ids)
1036
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1037
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
1038
+
1039
+ - 1 indicates the head is **not masked**,
1040
+ - 0 indicates the head is **masked**.
1041
+
1042
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
1043
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1044
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1045
+ model's internal embedding lookup matrix.
1046
+ output_attentions (`bool`, *optional*):
1047
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1048
+ tensors for more detail.
1049
+ output_hidden_states (`bool`, *optional*):
1050
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1051
+ more detail.
1052
+ return_dict (`bool`, *optional*):
1053
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1054
+ """
1055
+
1056
+
1057
+ @add_start_docstrings(
1058
+ "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
1059
+ ROBERTA_START_DOCSTRING,
1060
+ )
1061
+ class RobertaModel(RobertaPreTrainedModel):
1062
+ """
1063
+
1064
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
1065
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
1066
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
1067
+ Kaiser and Illia Polosukhin.
1068
+
1069
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
1070
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
1071
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
1072
+
1073
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
1074
+
1075
+ """
1076
+
1077
+ # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
1078
+ def __init__(self, config, add_pooling_layer=True):
1079
+ super().__init__(config)
1080
+ self.config = config
1081
+
1082
+ self.embeddings = RobertaEmbeddings(config)
1083
+ self.encoder = RobertaEncoder(config)
1084
+
1085
+ self.pooler = RobertaPooler(config) if add_pooling_layer else None
1086
+
1087
+ # Initialize weights and apply final processing
1088
+ self.post_init()
1089
+
1090
+ def get_input_embeddings(self):
1091
+ return self.embeddings.word_embeddings
1092
+
1093
+ def set_input_embeddings(self, value):
1094
+ self.embeddings.word_embeddings = value
1095
+
1096
+ def _prune_heads(self, heads_to_prune):
1097
+ """
1098
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
1099
+ class PreTrainedModel
1100
+ """
1101
+ for layer, heads in heads_to_prune.items():
1102
+ self.encoder.layer[layer].attention.prune_heads(heads)
1103
+
1104
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1105
+ @add_code_sample_docstrings(
1106
+ checkpoint=_CHECKPOINT_FOR_DOC,
1107
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
1108
+ config_class=_CONFIG_FOR_DOC,
1109
+ )
1110
+ # Copied from transformers.models.bert.modeling_bert.BertModel.forward
1111
+ def forward(
1112
+ self,
1113
+ input_ids: Optional[torch.Tensor] = None,
1114
+ attention_mask: Optional[torch.Tensor] = None,
1115
+ token_type_ids: Optional[torch.Tensor] = None,
1116
+ position_ids: Optional[torch.Tensor] = None,
1117
+ head_mask: Optional[torch.Tensor] = None,
1118
+ inputs_embeds: Optional[torch.Tensor] = None,
1119
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1120
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1121
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1122
+ use_cache: Optional[bool] = None,
1123
+ output_attentions: Optional[bool] = None,
1124
+ output_hidden_states: Optional[bool] = None,
1125
+ return_dict: Optional[bool] = None,
1126
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
1127
+ r"""
1128
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1129
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1130
+ the model is configured as a decoder.
1131
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1132
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1133
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
1134
+
1135
+ - 1 for tokens that are **not masked**,
1136
+ - 0 for tokens that are **masked**.
1137
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1138
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1139
+
1140
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1141
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1142
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1143
+ use_cache (`bool`, *optional*):
1144
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1145
+ `past_key_values`).
1146
+ """
1147
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1148
+ output_hidden_states = (
1149
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1150
+ )
1151
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1152
+
1153
+ if self.config.is_decoder:
1154
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1155
+ else:
1156
+ use_cache = False
1157
+
1158
+ if input_ids is not None and inputs_embeds is not None:
1159
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1160
+ elif input_ids is not None:
1161
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
1162
+ input_shape = input_ids.size()
1163
+ elif inputs_embeds is not None:
1164
+ input_shape = inputs_embeds.size()[:-1]
1165
+ else:
1166
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1167
+
1168
+ batch_size, seq_length = input_shape
1169
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1170
+
1171
+ # past_key_values_length
1172
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1173
+
1174
+ if attention_mask is None:
1175
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
1176
+
1177
+ if token_type_ids is None:
1178
+ if hasattr(self.embeddings, "token_type_ids"):
1179
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
1180
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
1181
+ token_type_ids = buffered_token_type_ids_expanded
1182
+ else:
1183
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
1184
+
1185
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
1186
+ # ourselves in which case we just need to make it broadcastable to all heads.
1187
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
1188
+
1189
+ # If a 2D or 3D attention mask is provided for the cross-attention
1190
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
1191
+ if self.config.is_decoder and encoder_hidden_states is not None:
1192
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
1193
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
1194
+ if encoder_attention_mask is None:
1195
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
1196
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
1197
+ else:
1198
+ encoder_extended_attention_mask = None
1199
+
1200
+ # Prepare head mask if needed
1201
+ # 1.0 in head_mask indicates we keep the head
1202
+ # attention_probs has shape bsz x n_heads x N x N
1203
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1204
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1205
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1206
+
1207
+ embedding_output = self.embeddings(
1208
+ input_ids=input_ids,
1209
+ position_ids=position_ids,
1210
+ token_type_ids=token_type_ids,
1211
+ inputs_embeds=inputs_embeds,
1212
+ past_key_values_length=past_key_values_length,
1213
+ )
1214
+ encoder_outputs = self.encoder(
1215
+ embedding_output,
1216
+ attention_mask=extended_attention_mask,
1217
+ head_mask=head_mask,
1218
+ encoder_hidden_states=encoder_hidden_states,
1219
+ encoder_attention_mask=encoder_extended_attention_mask,
1220
+ past_key_values=past_key_values,
1221
+ use_cache=use_cache,
1222
+ output_attentions=output_attentions,
1223
+ output_hidden_states=output_hidden_states,
1224
+ return_dict=return_dict,
1225
+ )
1226
+ sequence_output = encoder_outputs[0]
1227
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1228
+
1229
+ if not return_dict:
1230
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1231
+
1232
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1233
+ last_hidden_state=sequence_output,
1234
+ pooler_output=pooled_output,
1235
+ past_key_values=encoder_outputs.past_key_values,
1236
+ hidden_states=encoder_outputs.hidden_states,
1237
+ attentions=encoder_outputs.attentions,
1238
+ cross_attentions=encoder_outputs.cross_attentions,
1239
+ )
1240
+
1241
+
1242
+ @add_start_docstrings(
1243
+ """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
1244
+ )
1245
+ class RobertaForCausalLM(RobertaPreTrainedModel):
1246
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1247
+
1248
+ def __init__(self, config):
1249
+ super().__init__(config)
1250
+
1251
+ if not config.is_decoder:
1252
+ logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
1253
+
1254
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1255
+ self.lm_head = RobertaLMHead(config)
1256
+
1257
+ # Initialize weights and apply final processing
1258
+ self.post_init()
1259
+
1260
+ def get_output_embeddings(self):
1261
+ return self.lm_head.decoder
1262
+
1263
+ def set_output_embeddings(self, new_embeddings):
1264
+ self.lm_head.decoder = new_embeddings
1265
+
1266
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1267
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
1268
+ def forward(
1269
+ self,
1270
+ input_ids: Optional[torch.LongTensor] = None,
1271
+ attention_mask: Optional[torch.FloatTensor] = None,
1272
+ token_type_ids: Optional[torch.LongTensor] = None,
1273
+ position_ids: Optional[torch.LongTensor] = None,
1274
+ head_mask: Optional[torch.FloatTensor] = None,
1275
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1276
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1277
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1278
+ labels: Optional[torch.LongTensor] = None,
1279
+ past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
1280
+ use_cache: Optional[bool] = None,
1281
+ output_attentions: Optional[bool] = None,
1282
+ output_hidden_states: Optional[bool] = None,
1283
+ return_dict: Optional[bool] = None,
1284
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
1285
+ r"""
1286
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1287
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1288
+ the model is configured as a decoder.
1289
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1290
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1291
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
1292
+
1293
+ - 1 for tokens that are **not masked**,
1294
+ - 0 for tokens that are **masked**.
1295
+
1296
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1297
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1298
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
1299
+ ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1300
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1301
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1302
+
1303
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1304
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1305
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1306
+ use_cache (`bool`, *optional*):
1307
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1308
+ `past_key_values`).
1309
+
1310
+ Returns:
1311
+
1312
+ Example:
1313
+
1314
+ ```python
1315
+ >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
1316
+ >>> import torch
1317
+
1318
+ >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
1319
+ >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
1320
+ >>> config.is_decoder = True
1321
+ >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
1322
+
1323
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1324
+ >>> outputs = model(**inputs)
1325
+
1326
+ >>> prediction_logits = outputs.logits
1327
+ ```"""
1328
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1329
+ if labels is not None:
1330
+ use_cache = False
1331
+
1332
+ outputs = self.roberta(
1333
+ input_ids,
1334
+ attention_mask=attention_mask,
1335
+ token_type_ids=token_type_ids,
1336
+ position_ids=position_ids,
1337
+ head_mask=head_mask,
1338
+ inputs_embeds=inputs_embeds,
1339
+ encoder_hidden_states=encoder_hidden_states,
1340
+ encoder_attention_mask=encoder_attention_mask,
1341
+ past_key_values=past_key_values,
1342
+ use_cache=use_cache,
1343
+ output_attentions=output_attentions,
1344
+ output_hidden_states=output_hidden_states,
1345
+ return_dict=return_dict,
1346
+ )
1347
+
1348
+ sequence_output = outputs[0]
1349
+ prediction_scores = self.lm_head(sequence_output)
1350
+
1351
+ lm_loss = None
1352
+ if labels is not None:
1353
+ # move labels to correct device to enable model parallelism
1354
+ labels = labels.to(prediction_scores.device)
1355
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1356
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
1357
+ labels = labels[:, 1:].contiguous()
1358
+ loss_fct = CrossEntropyLoss()
1359
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1360
+
1361
+ if not return_dict:
1362
+ output = (prediction_scores,) + outputs[2:]
1363
+ return ((lm_loss,) + output) if lm_loss is not None else output
1364
+
1365
+ return CausalLMOutputWithCrossAttentions(
1366
+ loss=lm_loss,
1367
+ logits=prediction_scores,
1368
+ past_key_values=outputs.past_key_values,
1369
+ hidden_states=outputs.hidden_states,
1370
+ attentions=outputs.attentions,
1371
+ cross_attentions=outputs.cross_attentions,
1372
+ )
1373
+
1374
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
1375
+ input_shape = input_ids.shape
1376
+ # if the model is used as a decoder in an encoder-decoder model, the decoder attention mask is created on the fly
1377
+ if attention_mask is None:
1378
+ attention_mask = input_ids.new_ones(input_shape)
1379
+
1380
+ # cut decoder_input_ids if past_key_values is used
1381
+ if past_key_values is not None:
1382
+ past_length = past_key_values[0][0].shape[2]
1383
+
1384
+ # Some generation methods already pass only the last input ID
1385
+ if input_ids.shape[1] > past_length:
1386
+ remove_prefix_length = past_length
1387
+ else:
1388
+ # Default to old behavior: keep only final ID
1389
+ remove_prefix_length = input_ids.shape[1] - 1
1390
+
1391
+ input_ids = input_ids[:, remove_prefix_length:]
1392
+
1393
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
1394
+
1395
+ def _reorder_cache(self, past_key_values, beam_idx):
1396
+ reordered_past = ()
1397
+ for layer_past in past_key_values:
1398
+ reordered_past += (
1399
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1400
+ )
1401
+ return reordered_past
1402
+
1403
+
1404
+ @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
1405
+ class RobertaForMaskedLM(RobertaPreTrainedModel):
1406
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1407
+
1408
+ def __init__(self, config):
1409
+ super().__init__(config)
1410
+
1411
+ if config.is_decoder:
1412
+ logger.warning(
1413
+ "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
1414
+ "bi-directional self-attention."
1415
+ )
1416
+
1417
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1418
+ self.lm_head = RobertaLMHead(config)
1419
+
1420
+ # Initialize weights and apply final processing
1421
+ self.post_init()
1422
+
1423
+ def get_output_embeddings(self):
1424
+ return self.lm_head.decoder
1425
+
1426
+ def set_output_embeddings(self, new_embeddings):
1427
+ self.lm_head.decoder = new_embeddings
1428
+
1429
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1430
+ @add_code_sample_docstrings(
1431
+ checkpoint=_CHECKPOINT_FOR_DOC,
1432
+ output_type=MaskedLMOutput,
1433
+ config_class=_CONFIG_FOR_DOC,
1434
+ mask="<mask>",
1435
+ expected_output="' Paris'",
1436
+ expected_loss=0.1,
1437
+ )
1438
+ def forward(
1439
+ self,
1440
+ input_ids: Optional[torch.LongTensor] = None,
1441
+ attention_mask: Optional[torch.FloatTensor] = None,
1442
+ token_type_ids: Optional[torch.LongTensor] = None,
1443
+ position_ids: Optional[torch.LongTensor] = None,
1444
+ head_mask: Optional[torch.FloatTensor] = None,
1445
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1446
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1447
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1448
+ labels: Optional[torch.LongTensor] = None,
1449
+ output_attentions: Optional[bool] = None,
1450
+ output_hidden_states: Optional[bool] = None,
1451
+ return_dict: Optional[bool] = None,
1452
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
1453
+ r"""
1454
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1455
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1456
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
1457
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1458
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
1459
+ Used to hide legacy arguments that have been deprecated.
1460
+ """
1461
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1462
+
1463
+ outputs = self.roberta(
1464
+ input_ids,
1465
+ attention_mask=attention_mask,
1466
+ token_type_ids=token_type_ids,
1467
+ position_ids=position_ids,
1468
+ head_mask=head_mask,
1469
+ inputs_embeds=inputs_embeds,
1470
+ encoder_hidden_states=encoder_hidden_states,
1471
+ encoder_attention_mask=encoder_attention_mask,
1472
+ output_attentions=output_attentions,
1473
+ output_hidden_states=output_hidden_states,
1474
+ return_dict=return_dict,
1475
+ )
1476
+ sequence_output = outputs[0]
1477
+ prediction_scores = self.lm_head(sequence_output)
1478
+
1479
+ masked_lm_loss = None
1480
+ if labels is not None:
1481
+ # move labels to correct device to enable model parallelism
1482
+ labels = labels.to(prediction_scores.device)
1483
+ loss_fct = CrossEntropyLoss()
1484
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1485
+
1486
+ if not return_dict:
1487
+ output = (prediction_scores,) + outputs[2:]
1488
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1489
+
1490
+ return MaskedLMOutput(
1491
+ loss=masked_lm_loss,
1492
+ logits=prediction_scores,
1493
+ hidden_states=outputs.hidden_states,
1494
+ attentions=outputs.attentions,
1495
+ )
1496
+
1497
+
1498
+ class RobertaLMHead(nn.Module):
1499
+ """Roberta Head for masked language modeling."""
1500
+
1501
+ def __init__(self, config):
1502
+ super().__init__()
1503
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1504
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1505
+
1506
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
1507
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
1508
+ self.decoder.bias = self.bias
1509
+
1510
+ def forward(self, features, **kwargs):
1511
+ x = self.dense(features)
1512
+ x = gelu(x)
1513
+ x = self.layer_norm(x)
1514
+
1515
+ # project back to size of vocabulary with bias
1516
+ x = self.decoder(x)
1517
+
1518
+ return x
1519
+
1520
+ def _tie_weights(self):
1521
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
1522
+ # For accelerate compatibility and to not break backward compatibility
1523
+ if self.decoder.bias.device.type == "meta":
1524
+ self.decoder.bias = self.bias
1525
+ else:
1526
+ self.bias = self.decoder.bias
1527
+
1528
+
1529
+ @add_start_docstrings(
1530
+ """
1531
+ RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1532
+ pooled output) e.g. for GLUE tasks.
1533
+ """,
1534
+ ROBERTA_START_DOCSTRING,
1535
+ )
1536
+ class RobertaForSequenceClassification(RobertaPreTrainedModel):
1537
+ def __init__(self, config):
1538
+ super().__init__(config)
1539
+ self.num_labels = config.num_labels
1540
+ self.config = config
1541
+
1542
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1543
+ self.classifier = RobertaClassificationHead(config)
1544
+
1545
+ # Initialize weights and apply final processing
1546
+ self.post_init()
1547
+
1548
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1549
+ @add_code_sample_docstrings(
1550
+ checkpoint="cardiffnlp/twitter-roberta-base-emotion",
1551
+ output_type=SequenceClassifierOutput,
1552
+ config_class=_CONFIG_FOR_DOC,
1553
+ expected_output="'optimism'",
1554
+ expected_loss=0.08,
1555
+ )
1556
+ def forward(
1557
+ self,
1558
+ input_ids: Optional[torch.LongTensor] = None,
1559
+ attention_mask: Optional[torch.FloatTensor] = None,
1560
+ token_type_ids: Optional[torch.LongTensor] = None,
1561
+ position_ids: Optional[torch.LongTensor] = None,
1562
+ head_mask: Optional[torch.FloatTensor] = None,
1563
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1564
+ labels: Optional[torch.LongTensor] = None,
1565
+ output_attentions: Optional[bool] = None,
1566
+ output_hidden_states: Optional[bool] = None,
1567
+ return_dict: Optional[bool] = None,
1568
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
1569
+ r"""
1570
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1571
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1572
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
1573
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1574
+ """
1575
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1576
+
1577
+ outputs = self.roberta(
1578
+ input_ids,
1579
+ attention_mask=attention_mask,
1580
+ token_type_ids=token_type_ids,
1581
+ position_ids=position_ids,
1582
+ head_mask=head_mask,
1583
+ inputs_embeds=inputs_embeds,
1584
+ output_attentions=output_attentions,
1585
+ output_hidden_states=output_hidden_states,
1586
+ return_dict=return_dict,
1587
+ )
1588
+ sequence_output = outputs[0]
1589
+ logits = self.classifier(sequence_output)
1590
+
1591
+ loss = None
1592
+ if labels is not None:
1593
+ # move labels to correct device to enable model parallelism
1594
+ labels = labels.to(logits.device)
1595
+ if self.config.problem_type is None:
1596
+ if self.num_labels == 1:
1597
+ self.config.problem_type = "regression"
1598
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1599
+ self.config.problem_type = "single_label_classification"
1600
+ else:
1601
+ self.config.problem_type = "multi_label_classification"
1602
+
1603
+ if self.config.problem_type == "regression":
1604
+ loss_fct = MSELoss()
1605
+ if self.num_labels == 1:
1606
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1607
+ else:
1608
+ loss = loss_fct(logits, labels)
1609
+ elif self.config.problem_type == "single_label_classification":
1610
+ loss_fct = CrossEntropyLoss()
1611
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1612
+ elif self.config.problem_type == "multi_label_classification":
1613
+ loss_fct = BCEWithLogitsLoss()
1614
+ loss = loss_fct(logits, labels)
1615
+
1616
+ if not return_dict:
1617
+ output = (logits,) + outputs[2:]
1618
+ return ((loss,) + output) if loss is not None else output
1619
+
1620
+ return SequenceClassifierOutput(
1621
+ loss=loss,
1622
+ logits=logits,
1623
+ hidden_states=outputs.hidden_states,
1624
+ attentions=outputs.attentions,
1625
+ )
1626
+
1627
+
1628
+ @add_start_docstrings(
1629
+ """
1630
+ Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1631
+ softmax) e.g. for RocStories/SWAG tasks.
1632
+ """,
1633
+ ROBERTA_START_DOCSTRING,
1634
+ )
1635
+ class RobertaForMultipleChoice(RobertaPreTrainedModel):
1636
+ def __init__(self, config):
1637
+ super().__init__(config)
1638
+
1639
+ self.roberta = RobertaModel(config)
1640
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1641
+ self.classifier = nn.Linear(config.hidden_size, 1)
1642
+
1643
+ # Initialize weights and apply final processing
1644
+ self.post_init()
1645
+
1646
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
1647
+ @add_code_sample_docstrings(
1648
+ checkpoint=_CHECKPOINT_FOR_DOC,
1649
+ output_type=MultipleChoiceModelOutput,
1650
+ config_class=_CONFIG_FOR_DOC,
1651
+ )
1652
+ def forward(
1653
+ self,
1654
+ input_ids: Optional[torch.LongTensor] = None,
1655
+ token_type_ids: Optional[torch.LongTensor] = None,
1656
+ attention_mask: Optional[torch.FloatTensor] = None,
1657
+ labels: Optional[torch.LongTensor] = None,
1658
+ position_ids: Optional[torch.LongTensor] = None,
1659
+ head_mask: Optional[torch.FloatTensor] = None,
1660
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1661
+ output_attentions: Optional[bool] = None,
1662
+ output_hidden_states: Optional[bool] = None,
1663
+ return_dict: Optional[bool] = None,
1664
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1665
+ r"""
1666
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1667
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1668
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1669
+ `input_ids` above)
1670
+ """
1671
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1672
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1673
+
1674
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1675
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1676
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1677
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1678
+ flat_inputs_embeds = (
1679
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1680
+ if inputs_embeds is not None
1681
+ else None
1682
+ )
1683
+
1684
+ outputs = self.roberta(
1685
+ flat_input_ids,
1686
+ position_ids=flat_position_ids,
1687
+ token_type_ids=flat_token_type_ids,
1688
+ attention_mask=flat_attention_mask,
1689
+ head_mask=head_mask,
1690
+ inputs_embeds=flat_inputs_embeds,
1691
+ output_attentions=output_attentions,
1692
+ output_hidden_states=output_hidden_states,
1693
+ return_dict=return_dict,
1694
+ )
1695
+ pooled_output = outputs[1]
1696
+
1697
+ pooled_output = self.dropout(pooled_output)
1698
+ logits = self.classifier(pooled_output)
1699
+ reshaped_logits = logits.view(-1, num_choices)
1700
+
1701
+ loss = None
1702
+ if labels is not None:
1703
+ # move labels to correct device to enable model parallelism
1704
+ labels = labels.to(reshaped_logits.device)
1705
+ loss_fct = CrossEntropyLoss()
1706
+ loss = loss_fct(reshaped_logits, labels)
1707
+
1708
+ if not return_dict:
1709
+ output = (reshaped_logits,) + outputs[2:]
1710
+ return ((loss,) + output) if loss is not None else output
1711
+
1712
+ return MultipleChoiceModelOutput(
1713
+ loss=loss,
1714
+ logits=reshaped_logits,
1715
+ hidden_states=outputs.hidden_states,
1716
+ attentions=outputs.attentions,
1717
+ )
1718
+
1719
+
1720
+ @add_start_docstrings(
1721
+ """
1722
+ Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1723
+ Named-Entity-Recognition (NER) tasks.
1724
+ """,
1725
+ ROBERTA_START_DOCSTRING,
1726
+ )
1727
+ class RobertaForTokenClassification(RobertaPreTrainedModel):
1728
+ def __init__(self, config):
1729
+ super().__init__(config)
1730
+ self.num_labels = config.num_labels
1731
+
1732
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1733
+ classifier_dropout = (
1734
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1735
+ )
1736
+ self.dropout = nn.Dropout(classifier_dropout)
1737
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1738
+
1739
+ # Initialize weights and apply final processing
1740
+ self.post_init()
1741
+
1742
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1743
+ @add_code_sample_docstrings(
1744
+ checkpoint="Jean-Baptiste/roberta-large-ner-english",
1745
+ output_type=TokenClassifierOutput,
1746
+ config_class=_CONFIG_FOR_DOC,
1747
+ expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
1748
+ expected_loss=0.01,
1749
+ )
1750
+ def forward(
1751
+ self,
1752
+ input_ids: Optional[torch.LongTensor] = None,
1753
+ attention_mask: Optional[torch.FloatTensor] = None,
1754
+ token_type_ids: Optional[torch.LongTensor] = None,
1755
+ position_ids: Optional[torch.LongTensor] = None,
1756
+ head_mask: Optional[torch.FloatTensor] = None,
1757
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1758
+ labels: Optional[torch.LongTensor] = None,
1759
+ output_attentions: Optional[bool] = None,
1760
+ output_hidden_states: Optional[bool] = None,
1761
+ return_dict: Optional[bool] = None,
1762
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1763
+ r"""
1764
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1765
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1766
+ """
1767
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1768
+
1769
+ outputs = self.roberta(
1770
+ input_ids,
1771
+ attention_mask=attention_mask,
1772
+ token_type_ids=token_type_ids,
1773
+ position_ids=position_ids,
1774
+ head_mask=head_mask,
1775
+ inputs_embeds=inputs_embeds,
1776
+ output_attentions=output_attentions,
1777
+ output_hidden_states=output_hidden_states,
1778
+ return_dict=return_dict,
1779
+ )
1780
+
1781
+ sequence_output = outputs[0]
1782
+
1783
+ sequence_output = self.dropout(sequence_output)
1784
+ logits = self.classifier(sequence_output)
1785
+
1786
+ loss = None
1787
+ if labels is not None:
1788
+ # move labels to correct device to enable model parallelism
1789
+ labels = labels.to(logits.device)
1790
+ loss_fct = CrossEntropyLoss()
1791
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1792
+
1793
+ if not return_dict:
1794
+ output = (logits,) + outputs[2:]
1795
+ return ((loss,) + output) if loss is not None else output
1796
+
1797
+ return TokenClassifierOutput(
1798
+ loss=loss,
1799
+ logits=logits,
1800
+ hidden_states=outputs.hidden_states,
1801
+ attentions=outputs.attentions,
1802
+ )
1803
+
1804
+
1805
+ class RobertaClassificationHead(nn.Module):
1806
+ """Head for sentence-level classification tasks."""
1807
+
1808
+ def __init__(self, config):
1809
+ super().__init__()
1810
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1811
+ classifier_dropout = (
1812
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1813
+ )
1814
+ self.dropout = nn.Dropout(classifier_dropout)
1815
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
1816
+
1817
+ def forward(self, features, **kwargs):
1818
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
1819
+ x = self.dropout(x)
1820
+ x = self.dense(x)
1821
+ x = torch.tanh(x)
1822
+ x = self.dropout(x)
1823
+ x = self.out_proj(x)
1824
+ return x
1825
+
1826
+
1827
+ @add_start_docstrings(
1828
+ """
1829
+ Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
1830
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
1831
+ """,
1832
+ ROBERTA_START_DOCSTRING,
1833
+ )
1834
+ class RobertaForQuestionAnswering(RobertaPreTrainedModel):
1835
+ def __init__(self, config):
1836
+ super().__init__(config)
1837
+ self.num_labels = config.num_labels
1838
+
1839
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1840
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1841
+
1842
+ # Initialize weights and apply final processing
1843
+ self.post_init()
1844
+
1845
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1846
+ @add_code_sample_docstrings(
1847
+ checkpoint="deepset/roberta-base-squad2",
1848
+ output_type=QuestionAnsweringModelOutput,
1849
+ config_class=_CONFIG_FOR_DOC,
1850
+ expected_output="' puppet'",
1851
+ expected_loss=0.86,
1852
+ )
1853
+ def forward(
1854
+ self,
1855
+ input_ids: Optional[torch.LongTensor] = None,
1856
+ attention_mask: Optional[torch.FloatTensor] = None,
1857
+ token_type_ids: Optional[torch.LongTensor] = None,
1858
+ position_ids: Optional[torch.LongTensor] = None,
1859
+ head_mask: Optional[torch.FloatTensor] = None,
1860
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1861
+ start_positions: Optional[torch.LongTensor] = None,
1862
+ end_positions: Optional[torch.LongTensor] = None,
1863
+ output_attentions: Optional[bool] = None,
1864
+ output_hidden_states: Optional[bool] = None,
1865
+ return_dict: Optional[bool] = None,
1866
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1867
+ r"""
1868
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1869
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1870
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1871
+ are not taken into account for computing the loss.
1872
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1873
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1874
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1875
+ are not taken into account for computing the loss.
1876
+ """
1877
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1878
+
1879
+ outputs = self.roberta(
1880
+ input_ids,
1881
+ attention_mask=attention_mask,
1882
+ token_type_ids=token_type_ids,
1883
+ position_ids=position_ids,
1884
+ head_mask=head_mask,
1885
+ inputs_embeds=inputs_embeds,
1886
+ output_attentions=output_attentions,
1887
+ output_hidden_states=output_hidden_states,
1888
+ return_dict=return_dict,
1889
+ )
1890
+
1891
+ sequence_output = outputs[0]
1892
+
1893
+ logits = self.qa_outputs(sequence_output)
1894
+ start_logits, end_logits = logits.split(1, dim=-1)
1895
+ start_logits = start_logits.squeeze(-1).contiguous()
1896
+ end_logits = end_logits.squeeze(-1).contiguous()
1897
+
1898
+ total_loss = None
1899
+ if start_positions is not None and end_positions is not None:
1900
+ # If we are on multi-GPU, splitting adds an extra dimension, so squeeze it out
1901
+ if len(start_positions.size()) > 1:
1902
+ start_positions = start_positions.squeeze(-1)
1903
+ if len(end_positions.size()) > 1:
1904
+ end_positions = end_positions.squeeze(-1)
1905
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
1906
+ ignored_index = start_logits.size(1)
1907
+ start_positions = start_positions.clamp(0, ignored_index)
1908
+ end_positions = end_positions.clamp(0, ignored_index)
1909
+
1910
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1911
+ start_loss = loss_fct(start_logits, start_positions)
1912
+ end_loss = loss_fct(end_logits, end_positions)
1913
+ total_loss = (start_loss + end_loss) / 2
1914
+
1915
+ if not return_dict:
1916
+ output = (start_logits, end_logits) + outputs[2:]
1917
+ return ((total_loss,) + output) if total_loss is not None else output
1918
+
1919
+ return QuestionAnsweringModelOutput(
1920
+ loss=total_loss,
1921
+ start_logits=start_logits,
1922
+ end_logits=end_logits,
1923
+ hidden_states=outputs.hidden_states,
1924
+ attentions=outputs.attentions,
1925
+ )
1926
+
1927
+
1928
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
1929
+ """
1930
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
1931
+ are ignored. This is modified from fairseq's `utils.make_positions`.
1932
+
1933
+ Args:
1934
+ input_ids (torch.Tensor): input ids; padding_idx (int): index of the padding token.
1935
+
1936
+ Returns: torch.Tensor
1937
+ """
1938
+ # The series of casts and type-conversions here is carefully balanced to work with both ONNX export and XLA.
1939
+ mask = input_ids.ne(padding_idx).int()
1940
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
1941
+ return incremental_indices.long() + padding_idx
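
The modeling file above mirrors the upstream RoBERTa implementation, so the checkpoint can be loaded through the Auto classes with remote code enabled. Below is a minimal usage sketch, not part of the commit: the repository id is a placeholder (`<repo-or-local-path>`), and the pair-scoring setup only illustrates the `num_labels == 1` regression path in `RobertaForSequenceClassification.forward` above.

```python
# Minimal sketch (not part of the commit). Assumptions: "<repo-or-local-path>" is a
# placeholder for this repository or a local clone, and the config maps the Auto
# classes to the bundled modeling_roberta.py (hence trust_remote_code=True).
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = "<repo-or-local-path>"  # placeholder, not the real repository id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Encode text pairs; with num_labels == 1 the classification head acts as a scorer
# (the "regression" branch of the problem_type logic in the forward pass above).
queries = ["how do transformers work"]
passages = ["The Transformer is a neural network architecture based on attention."]
inputs = tokenizer(queries, passages, padding=True, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    scores = model(**inputs).logits.squeeze(-1)
print(scores)
```
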
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
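
As a quick sanity check, the special-token declarations above can be verified at runtime; the snippet below is an illustrative sketch only, with the repository id again left as a placeholder.

```python
# Illustrative sketch (not part of the commit); "<repo-or-local-path>" is a placeholder.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<repo-or-local-path>")
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)   # <s> </s> <pad>
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.mask_token)  # <s> </s> <mask>

# <mask> is declared with lstrip=True, so it absorbs the space in front of it
# and stays a single token inside an encoded sequence.
ids = tokenizer("Paris is the <mask> of France.")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
```
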
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "128000": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "max_length": 512,
52
+ "model_max_length": 512,
53
+ "pad_to_multiple_of": null,
54
+ "pad_token": "<pad>",
55
+ "pad_token_type_id": 0,
56
+ "padding_side": "right",
57
+ "sep_token": "</s>",
58
+ "stride": 0,
59
+ "tokenizer_class": "RobertaTokenizer",
60
+ "trim_offsets": true,
61
+ "truncation_side": "right",
62
+ "truncation_strategy": "longest_first",
63
+ "unk_token": "<unk>"
64
+ }
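
The truncation and padding defaults declared above determine how long inputs are handled; a short sketch of the resulting behaviour follows (placeholder repository id, expected values hedged in the comments).

```python
# Sketch only (not part of the commit); "<repo-or-local-path>" is a placeholder.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<repo-or-local-path>")
print(tokenizer.model_max_length)  # expected: 512 (from "model_max_length")
print(tokenizer.padding_side)      # expected: "right"

batch = tokenizer(
    ["a short sentence", "a very long document " * 300],
    padding=True,      # right-pads the short sequence with <pad>
    truncation=True,   # cuts the long sequence down to model_max_length
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # expected: torch.Size([2, 512]) after truncation
```
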
unigram.json ADDED
The diff for this file is too large to render. See raw diff