import json
import copy
from transformers.configuration_utils import PretrainedConfig
__all__ = ['AbsModelConfig', 'ModelConfig']
class DebertaConfig(PretrainedConfig):
model_type = 'deberta-v2'
def __init__(self,
vocab_size=22669,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
relax_projection=0,
new_pos_ids=False,
initializer_range=0.02,
task_idx=None,
fp32_embedding=False,
ffn_type=0,
label_smoothing=None,
num_qkv=0,
seg_emb=False,
**kwargs):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.relax_projection = relax_projection
self.new_pos_ids = new_pos_ids
self.initializer_range = initializer_range
self.task_idx = task_idx
self.fp32_embedding = fp32_embedding
self.ffn_type = ffn_type
self.label_smoothing = label_smoothing
self.num_qkv = num_qkv
self.seg_emb = seg_emb
super().__init__(**kwargs)
# @classmethod
# def from_dict(cls, json_object):
# """Constructs a `BertConfig` from a Python dictionary of parameters."""
# config = DebertaConfig(vocab_size_or_config_json_file=-1)
# for key, value in json_object.items():
# config.__dict__[key] = value
# return config
# @classmethod
# def from_json_file(cls, json_file):
# """Constructs a `BertConfig` from a json file of parameters."""
# with open(json_file, "r", encoding='utf-8') as reader:
# text = reader.read()
# return cls.from_dict(json.loads(text))
# def __repr__(self):
# return str(self.to_json_string())
# def to_dict(self):
# """Serializes this instance to a Python dictionary."""
# output = copy.deepcopy(self.__dict__)
# return output
# def to_json_string(self):
# """Serializes this instance to a JSON string."""
# return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
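# Illustrative usage sketch (not part of the original module): it shows how a
# DebertaConfig can be built with overridden sizes and serialized through the
# machinery inherited from transformers.PretrainedConfig. The helper name and
# the chosen values are assumptions for demonstration only.
def _example_deberta_config():
    config = DebertaConfig(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16)
    # Every constructor argument is stored as a plain attribute on the instance.
    assert config.hidden_size == 1024
    # Serialization helpers such as to_dict()/to_json_string() come from PretrainedConfig.
    print(config.to_json_string())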
class AbsModelConfig(object):
def __init__(self):
pass
@classmethod
def from_dict(cls, json_object):
"""Constructs a `ModelConfig` from a Python dictionary of parameters."""
config = cls()
for key, value in json_object.items():
if isinstance(value, dict):
value = AbsModelConfig.from_dict(value)
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `ModelConfig` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
def __repr__(self):
return str(self.to_json_string())
def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
def to_json_string(self):
"""Serializes this instance to a JSON string."""
def _json_default(obj):
if isinstance(obj, AbsModelConfig):
return obj.__dict__
return json.dumps(self.__dict__, indent=2, sort_keys=True, default=_json_default) + "\n"
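# Illustrative usage sketch (not part of the original module): it demonstrates
# how AbsModelConfig.from_dict recursively wraps nested dictionaries and how
# to_json_string unwraps them again through the `default=` hook above. The
# helper name and the sample dictionary are assumptions for demonstration only.
def _example_abs_model_config():
    raw = {
        "hidden_size": 768,
        "pooling": {"dropout": 0.1, "hidden_act": "gelu"},  # nested section
    }
    config = AbsModelConfig.from_dict(raw)
    # Nested dictionaries become nested AbsModelConfig instances.
    assert isinstance(config.pooling, AbsModelConfig)
    assert config.pooling.dropout == 0.1
    # Nested configs are serialized back to plain JSON objects.
    print(config.to_json_string())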
class ModelConfig(AbsModelConfig):
"""Configuration class to store the configuration of a :class:`~DeBERTa.deberta.DeBERTa` model.
Attributes:
hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`.
num_hidden_layers (int): Number of hidden layers in the Transformer encoder, default: `12`.
num_attention_heads (int): Number of attention heads for each attention layer in
the Transformer encoder, default: `12`.
intermediate_size (int): The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder, default: `3072`.
hidden_act (str): The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported, default: `gelu`.
hidden_dropout_prob (float): The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler, default: `0.1`.
attention_probs_dropout_prob (float): The dropout ratio for the attention
probabilities, default: `0.1`.
max_position_embeddings (int): The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048), default: `512`.
type_vocab_size (int): The vocabulary size of the `token_type_ids` passed into
the `DeBERTa` model, default: `0`.
initializer_range (float): The stdev of the normal initializer for
initializing all weight matrices, default: `0.02`.
relative_attention (:obj:`bool`): Whether to use relative position encoding, default: `False`.
max_relative_positions (int): The range of relative positions [`-max_position_embeddings`, `max_position_embeddings`], default: `-1`, i.e. use the same value as `max_position_embeddings`.
padding_idx (int): The value used to pad `input_ids`, default: `0`.
position_biased_input (:obj:`bool`): Whether to add absolute position embeddings to the content embeddings, default: `True`.
pos_att_type (:obj:`str`): The type of relative position attention, a `|`-separated combination of `p2c`, `c2p` and `p2p`, e.g. "p2c", "p2c|c2p", "p2c|c2p|p2p", default: `None`.
"""
def __init__(self):
"""Constructs ModelConfig.
"""
self.hidden_size = 768
self.num_hidden_layers = 12
self.num_attention_heads = 12
self.hidden_act = "gelu"
self.intermediate_size = 3072
self.hidden_dropout_prob = 0.1
self.attention_probs_dropout_prob = 0.1
self.max_position_embeddings = 512
self.type_vocab_size = 0
self.initializer_range = 0.02
self.layer_norm_eps = 1e-7
self.padding_idx = 0
self.vocab_size = -1
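# Illustrative usage sketch (not part of the original module): it builds a
# ModelConfig, overrides a few defaults, and round-trips it through a JSON
# file with the helpers inherited from AbsModelConfig. The helper name, the
# file name and the overridden values are assumptions for demonstration only.
def _example_model_config():
    config = ModelConfig()
    config.vocab_size = 128100          # hypothetical vocabulary size
    config.max_position_embeddings = 1024
    with open("model_config.json", "w", encoding="utf-8") as writer:
        writer.write(config.to_json_string())
    reloaded = ModelConfig.from_json_file("model_config.json")
    assert reloaded.vocab_size == 128100
    assert reloaded.max_position_embeddings == 1024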