import json
import copy

from transformers.configuration_utils import PretrainedConfig

__all__ = ['AbsModelConfig', 'ModelConfig']


class DebertaConfig(PretrainedConfig):
    model_type = 'deberta-v2'
    
    def __init__(self,
                 vocab_size=22669,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 relax_projection=0,
                 new_pos_ids=False,
                 initializer_range=0.02,
                 task_idx=None,
                 fp32_embedding=False,
                 ffn_type=0,
                 label_smoothing=None,
                 num_qkv=0,
                 seg_emb=False,
                 **kwargs):
        """Constructs BertConfig.

        Args:
            vocab_size: Vocabulary size of `input_ids`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                the model.
            initializer_range: The standard deviation of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.relax_projection = relax_projection
        self.new_pos_ids = new_pos_ids
        self.initializer_range = initializer_range
        self.task_idx = task_idx
        self.fp32_embedding = fp32_embedding
        self.ffn_type = ffn_type
        self.label_smoothing = label_smoothing
        self.num_qkv = num_qkv
        self.seg_emb = seg_emb
        super().__init__(**kwargs)

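# Example (illustrative sketch, not part of the original module): constructing a
# DebertaConfig and round-tripping it through the serialization helpers inherited
# from transformers.PretrainedConfig (to_dict / to_json_string / from_dict). The
# values shown simply repeat the constructor defaults.
#
#     config = DebertaConfig(vocab_size=22669, hidden_size=768)
#     config_dict = config.to_dict()          # plain Python dict of all fields
#     json_text = config.to_json_string()     # JSON string, suitable for saving
#     restored = DebertaConfig.from_dict(config_dict)
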
class AbsModelConfig(object):
    def __init__(self):
        pass

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `ModelConfig` from a Python dictionary of parameters."""
        config = cls()
        for key, value in json_object.items():
            if isinstance(value, dict):
                value = AbsModelConfig.from_dict(value)
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `ModelConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        def _json_default(obj):
            if isinstance(obj, AbsModelConfig):
                return obj.__dict__
        return json.dumps(self.__dict__, indent=2, sort_keys=True, default=_json_default) + "\n"
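
# Example (illustrative sketch): AbsModelConfig.from_dict wraps nested dictionaries
# in AbsModelConfig instances, so nested sections are reachable via attribute access,
# and to_json_string serializes them back through _json_default.
#
#     cfg = AbsModelConfig.from_dict({"optimizer": {"lr": 1e-4}, "seed": 42})
#     cfg.optimizer.lr             # -> 0.0001
#     print(cfg.to_json_string())  # nested sections rendered as JSON objects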

class ModelConfig(AbsModelConfig):
    """Configuration class to store the configuration of a :class:`~DeBERTa.deberta.DeBERTa` model.

        Attributes:
            hidden_size (int): Size of the encoder layers and the pooler layer, default: `768`.
            num_hidden_layers (int): Number of hidden layers in the Transformer encoder, default: `12`.
            num_attention_heads (int): Number of attention heads for each attention layer in
                the Transformer encoder, default: `12`.
            intermediate_size (int): The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder, default: `3072`.
            hidden_act (str): The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported, default: `gelu`.
            hidden_dropout_prob (float): The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler, default: `0.1`.
            attention_probs_dropout_prob (float): The dropout ratio for the attention
                probabilities, default: `0.1`.
            max_position_embeddings (int): The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048), default: `512`.
            type_vocab_size (int): The vocabulary size of the `token_type_ids` passed into
                the `DeBERTa` model, default: `0`.
            initializer_range (float): The standard deviation of the normal initializer for
                initializing all weight matrices, default: `0.02`.
            relative_attention (:obj:`bool`): Whether to use relative position encoding, default: `False`.
            max_relative_positions (int): The range of relative positions, [`-max_position_embeddings`, `max_position_embeddings`]; default: `-1`, in which case the value of `max_position_embeddings` is used.
            padding_idx (int): The value used to pad input_ids, default: `0`.
            position_biased_input (:obj:`bool`): Whether to add absolute position embeddings to the content embeddings, default: `True`.
            pos_att_type (:obj:`str`): The type of relative position attention. It can be a combination of [`p2c`, `c2p`, `p2p`], e.g. "p2c", "p2c|c2p", "p2c|c2p|p2p"; default: `None`.
    """
    def __init__(self):
        """Constructs ModelConfig.

        """
        
        self.hidden_size = 768
        self.num_hidden_layers = 12
        self.num_attention_heads = 12
        self.hidden_act = "gelu"
        self.intermediate_size = 3072
        self.hidden_dropout_prob = 0.1
        self.attention_probs_dropout_prob = 0.1
        self.max_position_embeddings = 512
        self.type_vocab_size = 0
        self.initializer_range = 0.02
        self.layer_norm_eps = 1e-7
        self.padding_idx = 0
        self.vocab_size = -1
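
# Example (illustrative sketch): loading a ModelConfig from a JSON file via the
# inherited AbsModelConfig.from_json_file; "model_config.json" is a hypothetical
# path used only to show the call pattern.
#
#     config = ModelConfig.from_json_file("model_config.json")
#     print(config.hidden_size, config.num_attention_heads)  # defaults unless overridden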