robinzixuan committed
Commit e20d3f0 · verified · 1 Parent(s): 349d29a

Upload configuration_opt.py

Files changed (1)
  1. configuration_opt.py +145 -0
configuration_opt.py ADDED
@@ -0,0 +1,145 @@
+ # coding=utf-8
+ # Copyright 2022 The Metaseq Authors and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """OPT model configuration"""
+
+ from ...configuration_utils import PretrainedConfig
+ from ...utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class OPTConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate an OPT model
+     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the OPT
+     [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 50272):
+             Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the
+             `input_ids` passed when calling [`OPTModel`].
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the layers.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of decoder layers.
+         ffn_dim (`int`, *optional*, defaults to 3072):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+             The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         do_layer_norm_before (`bool`, *optional*, defaults to `True`):
+             Whether to perform layer normalization before the attention block.
+         word_embed_proj_dim (`int`, *optional*):
+             `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-350m`. Defaults to
+             `hidden_size`.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings and decoder.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+             details.
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         enable_bias (`bool`, *optional*, defaults to `True`):
+             Whether or not the linear layers in the attention blocks should use the bias term.
+         layer_norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+             Whether or not the layer norms should have learnable parameters.
+
+     Example:
+
+     ```python
+     >>> from transformers import OPTConfig, OPTModel
+
+     >>> # Initializing an OPT facebook/opt-large style configuration
+     >>> configuration = OPTConfig()
+
+     >>> # Initializing a model (with random weights) from the facebook/opt-large style configuration
+     >>> model = OPTModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "opt"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=50272,
+         hidden_size=768,
+         num_hidden_layers=12,
+         ffn_dim=3072,
+         max_position_embeddings=2048,
+         do_layer_norm_before=True,
+         _remove_final_layer_norm=False,
+         word_embed_proj_dim=None,
+         dropout=0.1,
+         attention_dropout=0.0,
+         num_attention_heads=12,
+         activation_function="relu",
+         layerdrop=0.0,
+         init_std=0.02,
+         use_cache=True,
+         pad_token_id=1,
+         bos_token_id=2,
+         eos_token_id=2,
+         enable_bias=True,
+         layer_norm_elementwise_affine=True,
+         attn_implementation="eager",
+         **kwargs,
+     ):
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_attention_heads = num_attention_heads
+         self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size
+         self.ffn_dim = ffn_dim
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.dropout = dropout
+         self.attention_dropout = attention_dropout
+         self.activation_function = activation_function
+         self.init_std = init_std
+         self.layerdrop = layerdrop
+         self.use_cache = use_cache
+         self.do_layer_norm_before = do_layer_norm_before
+         # We keep these variables at `True` for backward compatibility.
+         self.enable_bias = enable_bias
+         self.layer_norm_elementwise_affine = layer_norm_elementwise_affine
+
+         # Note that the only purpose of `_remove_final_layer_norm` is to keep backward compatibility
+         # with checkpoints that have been fine-tuned before transformers v4.20.1
+         # see https://github.com/facebookresearch/metaseq/pull/164
+         self._remove_final_layer_norm = _remove_final_layer_norm
+         self.attn_implementation = attn_implementation
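
For orientation, a minimal usage sketch of the class added above follows. This is an illustration rather than part of the commit: it assumes the relative imports at the top of `configuration_opt.py` resolve (for example, the file sits inside a `transformers`-style package, or the imports are pointed at the installed library), and the argument values are opt-350m-style settings chosen only to exercise the documented parameters.

```python
# Minimal sketch, assuming `configuration_opt.py` is importable as-is (see note above).
from configuration_opt import OPTConfig

# Illustrative opt-350m-style settings: the word embeddings are down-projected to 512
# via `word_embed_proj_dim`, as the docstring describes.
config = OPTConfig(
    hidden_size=1024,
    num_hidden_layers=24,
    ffn_dim=4096,
    num_attention_heads=16,
    word_embed_proj_dim=512,
    do_layer_norm_before=False,
    attn_implementation="eager",  # default of this class; stored as `config.attn_implementation`
)

print(config.word_embed_proj_dim)  # 512
print(config.attn_implementation)  # eager

# `OPTConfig` inherits `save_pretrained` from `PretrainedConfig`, so the configuration
# can be written out as a config.json for a model repository.
config.save_pretrained("./opt-custom-config")
```

The token ids left at their defaults here (`pad_token_id=1`, `bos_token_id=2`, `eos_token_id=2`) are forwarded to `PretrainedConfig.__init__`, and any extra keyword arguments pass through `**kwargs` the same way.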