pretrain model
Browse files
scripts/pretrain-model.yaml
CHANGED
@@ -72,13 +72,15 @@ train:
|
|
72 |
micro_batch_size: 16
|
73 |
|
74 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
75 |
-
lr_warmup_steps: 2000
|
|
|
76 |
|
77 |
# Number of epochs to train on (type: Optional[int], default: null)
|
78 |
epochs:
|
79 |
|
80 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
81 |
-
max_tokens: 21260243688 # 5187956 * 2049 * 2
|
|
|
82 |
|
83 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
84 |
max_steps:
|
@@ -121,15 +123,15 @@ optimizer:
|
|
121 |
|
122 |
init_args:
|
123 |
# Learning rate (type: float, default: 0.001)
|
124 |
-
lr: 1e-
|
125 |
|
126 |
# Weight decay coefficient (type: float, default: 0.01)
|
127 |
-
weight_decay:
|
128 |
|
129 |
# Optimizer beta coefficients (type: tuple, default: (0.9, 0.999))
|
130 |
betas:
|
131 |
- 0.9
|
132 |
-
- 0.
|
133 |
|
134 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
135 |
devices: auto
|
@@ -145,4 +147,4 @@ tokenizer_dir: "../"
|
|
145 |
logger_name: "wandb"
|
146 |
|
147 |
# The random seed to use for reproducibility. (type: int, default: 42)
|
148 |
-
seed:
|
|
|
72 |
micro_batch_size: 16
|
73 |
|
74 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
75 |
+
# lr_warmup_steps: 2000  # previous value, kept for reference
|
76 |
+
lr_warmup_steps: 10
|
77 |
|
78 |
# Number of epochs to train on (type: Optional[int], default: null)
|
79 |
epochs:
|
80 |
|
81 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
82 |
+
# max_tokens: 21260243688  # 5187956 * 2049 * 2 (previous value, kept for reference)
|
83 |
+
max_tokens: 10630121844 # 5187956 * 2049 * 1
|
84 |
|
85 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
86 |
max_steps:
|
|
|
123 |
|
124 |
init_args:
|
125 |
# Learning rate (type: float, default: 0.001)
|
126 |
+
lr: 1e-3
|
127 |
|
128 |
# Weight decay coefficient (type: float, default: 0.01)
|
129 |
+
weight_decay: 1e-2
|
130 |
|
131 |
# Optimizer beta coefficients (type: tuple, default: (0.9, 0.999))
|
132 |
betas:
|
133 |
- 0.9
|
134 |
+
- 0.999
|
135 |
|
136 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
137 |
devices: auto
|
|
|
147 |
logger_name: "wandb"
|
148 |
|
149 |
# The random seed to use for reproducibility. (type: int, default: 42)
|
150 |
+
seed: 23
|