new tokens
Browse files
scripts/pretrain-core-model.yaml
CHANGED
@@ -18,7 +18,7 @@ model_config:
   bias: False
   norm_class_name: "RMSNorm"
   mlp_class_name: "LLaMAMLP"
-  intermediate_size:
+  intermediate_size: 2688 # n_embd * 5.25
   norm_eps: 1e-5
   rope_base: 500000
   rope_adjustments:
@@ -67,8 +67,8 @@ train:
   # global_batch_size: 256

   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 4
-
+  # micro_batch_size: 4
+  micro_batch_size: 2
   # micro_batch_size: 1

   # Number of iterations with learning rate warmup active (type: int, default: 2000)
@@ -78,7 +78,7 @@ train:
   epochs:

   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
-  max_tokens:
+  max_tokens: 6428475392

   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps: