new tokens
Browse files
scripts/pretrain-core-model.yaml
CHANGED
@@ -18,7 +18,7 @@ model_config:
   bias: False
   norm_class_name: "RMSNorm"
   mlp_class_name: "LLaMAMLP"
-  intermediate_size:
+  intermediate_size: 2688 # n_embd * 5.25
   norm_eps: 1e-5
   rope_base: 500000
   rope_adjustments:
@@ -67,8 +67,8 @@ train:
   # global_batch_size: 256

   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 4
-
+  # micro_batch_size: 4
+  micro_batch_size: 2
   # micro_batch_size: 1

   # Number of iterations with learning rate warmup active (type: int, default: 2000)
@@ -78,7 +78,7 @@ train:
   epochs:

   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
-  max_tokens:
+  max_tokens: 6428475392

   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps: