binoy370sk committed
Commit 4ab4390 · verified · 1 parent: 295682f

Upload 3 files

configs/31M-pythia-residual-allmods.yml ADDED
@@ -0,0 +1,121 @@
+{
+  # "data_path": "/mnt/data/Tejas/GPT-NEOX/custom/BIN/data_text_document",
+  "train_data_paths": ["train_test_validate_for_neox/neox_train/neox_train_text_document"],
+  "valid_data_paths": ["train_test_validate_for_neox/neox_val/neox_val_text_document"],
+  "test_data_paths": ["train_test_validate_for_neox/neox_test/neox_test_text_document"],
+
+  "vocab_file": "train_test_validate_for_neox/slimpajama_val_test_trained_bpe_tok.json",
+  "tokenizer_type": "HFTokenizer",
+
+  "checkpoint_validation_with_forward_pass": False,
+
+
+  "save": "checkpoints/31M_pythia_residual_allmods",
+  "load": "checkpoints/31M_pythia_residual_allmods",
+  "tensorboard_dir": "tensorboard/31M_pythia_residual_allmods",
+  "log_dir": "logs/31M_pythia_residual_allmods",
+  # mods
+  "dual_residual": True,
+  # "precision": "bfloat16",
+  "activation": "swiglu",
+  "num_kv_heads": 2,
+  "norm": "crmsnorm",
+  "crms_norm_epsilon": 1.0e-8,
+
+  "use_wandb": False,
+
+  # parallelism settings
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 2,
+
+  # model settings
+  "num_layers": 6,
+  "hidden_size": 256,
+  "num_attention_heads": 8,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,
+  "no_weight_tying": true,
+  # "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+
+  "attention_config": [[["flash"], 6]],
+
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0001,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.00001,
+
+  "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  # batch size (trained on 32 gpus)
+  "train_micro_batch_size_per_gpu": 8,
+  "data_impl": "mmap",
+  "num_workers": 1,
+
+  # activation checkpointing
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+
+
+  # precision settings
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+
+  # misc. training settings
+  "train_iters": 100000,
+  "lr_decay_iters": 100000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 5000,
+  "eval_interval": 5000,
+  "eval_iters": 10,
+  "do_test": true,
+  "extra_save_iters": [10,100,500,1000],
+
+  # logging
+  "log_interval": 100,
+  "steps_per_print": 10,
+  "keep_last_n_checkpoints": 10,
+  "wall_clock_breakdown": true,
+
+
+}
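Note: the "mods" block in this config layers several architectural changes onto the Pythia baseline: grouped-query attention (num_kv_heads: 2 against num_attention_heads: 8), a SwiGLU activation, CRMSNorm in place of LayerNorm, and a dual residual stream. A config like this is typically passed to the GPT-NeoX launcher, e.g. python ./deepy.py train.py configs/31M-pythia-residual-allmods.yml. The sketch below illustrates only the grouped-query head layout implied by these numbers; it is a standalone toy (causal mask and rotary embeddings omitted), not GPT-NeoX's fused implementation.

import torch

# Head layout from the config: hidden_size=256, 8 query heads, 2 KV heads,
# so each KV head is shared by 8 // 2 = 4 query heads.
hidden, n_q, n_kv = 256, 8, 2
head_dim = hidden // n_q            # 32
B, T = 1, 16                        # toy batch and sequence length

q = torch.randn(B, T, n_q, head_dim)
k = torch.randn(B, T, n_kv, head_dim)
v = torch.randn(B, T, n_kv, head_dim)

# Expand the 2 KV heads so each of the 8 query heads has a partner.
k = k.repeat_interleave(n_q // n_kv, dim=2)
v = v.repeat_interleave(n_q // n_kv, dim=2)

# Scaled dot-product attention per head (no causal mask in this toy).
attn = torch.einsum("bqhd,bkhd->bhqk", q, k) / head_dim**0.5
out = torch.einsum("bhqk,bkhd->bqhd", attn.softmax(-1), v)
print(out.shape)  # torch.Size([1, 16, 8, 32])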
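The schedule keys (lr: 0.0001, min_lr: 0.00001, lr_decay_style: "cosine", warmup: 0.01, lr_decay_iters: 100000) together define cosine decay with a short linear warmup. A minimal sketch of that curve, assuming warmup is a fraction of lr_decay_iters (GPT-NeoX's scheduler has more options than shown here):

import math

def lr_at(step, max_lr=1e-4, min_lr=1e-5, warmup_frac=0.01, decay_iters=100_000):
    # Linear warmup for the first warmup_frac * decay_iters steps,
    # then cosine decay from max_lr down to the min_lr floor.
    warmup_iters = int(warmup_frac * decay_iters)  # 1,000 steps here
    if step < warmup_iters:
        return max_lr * step / warmup_iters
    progress = min((step - warmup_iters) / (decay_iters - warmup_iters), 1.0)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))

print(lr_at(0), lr_at(1_000), lr_at(100_000))  # 0.0 0.0001 1e-05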
mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95b0772c9a3a59939c333ed1757a4db9bfb23ffd104e9656aeb341eb8d4eb3e0
+size 138216741
mp_rank_01_model_states.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87ab5ef624d7de495abc582e55e2e36e605b58acd885a222d61ab9db8907c93c
+size 138216741
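Both .pt files are Git LFS pointers rather than the weights themselves: each records the sha256 and size (about 138 MB) of a model-parallel checkpoint shard, with ranks 00 and 01 matching the config's model_parallel_size: 2. After fetching the real files (git lfs pull), a download can be checked against the recorded oid; a minimal sketch, assuming the pointer has already been replaced by the actual shard on disk:

import hashlib

# Hash the fetched shard in 1 MiB chunks and compare against the oid
# recorded in the LFS pointer above.
expected = "95b0772c9a3a59939c333ed1757a4db9bfb23ffd104e9656aeb341eb8d4eb3e0"
h = hashlib.sha256()
with open("mp_rank_00_model_states.pt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "checksum mismatch"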