# tests/config/test_setup.yml
# 19M parameter model & local setup, with some additional simplifications
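# A sketch of the usual single-node launch from the repo root (entry-point
# names assume a standard gpt-neox checkout):
#   python ./deepy.py train.py tests/config/test_setup.yml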
{
# Settings to make the test setup as lightweight as possible
"data_path": "data/enwik8/enwik8_text_document",
"vocab_file": "data/gpt2-vocab.json",
"merge_file": "data/gpt2-merges.txt",
"lr_decay_iters": 20,
"train_iters": 20,
"hostfile": "None",
"include": "localhost:1",
"use_wandb": False,
# Settings copied from the 19M parameter config (some are modified above, so configs/19M.yml can't be used directly)
"pipe_parallel_size": 1,
"model_parallel_size": 1,
# model settings
"num_layers": 2,
"hidden_size": 8,
"num_attention_heads": 4,
"seq_length": 1024,
"max_position_embeddings": 1024,
"pos_emb": "rotary",
"no_weight_tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,
# Optimizer
"optimizer": {
"type": "sm3",
"params": {},
},
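# SM3 (Anil et al., 2019) is a memory-efficient adaptive optimizer; with
# "params" left empty, the library's defaults are assumed to apply.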
# precision
"precision": "fp16",
# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
"train_micro_batch_size_per_gpu": 4,
"gradient_accumulation_steps": 1,
"data_impl": "mmap",
"num_workers": 1,
# activation checkpointing
"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,
# regularization
"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 1000,
"eval_interval": 100000,
"eval_iters": 10,
"log_interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,
# additional deepspeed args not specified above
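# (comms_logger is forwarded verbatim to DeepSpeed and enables its
# communication-op logging; "prof_all" profiles every collective.)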
"deepspeed_extra_args": {
"comms_logger": {
"enabled": true,
"verbose": true,
"prof_all": true,
"debug": false
},
}
}