# DPO fine-tuning configuration (train_impl "dpo") for the llama3-8b-instruct
# checkpoint referenced under "load" below.
# One entry per line: a `#` comment runs to the end of its line, so entries
# must not share a line with a preceding comment.
{
  # parallelism
  "pipe_parallel_size": 0,
  "model_parallel_size": 4,
  "make_vocab_size_divisible_by": 1,

  # model settings
  "num_layers": 32,
  "hidden_size": 4096,
  "num_attention_heads": 32,
  "num_kv_heads": 8,
  # llama3 supports more than this but this is just for testing.
  "seq_length": 1024,
  "max_position_embeddings": 1024,
  "pos_emb": "rotary",
  "rotary_pct": 1,
  "rotary_emb_base": 500000,
  "rope_fusion": true,
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,

  # attention: a single "flash" entry paired with 32, matching num_layers
  "attention_config": [[["flash"], 32]],

  # kernel fusion and bias switches
  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,
  "use_bias_in_mlp": false,

  # MLP
  "use_flashattn_swiglu": true,
  "activation": "swiglu",
  "intermediate_size": 14336,
  "mlp_multiple_of": 14336,

  # optimizer
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 5.0e-7,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },
  "min_lr": 0.0,

  # ZeRO optimizer settings
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 1260000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1260000000,
    "contiguous_gradients": true,
    "cpu_offload": false
  },

  # DPO objective and pairwise dataset
  "train_impl": "dpo",
  "dataset_impl": "pairwise",
  "dpo_fp32": true,
  "dpo_beta": 0.01,
  "allow_chopped": false,

  # training split: chosen ("pos") / rejected ("neg") documents and their label files
  "pos_train_data_paths": [
    "data/pairwise/llama3_dpo_train_chosen_document"
  ],
  "pos_train_label_data_paths": [
    "data/pairwise/llama3_dpo_train_chosen_label_document"
  ],
  "neg_train_data_paths": [
    "data/pairwise/llama3_dpo_train_rejected_document"
  ],
  "neg_train_label_data_paths": [
    "data/pairwise/llama3_dpo_train_rejected_label_document"
  ],

  # validation split
  "pos_valid_data_paths": [
    "data/pairwise/llama3_dpo_val_chosen_document"
  ],
  "pos_valid_label_data_paths": [
    "data/pairwise/llama3_dpo_val_chosen_label_document"
  ],
  "neg_valid_data_paths": [
    "data/pairwise/llama3_dpo_val_rejected_document"
  ],
  "neg_valid_label_data_paths": [
    "data/pairwise/llama3_dpo_val_rejected_label_document"
  ],

  # test split: points at the same files as the validation split
  "pos_test_data_paths": [
    "data/pairwise/llama3_dpo_val_chosen_document"
  ],
  "pos_test_label_data_paths": [
    "data/pairwise/llama3_dpo_val_chosen_label_document"
  ],
  "neg_test_data_paths": [
    "data/pairwise/llama3_dpo_val_rejected_document"
  ],
  "neg_test_label_data_paths": [
    "data/pairwise/llama3_dpo_val_rejected_label_document"
  ],

  # batching and data loading
  "train_micro_batch_size_per_gpu": 32,
  "gradient_accumulation_steps": 2,
  "data_impl": "mmap",
  "pack_impl": "unpacked",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  # precision
  "precision": "bfloat16",
  "fp32_allreduce": true,
  "bf16": {
    "enabled": true
  },
  "data_types": {
    "grad_accum_dtype": "fp32"
  },

  # schedule
  "train_iters": 477,
  "lr_decay_iters": 477,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.1,
  # NOTE(review): checkpoint_factor (1000) is larger than train_iters (477);
  # confirm a checkpoint is still written at the end of the run.
  "checkpoint_factor": 1000,
  "eval_interval": 100,
  "eval_iters": 10,

  # logging
  "log_interval": 1,
  "steps_per_print": 1,
  "wall_clock_breakdown": true,

  # checkpoint locations
  "save": "checkpoints/dpo/llama3/llama3-8b-instruct",
  #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save"
  # NOTE(review): presumably the converted base checkpoint used to start the run; verify the path.
  "load": "checkpoints/neox_converted/llama3-8b-instruct",
  "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json",

  # experiment tracking
  "use_wandb": true,
  "wandb_group": "llama3-8b-instruct",
  "wandb_project": "ultrafeedback-dpo",

  "finetune": true, # set to false once resuming from intermediate finetuning step
  "tokenizer_type": "HFTokenizer",
}