# Configuration mapping written as a YAML flow mapping (JSON-style braces,
# quoted keys). Keep one key per line: a `#` comment runs to the end of its
# physical line, so placing further keys after a comment on the same line
# silently removes them (and the closing brace) from the document.
{
  # parallelism settings
  "pipe_parallel_size": 1,
  "model_parallel_size": 2,
  "make_vocab_size_divisible_by": 1,

  # model settings
  "num_layers": 40,
  "hidden_size": 5120,
  "num_attention_heads": 40,
  "seq_length": 4096,
  "max_position_embeddings": 4096,
  "pos_emb": "rotary",
  "rotary_pct": 1,
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,

  # fusion and bias switches
  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,

  # MLP settings
  "activation": "swiglu",
  "mlp_multiple_of": 256
}