|
{
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,
  "make_vocab_size_divisible_by": 1,

  "num_layers": 48,
  "hidden_size": 8192,
  "num_attention_heads": 64,
  "num_kv_heads": 8,

  "seq_length": 16384,
  "max_position_embeddings": 16384,
  "pos_emb": "rotary",
  "rotary_pct": 1,
  "rotary_emb_base": 1000000,
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-5,

  "attention_config": [[["flash"], 48]],

  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
  "use_bias_in_norms": false,
  "use_bias_in_attn_linear": false,
  "activation": "swiglu",
  "mlp_multiple_of": 256
}
|
|