{ "architectures": [ "HymbaForCausalLM" ], "attention_dropout": 0.0, "attn_hidden_size": -1, "attn_implementation": "flex", "attn_implementation_new": "flex", "auto_map": { "AutoConfig": "configuration_hymba.HymbaConfig", "AutoModelForCausalLM": "modeling_hymba.HymbaForCausalLM" }, "bos_token_id": 1, "calc_logits_for_entire_prompt": false, "conv_dim": { "0": 3200, "1": 3200, "2": 3200, "3": 3200, "4": 3200, "5": 3200, "6": 3200, "7": 3200, "8": 3200, "9": 3200, "10": 3200, "11": 3200, "12": 3200, "13": 3200, "14": 3200, "15": 3200, "16": 3200, "17": 3200, "18": 3200, "19": 3200, "20": 3200, "21": 3200, "22": 3200, "23": 3200, "24": 3200, "25": 3200, "26": 3200, "27": 3200, "28": 3200, "29": 3200, "30": 3200, "31": 3200 }, "eos_token_id": 2, "global_attn_idx": [ 0, 15, 31 ], "hidden_act": "silu", "hidden_size": 1600, "initializer_range": 0.02, "intermediate_size": 5504, "kq_head_dim": -1, "kq_norm": "none", "kv_reuse_every_i_layer": -1, "kv_reuse_group": [ [ 1, 2 ], [ 3, 4 ], [ 5, 6 ], [ 7, 8 ], [ 9, 10 ], [ 11, 12 ], [ 13, 14 ], [ 16, 17, 18 ], [ 19, 20 ], [ 21, 22 ], [ 23, 24 ], [ 25, 26 ], [ 27, 28 ], [ 29, 30 ] ], "kv_weight_reuse": false, "layer_type": [ "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h" ], "mamba_conv_bias": true, "mamba_d_conv": 4, "mamba_d_state": 16, "mamba_dt_rank": 100, "mamba_expand": 2, "mamba_inner_layernorms": true, "mamba_proj_bias": false, "max_position_embeddings": 8192, "memory_tokens_interspersed_every": 0, "mlp_hidden_act": "silu", "model_type": "hymba", "num_attention_heads": 25, "num_experts": 1, "num_experts_per_tok": 1, "num_hidden_layers": 32, "num_key_value_heads": 5, "num_mamba": 1, "num_memory_tokens": 128, "orig_max_position_embeddings": 2048, "output_router_logits": false, "pad_token_id": 0, "rms_norm_eps": 1e-06, "rope": true, "rope_theta": 10000.0, "rope_type": "ntk", "router_aux_loss_coef": 0.001, "seq_length": 8192, "sliding_window": 1024, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.44.0", "use_cache": false, "use_mamba_kernels": true, "v_head_dim": 128, "vocab_size": 32001 }