{ "architectures": [ "DistilxLSTM" ], "model_type": "xlstm", "pad_token_id": 151643, "torch_dtype": "float32", "transformers_version": "4.46.3", "xlstm_cfg": { "_block_map": "1,0,1,0,1,0", "add_embedding_dropout": false, "add_post_blocks_norm": true, "bias": false, "context_length": 512, "dropout": 0.0, "embedding_dim": 1536, "mlstm_block": { "_block_idx": null, "_num_blocks": 6, "mlstm": { "_inner_embedding_dim": 3072, "_num_blocks": 6, "_proj_up_dim": 3072, "bias": false, "context_length": 512, "conv1d_kernel_size": 4, "dropout": 0.0, "embedding_dim": 1536, "num_heads": 16, "proj_factor": 2.0, "qkv_proj_blocksize": 32, "round_proj_up_dim_up": true, "round_proj_up_to_multiple_of": 64 } }, "num_blocks": 6, "slstm_at": [ 0, 2, 4 ], "slstm_block": { "_block_idx": null, "_num_blocks": 6, "feedforward": { "_num_blocks": 1, "_proj_up_dim": 0, "act_fn": "gelu", "bias": false, "dropout": 0.0, "embedding_dim": -1, "ff_type": "ffn_gated", "proj_factor": 1.7, "round_proj_up_dim_up": true, "round_proj_up_to_multiple_of": 64 }, "slstm": { "_block_idx": null, "_num_blocks": 6, "backend": "cuda", "batch_size": 8, "bias_init": "powerlaw_blockdependent", "constants": {}, "conv1d_kernel_size": 4, "dropout": 0.0, "dtype": "bfloat16", "dtype_a": "float32", "dtype_b": "float32", "dtype_g": "bfloat16", "dtype_r": "bfloat16", "dtype_s": "bfloat16", "dtype_w": "bfloat16", "embedding_dim": 1536, "enable_automatic_mixed_precision": true, "forward_clipval": null, "function": "slstm", "gradient_recurrent_clipval": null, "gradient_recurrent_cut": false, "group_norm_weight": true, "hidden_size": 1536, "initial_val": 0.0, "input_shape": "BSGNH", "internal_input_shape": "SBNGH", "num_gates": 4, "num_heads": 16, "num_states": 4, "output_shape": "BNSH", "recurrent_weight_init": "zeros" } }, "tie_weights": false, "vocab_size": 151936, "weight_decay_on_embedding": false } }