{ "_name_or_path": "/fs/archive/share/yulan/data/aa_mini/output/miniyulan-2B-final-stage19-hyw-2/checkpoint-184795-rms_norm", "architectures": [ "MiniYuLanModelForCausalLM" ], "attention_bias": true, "attention_dropout": 0.0, "bos_token_id": 1, "dim_model_base": 1920, "dim_model_base_attn": 64, "dim_model_base_init": null, "dim_model_base_lmh": 1, "dim_model_base_logits": 1920.0, "dim_model_base_lr": 256.0, "down_proj_alpha": 0.03450327796711771, "embed_tokens_alpha": 1, "embedding_ln": false, "embedding_rmsln": false, "eos_token_id": 2, "gate_up_proj_alpha": 0.3651483716701107, "gradient_checkpointing_step": 11, "hidden_act": "silu", "hidden_size": 1920, "hidden_states_shrink": 0.18708286933869706, "init_scale_o": 1, "initializer_range": 5e-05, "input_layernorm_alpha": 1.0, "intermediate_size": 4800, "k_proj_alpha": 0.3651483716701107, "layer_norm_eps": 1e-06, "lm_head_alpha": 1.0, "ln_scale": 1, "max_position_embeddings": 4096, "model_reproduce": "transformer", "model_type": "miniyulan", "norm_alpha": 1.0, "num_attention_heads": 30, "num_epochs_trained_before_this_epoch": 19, "num_hidden_layers": 56, "num_key_value_heads": 6, "num_steps_trained_before_this_epoch": 184795, "o_proj_alpha": 0.03450327796711771, "post_attention_layernorm_alpha": 1.0, "q_proj_alpha": 0.3651483716701107, "qk_layernorm": false, "rms_norm_eps": 1e-06, "rms_type": "llama", "rope_scaling": null, "rope_theta": 10000.0, "scale_emb": 10.0, "shrink_alpha": 1, "sliding_window": null, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.44.0", "use_cache": false, "use_emb_alpha": true, "use_liger": true, "use_norm_alpha": true, "use_sliding_window": false, "v_proj_alpha": 0.3651483716701107, "vocab_size": 99000, "wesar_weights": true, "z_loss": 0.0001 }