{ "architectures": [ "MultiModalityCausalLM" ], "language_config": { "hidden_size": 2048, "intermediate_size": 5632, "max_position_embeddings": 16384, "model_type": "llama", "num_attention_heads": 16, "num_hidden_layers": 24, "num_key_value_heads": 16, "torch_dtype": "bfloat16", "vocab_size": 102400 }, "model_type": "multi_modality", "torch_dtype": "bfloat16", "transformers_version": "4.38.1", "vision_gen_dec_config": { "cls": "ShallowUViTDecoder", "model_type": "vision_gen_dec", "params": { "block_out_channels": [ 768 ], "elementwise_affine": true, "hidden_size": 2048, "in_channels": 768, "layers_in_middle": 2, "norm_eps": 1e-06, "out_channels": 4, "upsamples": 1, "use_bias": true, "use_mid_block": true } }, "vision_gen_enc_config": { "cls": "ShallowUViTEncoder", "model_type": "vision_gen_enc", "params": { "block_out_channels": [ 768 ], "elementwize_affine": true, "hidden_size": 2048, "input_channels": 4, "kernel_size": 2, "layers_in_middle": 2, "norm_eps": 1e-06, "num_extra_tensors": 5, "padding": 0, "stride": 2, "use_bias": true, "use_mid_block": true } }, "vision_und_enc_config": { "cls": "CLIPVisionTower", "model_type": "vision_und_enc", "params": { "image_size": 384, "model_name": "siglip_large_patch16_384", "select_feature": "same", "select_layer": -1, "width": 1024 } } }