{
"architectures": [
"MultiModalityCausalLM"
],
"language_config": {
"hidden_size": 2048,
"intermediate_size": 5632,
"max_position_embeddings": 16384,
"model_type": "llama",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"num_key_value_heads": 16,
"torch_dtype": "bfloat16",
"vocab_size": 102400
},
"model_type": "multi_modality",
"torch_dtype": "bfloat16",
"transformers_version": "4.38.1",
"vision_gen_dec_config": {
"cls": "ShallowUViTDecoder",
"model_type": "vision_gen_dec",
"params": {
"block_out_channels": [
768
],
"elementwise_affine": true,
"hidden_size": 2048,
"in_channels": 768,
"layers_in_middle": 2,
"norm_eps": 1e-06,
"out_channels": 4,
"upsamples": 1,
"use_bias": true,
"use_mid_block": true
}
},
"vision_gen_enc_config": {
"cls": "ShallowUViTEncoder",
"model_type": "vision_gen_enc",
"params": {
"block_out_channels": [
768
],
"elementwize_affine": true,
"hidden_size": 2048,
"input_channels": 4,
"kernel_size": 2,
"layers_in_middle": 2,
"norm_eps": 1e-06,
"num_extra_tensors": 5,
"padding": 0,
"stride": 2,
"use_bias": true,
"use_mid_block": true
}
},
"vision_und_enc_config": {
"cls": "CLIPVisionTower",
"model_type": "vision_und_enc",
"params": {
"image_size": 384,
"model_name": "siglip_large_patch16_384",
"select_feature": "same",
"select_layer": -1,
"width": 1024
}
}
}