snnetv2-semantic-segmentation / configs /san /san-vit-l14_pascal_context-640x640.py
HubHop
update
412c852
raw
history blame contribute delete
867 Bytes
# Variant of the ViT-B/16 Pascal Context SAN config listed in `_base_`.
# Only the settings spelled out below are declared here; everything else is
# presumably inherited from the base file by the config loader.
_base_ = ['./san-vit-b16_pascal_context-640x640.py']

model = dict(
    type='MultimodalEncoderDecoder',
    # NOTE(review): the checkpoint file name reads as ViT-Base / patch 16 /
    # 224 input, while `image_encoder` below is set up with patch_size=14,
    # embed_dims=1024 and a 336x336 input. Confirm this is the intended
    # checkpoint for this variant.
    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
    encoder_resolution=0.7,
    # Image branch: 14-pixel patches, 1024-dim tokens, 18 layers, 16 heads.
    image_encoder=dict(
        type='VisionTransformer',
        img_size=(336, 336),
        patch_size=14,
        patch_pad=0,
        embed_dims=1024,
        num_layers=18,
        num_heads=16,
        # Every sixth layer; presumably zero-based, so 17 would be the last
        # of the 18 layers -- TODO confirm against the backbone.
        out_indices=(5, 11, 17),
    ),
    # Text branch: 768-dim tokens, 12 layers, 12 heads, 768-dim output.
    text_encoder=dict(
        type='CLIPTextEncoder',
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        output_dims=768,
    ),
    decode_head=dict(
        type='SideAdapterCLIPHead',
        san_cfg=dict(
            # Same value as image_encoder.embed_dims above.
            clip_channels=1024,
            cfg_decoder=dict(
                num_heads=16,
            ),
        ),
        maskgen_cfg=dict(
            num_layers=6,
            embed_dims=1024,
            num_heads=16,
            # Same value as text_encoder.output_dims above.
            out_dims=768,
        ),
    ),
)