# SAN config that swaps in the CLIP ViT-Large/14 (336px) checkpoint.
# Everything not overridden below is inherited from the ViT-B/16 base config.
_base_ = ['./san-vit-b16_coco-stuff164k-640x640.py']

# Converted CLIP ViT-L/14-336 weights hosted by OpenMMLab.
pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/san/clip_vit-large-patch14-336_3rdparty-0b5df9cb.pth'  # noqa: E501

model = dict(
    type='MultimodalEncoderDecoder',
    pretrained=pretrained,
    encoder_resolution=0.7,
    image_encoder=dict(
        type='VisionTransformer',
        # Matches the 336px / patch-14 checkpoint named in `pretrained`.
        img_size=(336, 336),
        patch_size=14,
        patch_pad=0,
        embed_dims=1024,
        num_layers=18,
        num_heads=16,
        # Indices of the encoder layers whose outputs are exposed; the last
        # one (17) is the final layer of the 18 configured here.
        out_indices=(5, 11, 17),
    ),
    text_encoder=dict(
        type='CLIPTextEncoder',
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        output_dims=768,
    ),
    decode_head=dict(
        type='SideAdapterCLIPHead',
        # clip_channels is the same value as image_encoder.embed_dims (1024).
        san_cfg=dict(clip_channels=1024, cfg_decoder=dict(num_heads=16)),
        maskgen_cfg=dict(
            num_layers=6,
            embed_dims=1024,
            num_heads=16,
            # Same value as text_encoder.output_dims (768).
            out_dims=768,
        ),
    ),
)

# Overrides only the training dataloader's batch size.
# NOTE(review): presumably this is the per-GPU batch size, as is usual for
# MMEngine dataloaders - confirm the effective total against the GPU count.
train_dataloader = dict(batch_size=4)
|