smb-vision-base / model_architecture.txt
chenz53's picture
Upload 3 files
99aa071 verified
raw
history blame
2.9 kB
VideoMAEForPreTraining(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(1, 768, kernel_size=(16, 16, 16), stride=(16, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-11): 12 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): VideoMAEOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (encoder_to_decoder): Linear(in_features=768, out_features=384, bias=False)
  (decoder): VideoMAEDecoder(
    (decoder_layers): ModuleList(
      (0-3): 4 x VideoMAELayer(
        (attention): VideoMAESdpaAttention(
          (attention): VideoMAESdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=False)
            (key): Linear(in_features=384, out_features=384, bias=False)
            (value): Linear(in_features=384, out_features=384, bias=False)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): VideoMAESelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): VideoMAEIntermediate(
          (dense): Linear(in_features=384, out_features=1536, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): VideoMAEOutput(
          (dense): Linear(in_features=1536, out_features=384, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layernorm_before): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (layernorm_after): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      )
    )
    (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    (head): Linear(in_features=384, out_features=4096, bias=True)
  )
)