|
VideoMAEForPreTraining( |
|
(videomae): VideoMAEModel( |
|
(embeddings): VideoMAEEmbeddings( |
|
(patch_embeddings): VideoMAEPatchEmbeddings( |
|
(projection): Conv3d(1, 768, kernel_size=(16, 16, 16), stride=(16, 16, 16)) |
|
) |
|
) |
|
(encoder): VideoMAEEncoder( |
|
(layer): ModuleList( |
|
(0-11): 12 x VideoMAELayer( |
|
(attention): VideoMAESdpaAttention( |
|
(attention): VideoMAESdpaSelfAttention( |
|
(query): Linear(in_features=768, out_features=768, bias=False) |
|
(key): Linear(in_features=768, out_features=768, bias=False) |
|
(value): Linear(in_features=768, out_features=768, bias=False) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
(output): VideoMAESelfOutput( |
|
(dense): Linear(in_features=768, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
) |
|
(intermediate): VideoMAEIntermediate( |
|
(dense): Linear(in_features=768, out_features=3072, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): VideoMAEOutput( |
|
(dense): Linear(in_features=3072, out_features=768, bias=True) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True) |
|
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True) |
|
) |
|
) |
|
) |
|
) |
|
(encoder_to_decoder): Linear(in_features=768, out_features=384, bias=False) |
|
(decoder): VideoMAEDecoder( |
|
(decoder_layers): ModuleList( |
|
(0-3): 4 x VideoMAELayer( |
|
(attention): VideoMAESdpaAttention( |
|
(attention): VideoMAESdpaSelfAttention( |
|
(query): Linear(in_features=384, out_features=384, bias=False) |
|
(key): Linear(in_features=384, out_features=384, bias=False) |
|
(value): Linear(in_features=384, out_features=384, bias=False) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
(output): VideoMAESelfOutput( |
|
(dense): Linear(in_features=384, out_features=384, bias=True) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
) |
|
(intermediate): VideoMAEIntermediate( |
|
(dense): Linear(in_features=384, out_features=1536, bias=True) |
|
(intermediate_act_fn): GELUActivation() |
|
) |
|
(output): VideoMAEOutput( |
|
(dense): Linear(in_features=1536, out_features=384, bias=True) |
|
(dropout): Dropout(p=0.0, inplace=False) |
|
) |
|
(layernorm_before): LayerNorm((384,), eps=1e-12, elementwise_affine=True) |
|
(layernorm_after): LayerNorm((384,), eps=1e-12, elementwise_affine=True) |
|
) |
|
) |
|
(norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True) |
|
(head): Linear(in_features=384, out_features=4096, bias=True) |
|
) |
|
) |
|
|