{
  "text_encoder": {
    "tokenizer_class": "bert",
    "model_type": "bert",
    "dim": 512,
    "context_dim": 1024,
    "vocab_size": 30522,
    "padding_idx": 0,
    "num_layers": 12,
    "num_heads": 8,
    "embedding_dim": 768,
    "multimodal_layers_ids": [
      6,
      7,
      8,
      9,
      10,
      11
    ],
    "head_one_neuron": false,
    "pooling": "cls",
    "max_position_embeddings": 64,
    "dropout_prob": 0.1
  },
  "image_encoder": {
    "normalization_means": [
      0.48145466,
      0.4578275,
      0.40821073
    ],
    "normalization_deviations": [
      0.26862954,
      0.26130258,
      0.27577711
    ],
    "dim": 1024,
    "patch_size": 14,
    "image_size": 224,
    "num_layers": 24,
    "num_heads": 16,
    "embedding_dim": 768,
    "pooling": "cls",
    "num_reg_tokens": 4
  }
}