Feature Extraction
Transformers
clip
vision
Inference Endpoints
uform-vl-english / torch_config.json
kimihailv's picture
Upload torch_config.json
51f9f5d
raw
history blame
621 Bytes
{
"text_encoder": {
"model_type": "bert",
"dim": 768,
"context_dim": 768,
"vocab_size": 30522,
"padding_idx": 0,
"num_layers": 4,
"num_heads": 12,
"embedding_dim": 256,
"multimodal_layers_ids": [2, 3],
"head_one_neuron": false,
"pooling": "cls",
"max_position_embeddings": 77,
"dropout_prob": 0.1
},
"image_encoder": {
"dim": 768,
"patch_size": 16,
"image_size": 224,
"num_layers": 12,
"num_heads": 12,
"embedding_dim": 256,
"pooling": "cls"
}
}