Feature Extraction
Transformers
clip
vision
Inference Endpoints
File size: 349 Bytes
1ec7966
1
{"img_encoder": {"backbone": "deit3_base_patch16_224_in21ft1k", "dim": 768, "backbone_type": "vit", "pooling": "cls", "output_dim": 256}, "text_encoder": {"backbone": "google/bert_uncased_L-4_H-768_A-12", "backbone_type": "bert", "unimodal_n_layers": 2, "dim": 768, "pooling": "cls", "context_dim": 768, "output_dim": 256, "head_one_neuron": false}}