In [5]:
from transformers import AutoModel, AutoImageProcessor, AutoTokenizer
import torch

# Hugging Face Hub checkpoint shared by the model, image processor and tokenizer.
MODEL_ID = "visheratin/mexma-siglip"

# Every later cell moves its inputs to `device`; weights are loaded in half precision.
device = "cuda"
dtype = torch.float16

# NOTE(security): trust_remote_code=True executes Python code shipped in the Hub
# repository -- review it (and ideally pin a revision) before running this cell.
# `optimized=True` is presumably forwarded to the repo's custom model class;
# TODO confirm what it toggles.
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    trust_remote_code=True,
    optimized=True,
)
model = model.to(device)

processor = AutoImageProcessor.from_pretrained(MODEL_ID, use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [6]:
# Dump the module tree to see which submodules the loaded model exposes
# (the recorded output starts with the `text_model` XLM-RoBERTa encoder).
print(model)

MexmaSigLIP(
  (text_model): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias

In [8]:
# Encode a single word and look at the token ids the tokenizer produced for it.
texts = ["cat"]
with torch.inference_mode():
    # Tokenize into PyTorch tensors, then move the whole batch to the model's device.
    text_tokenized = tokenizer(texts, return_tensors="pt", padding=True).to(device)
    # Text embedding as returned by the model's own encode_texts helper.
    text_embeddings = model.encode_texts(
        text_tokenized.input_ids, text_tokenized.attention_mask
    )

# Print only the token ids: the recorded output of this cell is the bare id tensor,
# whereas printing `text_tokenized` itself would also dump the attention mask.
print(text_tokenized.input_ids)

tensor([[   0, 7515,    2]], device='cuda:0')


In [10]:
# Shape of the token-id tensor (recorded as [1, 3] for the single text "cat").
print(text_tokenized.input_ids.shape)

torch.Size([1, 3])


In [11]:
# Shape of the encode_texts result (recorded as [1, 1152]: one vector for the one input text).
print(text_embeddings.shape)

torch.Size([1, 1152])


In [19]:
# Run the text encoder and the projector by hand to inspect per-token outputs
# rather than the single vector returned by encode_texts.
with torch.inference_mode():
    encoder_output = model.text_model(
        input_ids=text_tokenized.input_ids,
        attention_mask=text_tokenized.attention_mask,
    )
    # Keep every token position; indexing with [:, 0] would keep only the first token.
    features = encoder_output.last_hidden_state
    print(features.shape)

    # Apply the model's text projector to each token's hidden state.
    featuresp = model.text_projector(features)
    print(featuresp.shape)

torch.Size([1, 3, 1024])
torch.Size([1, 3, 1152])


In [23]:
# Recompute the encode_texts embedding and view it with an extra axis at position 1,
# so its rank matches the per-token projected features.
with torch.inference_mode():
    text_embeddings = model.encode_texts(
        text_tokenized.input_ids,
        text_tokenized.attention_mask,
    )
    print(text_embeddings.unsqueeze(1).shape)

torch.Size([1, 1, 1152])


In [29]:
# Repeat the comparison for a batch of two texts: encode_texts output vs. the
# per-token projected hidden states.
texts = ["cat","dog"]
with torch.inference_mode():
    text_tokenized = tokenizer(texts, return_tensors="pt", padding=True).to(device)

    # Embedding from the model's own helper.
    text_embeddings = model.encode_texts(
        text_tokenized.input_ids,
        text_tokenized.attention_mask,
    )

    # Manual path: encoder hidden states for every token, then the text projector.
    encoder_output = model.text_model(
        input_ids=text_tokenized.input_ids,
        attention_mask=text_tokenized.attention_mask,
    )
    features = encoder_output.last_hidden_state
    featuresp = model.text_projector(features)

# NOTE(review): the recorded output of this cell shows three shape lines but only
# two are printed here -- the saved output is presumably stale; re-run to refresh.
print(text_embeddings.unsqueeze(1).shape)
print(featuresp.shape)

torch.Size([2, 1, 1152])
torch.Size([2, 3, 1152])
torch.Size([2, 3, 1152])
