{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "a343465e-8ee3-4393-bfc1-1d60862035fc", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModel, AutoImageProcessor, AutoTokenizer\n", "import torch\n", "\n", "dtype = torch.float16\n", "device = \"cuda\"\n", "\n", "model = AutoModel.from_pretrained(\"visheratin/mexma-siglip\", torch_dtype=dtype, trust_remote_code=True, optimized=True).to(device)\n", "processor = AutoImageProcessor.from_pretrained(\"visheratin/mexma-siglip\",use_fast=True)\n", "tokenizer = AutoTokenizer.from_pretrained(\"visheratin/mexma-siglip\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "e74122c2-9f96-4e3a-a8ad-b22a04dc553e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MexmaSigLIP(\n", " (text_model): XLMRobertaModel(\n", " (embeddings): XLMRobertaEmbeddings(\n", " (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n", " (position_embeddings): Embedding(514, 1024, padding_idx=1)\n", " (token_type_embeddings): Embedding(1, 1024)\n", " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): XLMRobertaEncoder(\n", " (layer): ModuleList(\n", " (0-23): 24 x XLMRobertaLayer(\n", " (attention): XLMRobertaAttention(\n", " (self): XLMRobertaSdpaSelfAttention(\n", " (query): Linear(in_features=1024, out_features=1024, bias=True)\n", " (key): Linear(in_features=1024, out_features=1024, bias=True)\n", " (value): Linear(in_features=1024, out_features=1024, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): XLMRobertaSelfOutput(\n", " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): XLMRobertaIntermediate(\n", " (dense): Linear(in_features=1024, out_features=4096, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): XLMRobertaOutput(\n", " (dense): Linear(in_features=4096, out_features=1024, bias=True)\n", " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " )\n", " (text_projector): Linear(in_features=1024, out_features=1152, bias=False)\n", " (vision_model): SiglipVisionTransformer(\n", " (embeddings): SiglipVisionEmbeddings(\n", " (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)\n", " (position_embedding): Embedding(729, 1152)\n", " )\n", " (encoder): SiglipEncoder(\n", " (layers): ModuleList(\n", " (0-26): 27 x SiglipEncoderLayer(\n", " (self_attn): SiglipFlashAttention2(\n", " (k_proj): Linear(in_features=1152, out_features=1152, bias=True)\n", " (v_proj): Linear(in_features=1152, out_features=1152, bias=True)\n", " (q_proj): Linear(in_features=1152, out_features=1152, bias=True)\n", " (out_proj): Linear(in_features=1152, out_features=1152, bias=True)\n", " )\n", " (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)\n", " (mlp): SiglipMLP(\n", " (activation_fn): PytorchGELUTanh()\n", " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", " )\n", " (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)\n", " )\n", " )\n", " )\n", " (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)\n", " (head): SiglipMultiheadAttentionPoolingHead(\n", " (attention): MultiheadAttention(\n", " (out_proj): NonDynamicallyQuantizableLinear(in_features=1152, out_features=1152, bias=True)\n", " )\n", " (layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)\n", " (mlp): SiglipMLP(\n", " (activation_fn): PytorchGELUTanh()\n", " (fc1): Linear(in_features=1152, out_features=4304, bias=True)\n", " (fc2): Linear(in_features=4304, out_features=1152, bias=True)\n", " )\n", " )\n", " )\n", ")\n" ] } ], "source": [ "print(model)" ] }, { "cell_type": "code", "execution_count": 8, "id": "31a32baf-9ffd-4046-b7fa-70abf55fa7e9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 0, 7515, 2]], device='cuda:0')\n" ] } ], "source": [ "texts = [\"cat\"]\n", "with torch.inference_mode():\n", " text_tokenized = tokenizer(texts, return_tensors=\"pt\", padding=True).to(device)\n", " text_embeddings = model.encode_texts(text_tokenized.input_ids,text_tokenized.attention_mask)\n", "\n", "print(text_tokenized)" ] }, { "cell_type": "code", "execution_count": 10, "id": "8e064291-1929-41e7-9bfb-b671781860e9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 3])\n" ] } ], "source": [ "print(text_tokenized.input_ids.shape)" ] }, { "cell_type": "code", "execution_count": 11, "id": "9f9bce6d-270c-45f0-a38b-06ec0a84f3c7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 1152])\n" ] } ], "source": [ "print(text_embeddings.shape)" ] }, { "cell_type": "code", "execution_count": 19, "id": "bf42db3c-4c42-43ad-962e-b6e783ac5be5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 3, 1024])\n", "torch.Size([1, 3, 1152])\n" ] } ], "source": [ "with torch.inference_mode():\n", " features = model.text_model(\n", " input_ids=text_tokenized.input_ids, attention_mask=text_tokenized.attention_mask\n", " ).last_hidden_state#[:, 0]\n", " print(features.shape)\n", " featuresp = model.text_projector(features)\n", " print(featuresp.shape)" ] }, { "cell_type": "code", "execution_count": 23, "id": "df3c9752-4e6b-41e6-a2e0-2be03deff3eb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1, 1, 1152])\n" ] } ], "source": [ "with torch.inference_mode():\n", " text_embeddings = model.encode_texts(text_tokenized.input_ids,text_tokenized.attention_mask)\n", " print(text_embeddings.unsqueeze(1).shape)" ] }, { "cell_type": "code", "execution_count": 29, "id": "decc2d18-ae48-4df8-aebf-fa97734d18d9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([2, 1, 1152])\n", "torch.Size([2, 3, 1152])\n", "torch.Size([2, 3, 1152])\n" ] } ], "source": [ "texts = [\"cat\",\"dog\"]\n", "with torch.inference_mode():\n", " text_tokenized = tokenizer(texts, return_tensors=\"pt\", padding=True).to(device)\n", " text_embeddings = model.encode_texts(text_tokenized.input_ids,text_tokenized.attention_mask)\n", " features = model.text_model(\n", " input_ids=text_tokenized.input_ids, attention_mask=text_tokenized.attention_mask\n", " ).last_hidden_state\n", " featuresp = model.text_projector(features)\n", "\n", "print(text_embeddings.unsqueeze(1).shape)\n", "print(featuresp.shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "9de5b0dc-ec0e-420b-96ce-8c077c5e4145", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" } }, "nbformat": 4, "nbformat_minor": 5 }