# modernBERT-base-CLIP / vision_encoder.py

import torch
import torch.nn as nn
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


class ideficsV3(nn.Module):
    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
        super().__init__()
        # Load SmolVLM from the Hugging Face Hub.
        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor
        smolVLM = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)
        # Keep only the vision tower; the connector and language model are not used here.
        self.vision_model = smolVLM.model.vision_model
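        # Illustrative addition, not in the original file: record the vision tower's output
        # width from its config (assumes an Idefics3/SigLIP-style vision config exposing
        # hidden_size), so downstream code need not hard-code 1152.
        vision_cfg = getattr(self.vision_model, "config", None)
        self.hidden_size = getattr(vision_cfg, "hidden_size", None)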

    def forward(self, pixel_values):
        # The Idefics3/SmolVLM image processor splits each image into multiple sub-images,
        # so pixel_values arrives as a 5D tensor [batch_size, num_patches, 3, height, width]
        # (e.g. [1, 13, 3, 384, 384]), while the vision transformer expects a 4D batch
        # [N, 3, height, width]. Fold the patch dimension into the batch dimension first.
        batch_size, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_patches, channels, height, width)

        # Run all patches through the vision transformer.
        vision_outputs = self.vision_model(pixel_values)
        x = vision_outputs.last_hidden_state  # [batch_size * num_patches, 729, 1152]
        return x
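

# Illustrative helper, not part of the original file: undo the patch flattening done in
# ideficsV3.forward, assuming the caller still knows batch_size and num_patches.
def regroup_patches(features, batch_size, num_patches):
    # [batch_size * num_patches, seq_len, hidden] -> [batch_size, num_patches, seq_len, hidden]
    return features.view(batch_size, num_patches, *features.shape[1:])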


if __name__ == "__main__":
    # Instantiate the truncated model (vision tower only).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    truncated_model = ideficsV3().to(device).eval()

    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

    # Preprocess both images; the model was loaded in float32, so cast to match.
    inputs1 = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")
    pixel_values = inputs1.pixel_values.to(device=device, dtype=torch.float32)

    # Pass pixel_values through the truncated model.
    with torch.no_grad():
        outputs = truncated_model(pixel_values)
    print(outputs.shape)  # [batch_size * num_patches, 729, 1152] patch-token features
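
    # Illustrative follow-up, not in the original script: regroup the flattened patch
    # features per image and mean-pool them into one crude embedding per image
    # (the pooling choice is an assumption for demonstration only).
    bsz, n_patches = pixel_values.shape[:2]
    per_image = regroup_patches(outputs, bsz, n_patches)  # [bsz, n_patches, 729, 1152]
    pooled = per_image.mean(dim=(1, 2))                   # [bsz, 1152]
    print(pooled.shape)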