import torch
import torch.nn as nn

from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


class ideficsV3(nn.Module):
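    """SmolVLM (Idefics3) truncated to its vision encoder only."""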

    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
        super().__init__()

        # The AutoProcessor bundles a tokenizer with an image processor;
        # only the image processor is needed for vision-only features.
        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor
        smolVLM = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)
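
        # Keep only the vision tower; the rest of the VLM (connector and language model) is discarded.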
        self.vision_model = smolVLM.model.vision_model

    def forward(self, pixel_values):
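        # The image processor returns pixel_values of shape
        # (batch, num_patches, channels, height, width); fold the patch
        # dimension into the batch so the vision encoder sees a 4D tensor.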
        batch_size, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_patches, channels, height, width)
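
        # Encode every patch and keep the final-layer token features.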
        vision_outputs = self.vision_model(pixel_values)
        x = vision_outputs.last_hidden_state

        return x


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    truncated_model = ideficsV3().to(device).eval()

    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
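
    # Preprocess both images; the processor returns a 5D pixel_values tensor
    # with a per-image patch dimension, matching what forward() expects.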
    inputs1 = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")

    # Match the model's dtype (float32 as loaded above) before moving the inputs to the device.
    model_dtype = next(truncated_model.parameters()).dtype
    pixel_values = inputs1.pixel_values.to(model_dtype).to(device)

    with torch.no_grad():
        outputs = truncated_model(pixel_values)
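
    # Expected shape: (num_images * patches_per_image, tokens_per_patch, hidden_size).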
    print(outputs.shape)