Finetuned from p1atdev/siglip-tagger-test-3
https://huggingface.co/p1atdev/siglip-tagger-test-3

This is an experimental test model.

Usage:

import torch
import torch.nn as nn
import numpy as np
from dataclasses import dataclass
from transformers import SiglipVisionModel, SiglipPreTrainedModel, SiglipVisionConfig, AutoImageProcessor
from transformers.utils import ModelOutput

@dataclass
class SiglipForImageClassifierOutput(ModelOutput):
    """Output container returned by ``SiglipForImageClassification.forward``.

    Every field is optional and defaults to ``None``.
    """

    # Loss value; stays None when forward() is called without labels.
    loss: torch.FloatTensor | None = None
    # Raw output of the classifier head (forward() applies no sigmoid).
    logits: torch.FloatTensor | None = None
    # Pooled output of the vision model, i.e. the input to the classifier head.
    pooler_output: torch.FloatTensor | None = None
    # `hidden_states` copied as-is from the vision model's output.
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    # `attentions` copied as-is from the vision model's output.
    attentions: tuple[torch.FloatTensor, ...] | None = None

class SiglipForImageClassification(SiglipPreTrainedModel):
    """SigLIP vision model with a classification head on its pooled output.

    The head is a single ``nn.Linear(hidden_size, num_labels)`` layer, or
    ``nn.Identity`` when ``config.num_labels`` is not positive.  Training
    uses ``nn.BCEWithLogitsLoss``, so ``logits`` are raw scores, one per label.
    """

    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"

    def __init__(
        self,
        config,
    ):
        """Build the vision backbone and the classifier head from *config*."""
        super().__init__(config)

        # Attribute names `siglip` and `classifier` determine the parameter
        # names used when loading a checkpoint; do not rename them.
        self.siglip = SiglipVisionModel(config)

        # Classifier head
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self, pixel_values: torch.FloatTensor, labels: torch.Tensor | None = None
    ):
        """Run the vision model and classify its pooled output.

        Args:
            pixel_values: Preprocessed image batch passed to the vision model.
            labels: Optional per-label targets with the same shape as the
                logits.  Any numeric dtype is accepted; the values are cast
                to the logits' dtype before the loss is computed.

        Returns:
            A ``SiglipForImageClassifierOutput``; ``loss`` is ``None`` when
            *labels* is not given.
        """
        outputs = self.siglip(pixel_values)
        pooler_output = outputs.pooler_output
        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss needs a floating-point target matching the
            # logits; integer 0/1 labels (or float32 labels with bf16 logits)
            # would otherwise fail with a dtype error.
            targets = labels.to(device=logits.device, dtype=logits.dtype)
            loss = nn.BCEWithLogitsLoss()(logits, targets)

        return SiglipForImageClassifierOutput(
            loss=loss,
            logits=logits,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Load the model configuration, image processor and fine-tuned weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = SiglipVisionConfig.from_pretrained('cella110n/siglip-tagger-FT3ep')
processor = AutoImageProcessor.from_pretrained("cella110n/siglip-tagger-FT3ep", config=config)
model = SiglipForImageClassification.from_pretrained('cella110n/siglip-tagger-FT3ep', torch_dtype=torch.bfloat16).to(device)

model.eval()
print("Model Loaded. device:", model.device)

import matplotlib.pyplot as plt  # needed for the preview below
from PIL import Image

# Load the input image.  Converting to RGB gives the processor a 3-channel
# image even when the file is grayscale, palette-based or has an alpha channel.
img_path =  "path/to/image"
img = Image.open(img_path).convert("RGB")

inputs = processor(images=img, return_tensors="pt")  # convert the image into model-ready tensors
print("Image processed.")

# Show the preprocessed image stored in inputs.pixel_values
# (channels-first tensor -> channels-last array for imshow).
img = inputs.pixel_values[0].permute(1, 2, 0).cpu().numpy()
plt.imshow(img)
plt.axis('off')
plt.show()

# Run the model prediction.
with torch.no_grad():
    model_inputs = inputs.to(model.device, model.dtype)
    # Bring the logits back to CPU as float32 for post-processing.
    logits = model(**model_inputs).logits.detach().cpu().float()

# Clamp the raw logits into [0, 1] and use them directly as scores.
# NOTE(review): this is not a sigmoid; a cutoff of 0.3 on the clamped logit
# is not the same as a 0.3 probability. Kept as-is so results stay unchanged.
logits = logits.clamp(0.0, 1.0)

prob_cutoff = 0.3  # only labels scoring above this value are reported

result = {}

for prediction in logits:
    for i, prob in enumerate(prediction):
        score = prob.item()
        if score > prob_cutoff:
            result[model.config.id2label[i]] = score

# Order the result from highest score to lowest.
sorted_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
# Bare expression: displays the list in a notebook cell; use print() in a script.
sorted_result
Downloads last month
4
Safetensors
Model size
439M params
Tensor type
BF16
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.