Amirparsa-Sal commited on
Commit
5d1f0ae
·
1 Parent(s): 9a99f40
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +7 -0
  2. AnomalyCLIP_lib/AnomalyCLIP.py +531 -0
  3. AnomalyCLIP_lib/CLIP.py +436 -0
  4. AnomalyCLIP_lib/__init__.py +1 -0
  5. AnomalyCLIP_lib/bpe_simple_vocab_16e6.txt.gz +3 -0
  6. AnomalyCLIP_lib/build_model.py +50 -0
  7. AnomalyCLIP_lib/constants.py +2 -0
  8. AnomalyCLIP_lib/model_load.py +235 -0
  9. AnomalyCLIP_lib/simple_tokenizer.py +132 -0
  10. AnomalyCLIP_lib/transform.py +133 -0
  11. Dockerfile +14 -0
  12. LICENSE +21 -0
  13. README.md +142 -0
  14. checkpoints/9_12_4_multiscale/epoch_1.pth +3 -0
  15. checkpoints/9_12_4_multiscale/epoch_10.pth +3 -0
  16. checkpoints/9_12_4_multiscale/epoch_11.pth +3 -0
  17. checkpoints/9_12_4_multiscale/epoch_12.pth +3 -0
  18. checkpoints/9_12_4_multiscale/epoch_13.pth +3 -0
  19. checkpoints/9_12_4_multiscale/epoch_14.pth +3 -0
  20. checkpoints/9_12_4_multiscale/epoch_15.pth +3 -0
  21. checkpoints/9_12_4_multiscale/epoch_2.pth +3 -0
  22. checkpoints/9_12_4_multiscale/epoch_3.pth +3 -0
  23. checkpoints/9_12_4_multiscale/epoch_4.pth +3 -0
  24. checkpoints/9_12_4_multiscale/epoch_5.pth +3 -0
  25. checkpoints/9_12_4_multiscale/epoch_6.pth +3 -0
  26. checkpoints/9_12_4_multiscale/epoch_7.pth +3 -0
  27. checkpoints/9_12_4_multiscale/epoch_8.pth +3 -0
  28. checkpoints/9_12_4_multiscale/epoch_9.pth +3 -0
  29. checkpoints/9_12_4_multiscale/log.txt +0 -0
  30. checkpoints/9_12_4_multiscale_visa/epoch_1.pth +3 -0
  31. checkpoints/9_12_4_multiscale_visa/epoch_10.pth +3 -0
  32. checkpoints/9_12_4_multiscale_visa/epoch_11.pth +3 -0
  33. checkpoints/9_12_4_multiscale_visa/epoch_12.pth +3 -0
  34. checkpoints/9_12_4_multiscale_visa/epoch_13.pth +3 -0
  35. checkpoints/9_12_4_multiscale_visa/epoch_14.pth +3 -0
  36. checkpoints/9_12_4_multiscale_visa/epoch_15.pth +3 -0
  37. checkpoints/9_12_4_multiscale_visa/epoch_2.pth +3 -0
  38. checkpoints/9_12_4_multiscale_visa/epoch_3.pth +3 -0
  39. checkpoints/9_12_4_multiscale_visa/epoch_4.pth +3 -0
  40. checkpoints/9_12_4_multiscale_visa/epoch_5.pth +3 -0
  41. checkpoints/9_12_4_multiscale_visa/epoch_6.pth +3 -0
  42. checkpoints/9_12_4_multiscale_visa/epoch_7.pth +3 -0
  43. checkpoints/9_12_4_multiscale_visa/epoch_8.pth +3 -0
  44. checkpoints/9_12_4_multiscale_visa/epoch_9.pth +3 -0
  45. dataset.py +50 -0
  46. datasets/rayan_dataset.py +127 -0
  47. docker-compose.yml +21 -0
  48. evaluation/base_eval.py +293 -0
  49. evaluation/class_name_mapping.json +5 -0
  50. evaluation/eval_main.py +78 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ *.pyc
2
+ *.pyo
3
+ __pycache__/
4
+ *.tar.gz
5
+ *.tar.xz
6
+ ZSAD-dataset
7
+ data/
AnomalyCLIP_lib/AnomalyCLIP.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+
8
+
9
+ class Bottleneck(nn.Module):
10
+ expansion = 4
11
+
12
+ def __init__(self, inplanes, planes, stride=1):
13
+ super().__init__()
14
+
15
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
16
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
17
+ self.bn1 = nn.BatchNorm2d(planes)
18
+ self.relu1 = nn.ReLU(inplace=True)
19
+
20
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
21
+ self.bn2 = nn.BatchNorm2d(planes)
22
+ self.relu2 = nn.ReLU(inplace=True)
23
+
24
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
25
+
26
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
27
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
28
+ self.relu3 = nn.ReLU(inplace=True)
29
+
30
+ self.downsample = None
31
+ self.stride = stride
32
+
33
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
34
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
35
+ self.downsample = nn.Sequential(OrderedDict([
36
+ ("-1", nn.AvgPool2d(stride)),
37
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
38
+ ("1", nn.BatchNorm2d(planes * self.expansion))
39
+ ]))
40
+
41
+ def forward(self, x: torch.Tensor):
42
+ identity = x
43
+
44
+ out = self.relu1(self.bn1(self.conv1(x)))
45
+ out = self.relu2(self.bn2(self.conv2(out)))
46
+ out = self.avgpool(out)
47
+ out = self.bn3(self.conv3(out))
48
+
49
+ if self.downsample is not None:
50
+ identity = self.downsample(x)
51
+
52
+ out += identity
53
+ out = self.relu3(out)
54
+ return out
55
+
56
+
57
+ # implement attention module for v-v self-attention
58
+ class Attention(nn.Module):
59
+ def __init__(self, out_dim, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., settings=''):
60
+ super().__init__()
61
+ self.num_heads = num_heads
62
+ head_dim = dim // num_heads
63
+ self.scale = qk_scale or head_dim ** -0.5
64
+
65
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
66
+ self.attn_drop = nn.Dropout(attn_drop)
67
+ self.proj = nn.Linear(out_dim, dim)
68
+ self.proj_drop = nn.Dropout(proj_drop)
69
+ self.settings = settings
70
+
71
+ def forward(self, x):
72
+ B, N, C = x.shape
73
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
74
+ q, k, v = qkv[0], qkv[1], qkv[2]
75
+
76
+ # original self-attention for the original path
77
+ attn_ori = (q @ k.transpose(-2, -1)) * self.scale
78
+ attn_ori = attn_ori.softmax(dim=-1)
79
+ attn_ori = self.attn_drop(attn_ori)
80
+
81
+ # replace k & q by v
82
+ k = v
83
+ q = k
84
+
85
+ # self-attention, higher temperate for resnets performs better
86
+ attn = (q @ k.transpose(-2, -1)) * self.scale
87
+ attn = (attn).softmax(dim=-1)
88
+ attn = self.attn_drop(attn)
89
+
90
+ x_ori = (attn_ori @ v).transpose(1, 2).reshape(B, N, C)
91
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
92
+ x = self.proj_drop(self.proj(x))
93
+ x_ori = self.proj_drop(self.proj(x_ori))
94
+ return [x, x_ori]
95
+
96
+
97
+
98
+ class LayerNorm(nn.LayerNorm):
99
+ """Subclass torch's LayerNorm to handle fp16."""
100
+
101
+ def forward(self, x: torch.Tensor):
102
+ orig_type = x.dtype
103
+ ret = super().forward(x.type(torch.float32))
104
+ return ret.type(orig_type)
105
+
106
+
107
+ class QuickGELU(nn.Module):
108
+ def forward(self, x: torch.Tensor):
109
+ return x * torch.sigmoid(1.702 * x)
110
+
111
+
112
+ class ResidualAttentionBlock(nn.Module):
113
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, design_details = None):
114
+ super().__init__()
115
+
116
+ self.attn = nn.MultiheadAttention(d_model, n_head)
117
+ self.ln_1 = LayerNorm(d_model)
118
+ self.mlp = nn.Sequential(OrderedDict([
119
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
120
+ ("gelu", QuickGELU()),
121
+ ("c_proj", nn.Linear(d_model * 4, d_model))
122
+ ]))
123
+ self.ln_2 = LayerNorm(d_model)
124
+ self.attn_mask = attn_mask
125
+
126
+ def attention(self, x: torch.Tensor):
127
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
128
+ if isinstance(self.attn, Attention):
129
+ x = x.transpose(0, 1)
130
+ x, x_ori = self.attn(x)
131
+ return [x.transpose(0, 1), x_ori.transpose(0, 1)]
132
+ else:
133
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
134
+
135
+ def forward(self, x, whole = False, ffn = False):
136
+ # print("xxxxx",x.shape)
137
+ # dual paths for blocks deeper than "d"
138
+
139
+ if isinstance(self.attn, Attention):
140
+ if isinstance(x, list):
141
+ if not ffn:
142
+ x, x_ori = x
143
+ x_res = self.attention(self.ln_1(x_ori))
144
+ x_res, x_ori_res = x_res
145
+ x_ori += x_ori_res
146
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
147
+ x += x_res # skip ffn for the new path
148
+ # print('hellloooo')
149
+ return [x, x_ori]
150
+ else:
151
+ x, x_ori_1 = x
152
+ x_res = self.attention(self.ln_1(x_ori_1))
153
+ x_res, x_ori_res = x_res
154
+ x_ori = x_ori_1 + x_ori_res
155
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
156
+ x += x_res # skip ffn for the new path
157
+ x = x_res + x_ori_1
158
+ x = x + self.mlp(self.ln_2(x))
159
+ return [x, x_ori]
160
+ # start of dual path
161
+ else:
162
+ x_res = self.attention(self.ln_1(x))
163
+ if isinstance(x_res, list):
164
+ x_res, x_ori_res = x_res
165
+ x_ori = x + x_ori_res
166
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
167
+ x += x_res
168
+ return [x, x_ori]
169
+
170
+ # singl path before "d"
171
+ else:
172
+ x = x + self.attention(self.ln_1(x))
173
+ x = x + self.mlp(self.ln_2(x))
174
+ return x
175
+
176
+ class ResidualAttentionBlock_learnable_token(nn.Module):
177
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, design_details=None,
178
+ text_layer=False, i = 0):
179
+ super().__init__()
180
+
181
+ self.attn = nn.MultiheadAttention(d_model, n_head)
182
+ self.ln_1 = LayerNorm(d_model)
183
+ self.mlp = nn.Sequential(OrderedDict([
184
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
185
+ ("gelu", QuickGELU()),
186
+ ("c_proj", nn.Linear(d_model * 4, d_model))
187
+ ]))
188
+ self.ln_2 = LayerNorm(d_model)
189
+ self.attn_mask = attn_mask
190
+
191
+ self.i = i
192
+ self.compound_prompt_nctx = design_details['learnabel_text_embedding_length']
193
+ self.text_layer = text_layer
194
+ if i == 0:
195
+ self.first_layer = True
196
+ else:
197
+ self.first_layer = False
198
+
199
+ def attention(self, x: torch.Tensor):
200
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
201
+ if isinstance(self.attn, Attention):
202
+ x = x.transpose(0, 1)
203
+ x, x_ori = self.attn(x)
204
+ return [x.transpose(0, 1), x_ori.transpose(0, 1)]
205
+ else:
206
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
207
+
208
+ def forward(self, inputs):
209
+
210
+ # dual paths for blocks deeper than "d"
211
+ if isinstance(self.attn, Attention):
212
+ x = inputs[0]
213
+ if isinstance(x, list):
214
+ x, x_ori = x
215
+ x_res = self.attention(self.ln_1(x_ori))
216
+ x_res, x_ori_res = x_res
217
+ x_ori += x_ori_res
218
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
219
+ x += x_res # skip ffn for the new path
220
+ return [x, x_ori]
221
+
222
+ # start of dual path
223
+ else:
224
+ x_res = self.attention(self.ln_1(x))
225
+ if isinstance(x_res, list):
226
+ x_res, x_ori_res = x_res
227
+ x_ori = x + x_ori_res
228
+ x_ori = x_ori + self.mlp(self.ln_2(x_ori))
229
+ x += x_res
230
+ return [x, x_ori]
231
+
232
+ # singl path before "d"
233
+ else:
234
+ x = inputs[0]
235
+ compound_prompts_deeper = inputs[1]
236
+ counter = inputs[2]
237
+ if not self.first_layer:
238
+ # First check if the ith layer needs compound prompts or not
239
+ if not (counter > len(compound_prompts_deeper) - 1):
240
+ # Appending the learnable tokens in different way
241
+ # x -> [77, NCLS, DIM]
242
+ # First remove the learnable tokens from previous layer
243
+ prefix = x[:1, :, :]
244
+ suffix = x[1 + self.compound_prompt_nctx:, :, :]
245
+ textual_context = compound_prompts_deeper[counter]
246
+ textual_context = textual_context.expand(x.shape[1], -1, -1).permute(1, 0, 2).half()
247
+ # Add the learnable tokens of this layer with the input, replaced by previous
248
+ # layer learnable tokens
249
+ x = torch.cat([prefix, textual_context, suffix], dim=0)
250
+ # Once done, update the counter, so that the next time, it does not use same learnable tokens
251
+ counter += 1
252
+ x = x + self.attention(self.ln_1(x))
253
+ x = x + self.mlp(self.ln_2(x))
254
+ return [x, compound_prompts_deeper, counter]
255
+
256
+
257
+ class Transformer(nn.Module):
258
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, need_weights: bool = False, design_details = None ,text_layer = False):
259
+ super().__init__()
260
+ self.width = width
261
+ self.layers = layers
262
+ self.text_layer = text_layer
263
+ self.design_deatails = design_details
264
+ print("text_layer", self.text_layer)
265
+ if self.text_layer and (design_details is not None):
266
+ self.resblocks = nn.ModuleList([ResidualAttentionBlock_learnable_token(width, heads, attn_mask, design_details, text_layer, i=i) for i in range(layers)])
267
+ else:
268
+ self.resblocks = nn.ModuleList([ResidualAttentionBlock(width, heads, attn_mask,) for i in range(layers)])
269
+
270
+ def ori_CLIP_with_patch_forward(self, x, out_layers):
271
+ idx = 0
272
+ out_tokens = []
273
+ for r in self.resblocks:
274
+ idx += 1
275
+ x = r(x)
276
+ if idx in out_layers:
277
+ if isinstance(x, list):
278
+ out_tokens.append(x[1])
279
+ else:
280
+ out_tokens.append(x)
281
+
282
+ return [x, x], out_tokens
283
+
284
+ def AnomalyCLIP_forward(self, x, out_layers, ffn):
285
+ idx = 0
286
+ out_tokens = []
287
+ for r in self.resblocks:
288
+ idx += 1
289
+ x = r(x, ffn = ffn)
290
+ # print("out_layers", out_layers, idx)
291
+ if idx in out_layers:
292
+ if isinstance(x, list):
293
+ out_tokens.append(x[0])
294
+ else:
295
+ out_tokens.append(x)
296
+ return x, out_tokens
297
+
298
+ def forward(self, x: torch.Tensor, out_layers = [6, 12, 18, 24], DPAM_layer = None, ffn = False):
299
+ # visual encoder forward
300
+ if not self.text_layer:
301
+ out_tokens = []
302
+
303
+ if DPAM_layer is None:
304
+ [x, x], out_tokens = self.ori_CLIP_with_patch_forward(x, out_layers)
305
+ return [x, x], out_tokens
306
+ else:
307
+ x, out_tokens = self.AnomalyCLIP_forward(x, out_layers, ffn)
308
+ return x, out_tokens
309
+ # text encoder forward
310
+ # ori text embedding
311
+ elif self.design_deatails is None:
312
+ for idx, r in enumerate(self.resblocks):
313
+ x = r(x)
314
+ return x
315
+ # insert learnable text embedding
316
+ elif self.design_deatails is not None:
317
+ for idx, r in enumerate(self.resblocks):
318
+ x = r(x)
319
+ return x[0]
320
+ def get_cast_dtype(self) -> torch.dtype:
321
+ return self.resblocks[0].mlp.c_fc.weight.dtype
322
+
323
+ class VisionTransformer(nn.Module):
324
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
325
+ super().__init__()
326
+ self.input_resolution = input_resolution
327
+ self.output_dim = output_dim
328
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
329
+
330
+ scale = width ** -0.5
331
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
332
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
333
+ self.ln_pre = LayerNorm(width)
334
+
335
+ self.transformer = Transformer(width, layers, heads, need_weights=True)
336
+ self.attn = None
337
+ self.embed_dim = width
338
+ self.num_heads = heads
339
+
340
+ self.ln_post = LayerNorm(width)
341
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
342
+
343
+
344
+ @torch.no_grad()
345
+ def DAPM_replace(self, DPAM_layer):
346
+ if DPAM_layer is not None:
347
+ for i in range(1, DPAM_layer):
348
+ self.attn = Attention(self.embed_dim, self.embed_dim, self.num_heads, True)
349
+ self.attn.qkv.weight.data = self.transformer.resblocks[-i].attn.in_proj_weight.clone()
350
+ self.attn.qkv.bias.data = self.transformer.resblocks[-i].attn.in_proj_bias.clone()
351
+ self.attn.proj.weight.data = self.transformer.resblocks[-i].attn.out_proj.weight.clone()
352
+ self.attn.proj.bias.data = self.transformer.resblocks[-i].attn.out_proj.bias.clone()
353
+ self.transformer.resblocks[-i].attn = self.attn
354
+
355
+ @torch.no_grad()
356
+ def forward(self, x: torch.Tensor, features_list, ori_patch = False, proj_use = True, DPAM_layer = None, ffn = False):
357
+
358
+ x = self.conv1(x) # shape = [*, width, grid, grid]
359
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
360
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
361
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
362
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
363
+ new_side = int((x.shape[1] - 1) ** 0.5)
364
+
365
+ # update the position embedding during inference for varied input size
366
+ if side != new_side:
367
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
368
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
369
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
370
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
371
+
372
+ pos = self.positional_embedding.to(x.dtype)
373
+ x = x + pos
374
+ x = self.ln_pre(x)
375
+
376
+ x = x.permute(1, 0, 2) # NLD -> LND
377
+ [x, x_ori], patch_tokens = self.transformer(x, features_list, DPAM_layer = DPAM_layer, ffn = ffn)
378
+
379
+
380
+ if True:
381
+ patch_token_list = []
382
+ for patch_token in patch_tokens:
383
+ patch_token = self.ln_post(patch_token.permute(1, 0, 2)) @ self.proj # LND -> NLD
384
+ patch_token_list.append(patch_token)
385
+ patch_tokens = patch_token_list
386
+
387
+ return x_ori[0, :, :] @ self.proj, patch_tokens
388
+
389
+
390
+ return x
391
+
392
+
393
+ from thop import profile
394
+ class AnomalyCLIP(nn.Module):
395
+ def __init__(self,
396
+ embed_dim: int,
397
+ # vision
398
+ image_resolution: int,
399
+ vision_layers: Union[Tuple[int, int, int, int], int],
400
+ vision_width: int,
401
+ vision_patch_size: int,
402
+ # text
403
+ context_length: int,
404
+ vocab_size: int,
405
+ transformer_width: int,
406
+ transformer_heads: int,
407
+ transformer_layers: int,
408
+ design_details = None
409
+ ):
410
+ super().__init__()
411
+
412
+ self.context_length = context_length
413
+
414
+ if isinstance(vision_layers, (tuple, list)):
415
+ vision_heads = vision_width * 32 // 64
416
+ self.visual = ModifiedResNet(
417
+ layers=vision_layers,
418
+ output_dim=embed_dim,
419
+ heads=vision_heads,
420
+ input_resolution=image_resolution,
421
+ width=vision_width
422
+ )
423
+ else:
424
+ vision_heads = vision_width // 64
425
+ self.visual = VisionTransformer(
426
+ input_resolution=image_resolution,
427
+ patch_size=vision_patch_size,
428
+ width=vision_width,
429
+ layers=vision_layers,
430
+ heads=vision_heads,
431
+ output_dim=embed_dim
432
+ )
433
+
434
+ self.transformer = Transformer(
435
+ width=transformer_width,
436
+ layers=transformer_layers,
437
+ heads=transformer_heads,
438
+ attn_mask=self.build_attention_mask(), text_layer=True, design_details=design_details
439
+ )
440
+
441
+ self.vocab_size = vocab_size
442
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
443
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
444
+ self.ln_final = LayerNorm(transformer_width)
445
+
446
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
447
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
448
+
449
+ self.initialize_parameters()
450
+
451
+ def initialize_parameters(self):
452
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
453
+ nn.init.normal_(self.positional_embedding, std=0.01)
454
+
455
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
456
+ attn_std = self.transformer.width ** -0.5
457
+ fc_std = (2 * self.transformer.width) ** -0.5
458
+ for block in self.transformer.resblocks:
459
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
460
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
461
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
462
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
463
+
464
+ if self.text_projection is not None:
465
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
466
+ def build_attention_mask(self):
467
+ # lazily create causal attention mask, with full attention between the vision tokens
468
+ # pytorch uses additive attention mask; fill with -inf
469
+ mask = torch.empty(self.context_length, self.context_length)
470
+ mask.fill_(float("-inf"))
471
+ mask.triu_(1) # zero out the lower diagonal
472
+ return mask
473
+
474
+ @property
475
+ def dtype(self):
476
+ return self.visual.conv1.weight.dtype
477
+
478
+ def encode_image(self, image, feature_list = [], ori_patch = False, proj_use = True, DPAM_layer = None, ffn = False):
479
+ return self.visual(image.type(self.dtype), feature_list, ori_patch = ori_patch, proj_use = proj_use, DPAM_layer = DPAM_layer, ffn = ffn)
480
+
481
+
482
+ def encode_text(self, text):
483
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
484
+
485
+ x = x + self.positional_embedding.type(self.dtype)
486
+ x = x.permute(1, 0, 2) # NLD -> LND
487
+ x = self.transformer(x)
488
+ x = x.permute(1, 0, 2) # LND -> NLD
489
+ x = self.ln_final(x).type(self.dtype)
490
+
491
+ # x.shape = [batch_size, n_ctx, transformer.width]
492
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
493
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
494
+
495
+ return x
496
+
497
+ def encode_text_learn(self, prompts, tokenized_prompts, deep_compound_prompts_text = None, normalize: bool = False):
498
+ cast_dtype = self.transformer.get_cast_dtype()
499
+
500
+ # x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
501
+
502
+ # x = x + self.positional_embedding.to(cast_dtype)
503
+
504
+ x = prompts + self.positional_embedding.to(cast_dtype)
505
+ x = x.permute(1, 0, 2) # NLD -> LND
506
+ # print("test", x.shape, len(deep_compound_prompts_text))
507
+ if deep_compound_prompts_text is None:
508
+ x = self.transformer(x)
509
+ else:
510
+ x = self.transformer([x, deep_compound_prompts_text, 0])
511
+ x = x.permute(1, 0, 2) # LND -> NLD
512
+ x = self.ln_final(x).type(self.dtype) # [batch_size, n_ctx, transformer.width]
513
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
514
+ x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
515
+ return x
516
+
517
+ def forward(self, image, text):
518
+ image_features = self.encode_image(image)
519
+ text_features = self.encode_text(text)
520
+
521
+ # normalized features
522
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
523
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
524
+
525
+ # cosine similarity as logits
526
+ logit_scale = self.logit_scale.exp()
527
+ logits_per_image = logit_scale * image_features @ text_features.t()
528
+ logits_per_text = logits_per_image.t()
529
+
530
+ # shape = [global_batch_size, global_batch_size]
531
+ return logits_per_image, logits_per_text
AnomalyCLIP_lib/CLIP.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ expansion = 4
12
+
13
+ def __init__(self, inplanes, planes, stride=1):
14
+ super().__init__()
15
+
16
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18
+ self.bn1 = nn.BatchNorm2d(planes)
19
+ self.relu1 = nn.ReLU(inplace=True)
20
+
21
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
22
+ self.bn2 = nn.BatchNorm2d(planes)
23
+ self.relu2 = nn.ReLU(inplace=True)
24
+
25
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26
+
27
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29
+ self.relu3 = nn.ReLU(inplace=True)
30
+
31
+ self.downsample = None
32
+ self.stride = stride
33
+
34
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
35
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36
+ self.downsample = nn.Sequential(OrderedDict([
37
+ ("-1", nn.AvgPool2d(stride)),
38
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
39
+ ("1", nn.BatchNorm2d(planes * self.expansion))
40
+ ]))
41
+
42
+ def forward(self, x: torch.Tensor):
43
+ identity = x
44
+
45
+ out = self.relu1(self.bn1(self.conv1(x)))
46
+ out = self.relu2(self.bn2(self.conv2(out)))
47
+ out = self.avgpool(out)
48
+ out = self.bn3(self.conv3(out))
49
+
50
+ if self.downsample is not None:
51
+ identity = self.downsample(x)
52
+
53
+ out += identity
54
+ out = self.relu3(out)
55
+ return out
56
+
57
+
58
+ class AttentionPool2d(nn.Module):
59
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60
+ super().__init__()
61
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
63
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
64
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
65
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66
+ self.num_heads = num_heads
67
+
68
+ def forward(self, x):
69
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
70
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
71
+
72
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
73
+ new_side = int((x.shape[0] - 1) ** 0.5)
74
+
75
+ # update the position embedding during inference for varied input size
76
+ if side != new_side:
77
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
78
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
79
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
80
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
81
+
82
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
83
+ x, _ = F.multi_head_attention_forward(
84
+ query=x, key=x, value=x,
85
+ embed_dim_to_check=x.shape[-1],
86
+ num_heads=self.num_heads,
87
+ q_proj_weight=self.q_proj.weight,
88
+ k_proj_weight=self.k_proj.weight,
89
+ v_proj_weight=self.v_proj.weight,
90
+ in_proj_weight=None,
91
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
92
+ bias_k=None,
93
+ bias_v=None,
94
+ add_zero_attn=False,
95
+ dropout_p=0,
96
+ out_proj_weight=self.c_proj.weight,
97
+ out_proj_bias=self.c_proj.bias,
98
+ use_separate_proj_weight=True,
99
+ training=self.training,
100
+ need_weights=False
101
+ )
102
+
103
+ #return x[0]
104
+ return x.transpose(0, 1) # return both cls token and image tokens, B,N,C
105
+
106
+
107
+ class ModifiedResNet(nn.Module):
108
+ """
109
+ A ResNet class that is similar to torchvision's but contains the following changes:
110
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
111
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
112
+ - The final pooling layer is a QKV attention instead of an average pool
113
+ """
114
+
115
+ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
116
+ super().__init__()
117
+ self.output_dim = output_dim
118
+ self.input_resolution = input_resolution
119
+
120
+ # the 3-layer stem
121
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
122
+ self.bn1 = nn.BatchNorm2d(width // 2)
123
+ self.relu1 = nn.ReLU(inplace=True)
124
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
125
+ self.bn2 = nn.BatchNorm2d(width // 2)
126
+ self.relu2 = nn.ReLU(inplace=True)
127
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
128
+ self.bn3 = nn.BatchNorm2d(width)
129
+ self.relu3 = nn.ReLU(inplace=True)
130
+ self.avgpool = nn.AvgPool2d(2)
131
+
132
+ # residual layers
133
+ self._inplanes = width # this is a *mutable* variable used during construction
134
+ self.layer1 = self._make_layer(width, layers[0])
135
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
136
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
137
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
138
+
139
+ embed_dim = width * 32 # the ResNet feature dimension
140
+ self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
141
+
142
+ def _make_layer(self, planes, blocks, stride=1):
143
+ layers = [Bottleneck(self._inplanes, planes, stride)]
144
+
145
+ self._inplanes = planes * Bottleneck.expansion
146
+ for _ in range(1, blocks):
147
+ layers.append(Bottleneck(self._inplanes, planes))
148
+
149
+ return nn.Sequential(*layers)
150
+
151
+ def forward(self, x):
152
+ def stem(x):
153
+ x = self.relu1(self.bn1(self.conv1(x)))
154
+ x = self.relu2(self.bn2(self.conv2(x)))
155
+ x = self.relu3(self.bn3(self.conv3(x)))
156
+ x = self.avgpool(x)
157
+ return x
158
+
159
+ x = x.type(self.conv1.weight.dtype)
160
+ x = stem(x)
161
+ x = self.layer1(x)
162
+ x = self.layer2(x)
163
+ x = self.layer3(x)
164
+ x = self.layer4(x)
165
+ x = self.attnpool(x)
166
+
167
+ return x
168
+
169
+
170
+ class LayerNorm(nn.LayerNorm):
171
+ """Subclass torch's LayerNorm to handle fp16."""
172
+
173
+ def forward(self, x: torch.Tensor):
174
+ orig_type = x.dtype
175
+ ret = super().forward(x.type(torch.float32))
176
+ return ret.type(orig_type)
177
+
178
+
179
+ class QuickGELU(nn.Module):
180
+ def forward(self, x: torch.Tensor):
181
+ return x * torch.sigmoid(1.702 * x)
182
+
183
+
184
+ class ResidualAttentionBlock(nn.Module):
185
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, need_weights: bool = False):
186
+ super().__init__()
187
+
188
+ self.attn = nn.MultiheadAttention(d_model, n_head)
189
+ self.ln_1 = LayerNorm(d_model)
190
+ self.mlp = nn.Sequential(OrderedDict([
191
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
192
+ ("gelu", QuickGELU()),
193
+ ("c_proj", nn.Linear(d_model * 4, d_model))
194
+ ]))
195
+ self.ln_2 = LayerNorm(d_model)
196
+ self.attn_mask = attn_mask
197
+ self.need_weights = need_weights
198
+
199
+ def attention(self, x: torch.Tensor):
200
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
201
+ if self.need_weights == False:
202
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
203
+ else:
204
+ return self.attn(x, x, x, need_weights=True, attn_mask=self.attn_mask)
205
+
206
+ def forward(self, x: torch.Tensor):
207
+ if self.need_weights == False:
208
+ x = x + self.attention(self.ln_1(x))
209
+ x = x + self.mlp(self.ln_2(x))
210
+ return x
211
+ else:
212
+ y, attn = self.attention(self.ln_1(x))
213
+ x = x + y
214
+ x = x + self.mlp(self.ln_2(x))
215
+ return x
216
+
217
+
218
+ class Transformer(nn.Module):
219
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, need_weights: bool = False):
220
+ super().__init__()
221
+ self.width = width
222
+ self.layers = layers
223
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, need_weights if i == layers - 1 else False) for i in range(layers)])
224
+
225
+ def forward(self, x: torch.Tensor):
226
+ return self.resblocks(x)
227
+
228
+ def get_cast_dtype(self) -> torch.dtype:
229
+ return self.resblocks[0].mlp.c_fc.weight.dtype
230
+
231
+
232
+
233
+ class VisionTransformer(nn.Module):
234
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
235
+ super().__init__()
236
+ self.input_resolution = input_resolution
237
+ self.output_dim = output_dim
238
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
239
+
240
+ scale = width ** -0.5
241
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
242
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
243
+ self.ln_pre = LayerNorm(width)
244
+
245
+ self.transformer = Transformer(width, layers, heads, need_weights=True)
246
+
247
+ self.ln_post = LayerNorm(width)
248
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
249
+
250
+ def forward(self, x: torch.Tensor):
251
+ x = self.conv1(x) # shape = [*, width, grid, grid]
252
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
253
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
254
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
255
+
256
+ #####################################################################################
257
+ side = int((self.positional_embedding.shape[0] - 1) ** 0.5)
258
+ new_side = int((x.shape[1] - 1) ** 0.5)
259
+
260
+ # update the position embedding during inference for varied input size
261
+ if side != new_side:
262
+ new_pos = self.positional_embedding[1:, :].reshape(-1, side, side, x.shape[-1]).permute(0, 3, 1, 2)
263
+ new_pos = torch.nn.functional.interpolate(new_pos, (new_side, new_side), mode='bilinear')
264
+ new_pos = new_pos.reshape(-1, x.shape[-1], new_side * new_side).transpose(1, 2)
265
+ self.positional_embedding.data = torch.cat([self.positional_embedding[:1, :], new_pos[0]], 0)
266
+ #####################################################################################
267
+
268
+
269
+ x = x + self.positional_embedding.to(x.dtype)
270
+ x = self.ln_pre(x)
271
+
272
+ x = x.permute(1, 0, 2) # NLD -> LND
273
+ x = self.transformer(x)
274
+ x = x.permute(1, 0, 2) # LND -> NLD
275
+
276
+ #x = self.ln_post(x[:, 0, :])
277
+ x = self.ln_post(x) # return both cls token and image tokens
278
+
279
+ if self.proj is not None:
280
+ x = x @ self.proj
281
+
282
+ return x
283
+
284
+
285
+ class CLIP(nn.Module):
286
+ def __init__(self,
287
+ embed_dim: int,
288
+ # vision
289
+ image_resolution: int,
290
+ vision_layers: Union[Tuple[int, int, int, int], int],
291
+ vision_width: int,
292
+ vision_patch_size: int,
293
+ # text
294
+ context_length: int,
295
+ vocab_size: int,
296
+ transformer_width: int,
297
+ transformer_heads: int,
298
+ transformer_layers: int
299
+ ):
300
+ super().__init__()
301
+
302
+ self.context_length = context_length
303
+
304
+ if isinstance(vision_layers, (tuple, list)):
305
+ vision_heads = vision_width * 32 // 64
306
+ self.visual = ModifiedResNet(
307
+ layers=vision_layers,
308
+ output_dim=embed_dim,
309
+ heads=vision_heads,
310
+ input_resolution=image_resolution,
311
+ width=vision_width
312
+ )
313
+ else:
314
+ vision_heads = vision_width // 64
315
+ self.visual = VisionTransformer(
316
+ input_resolution=image_resolution,
317
+ patch_size=vision_patch_size,
318
+ width=vision_width,
319
+ layers=vision_layers,
320
+ heads=vision_heads,
321
+ output_dim=embed_dim
322
+ )
323
+
324
+ self.transformer = Transformer(
325
+ width=transformer_width,
326
+ layers=transformer_layers,
327
+ heads=transformer_heads,
328
+ attn_mask=self.build_attention_mask()
329
+ )
330
+
331
+ self.vocab_size = vocab_size
332
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
333
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
334
+ self.ln_final = LayerNorm(transformer_width)
335
+
336
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
337
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
338
+
339
+ self.initialize_parameters()
340
+
341
+ def initialize_parameters(self):
342
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
343
+ nn.init.normal_(self.positional_embedding, std=0.01)
344
+
345
+ if isinstance(self.visual, ModifiedResNet):
346
+ if self.visual.attnpool is not None:
347
+ std = self.visual.attnpool.c_proj.in_features ** -0.5
348
+ nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
349
+ nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
350
+ nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
351
+ nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
352
+
353
+ for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
354
+ for name, param in resnet_block.named_parameters():
355
+ if name.endswith("bn3.weight"):
356
+ nn.init.zeros_(param)
357
+
358
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
359
+ attn_std = self.transformer.width ** -0.5
360
+ fc_std = (2 * self.transformer.width) ** -0.5
361
+ for block in self.transformer.resblocks:
362
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
363
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
364
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
365
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
366
+
367
+ if self.text_projection is not None:
368
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
369
+
370
+ def build_attention_mask(self):
371
+ # lazily create causal attention mask, with full attention between the vision tokens
372
+ # pytorch uses additive attention mask; fill with -inf
373
+ mask = torch.empty(self.context_length, self.context_length)
374
+ mask.fill_(float("-inf"))
375
+ mask.triu_(1) # zero out the lower diagonal
376
+ return mask
377
+
378
+ @property
379
+ def dtype(self):
380
+ return self.visual.conv1.weight.dtype
381
+
382
+ def encode_image(self, image):
383
+ return self.visual(image.type(self.dtype))
384
+
385
+ def encode_text(self, text):
386
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
387
+
388
+ x = x + self.positional_embedding.type(self.dtype)
389
+ x = x.permute(1, 0, 2) # NLD -> LND
390
+ x = self.transformer(x)
391
+ x = x.permute(1, 0, 2) # LND -> NLD
392
+ x = self.ln_final(x).type(self.dtype)
393
+
394
+ # x.shape = [batch_size, n_ctx, transformer.width]
395
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
396
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
397
+
398
+ return x
399
+
400
+ def encode_text_learn(self, prompts, tokenized_prompts, deep_compound_prompts_text = None, normalize: bool = False):
401
+ cast_dtype = self.transformer.get_cast_dtype()
402
+
403
+ # x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
404
+
405
+ # x = x + self.positional_embedding.to(cast_dtype)
406
+
407
+ x = prompts + self.positional_embedding.to(cast_dtype)
408
+ x = x.permute(1, 0, 2) # NLD -> LND
409
+ # print("test", x.shape, len(deep_compound_prompts_text))
410
+ if deep_compound_prompts_text is None:
411
+ x = self.transformer(x)
412
+ else:
413
+ x = self.transformer([x, deep_compound_prompts_text, 0])
414
+ x = x.permute(1, 0, 2) # LND -> NLD
415
+ x = self.ln_final(x).type(self.dtype) # [batch_size, n_ctx, transformer.width]
416
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
417
+ x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
418
+ return x
419
+
420
+
421
+
422
+ def forward(self, image, text):
423
+ image_features = self.encode_image(image)
424
+ text_features = self.encode_text(text)
425
+
426
+ # normalized features
427
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
428
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
429
+
430
+ # cosine similarity as logits
431
+ logit_scale = self.logit_scale.exp()
432
+ logits_per_image = logit_scale * image_features @ text_features.t()
433
+ logits_per_text = logits_per_image.t()
434
+
435
+ # shape = [global_batch_size, global_batch_size]
436
+ return logits_per_image, logits_per_text
AnomalyCLIP_lib/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model_load import *
AnomalyCLIP_lib/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
AnomalyCLIP_lib/build_model.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ from .CLIP import CLIP
3
+ from .AnomalyCLIP import AnomalyCLIP
4
+
5
+ def build_model(name: str, state_dict: dict, design_details = None):
6
+ vit = "visual.proj" in state_dict
7
+
8
+ if vit:
9
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
10
+ vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
11
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
12
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
13
+ image_resolution = vision_patch_size * grid_size
14
+ else:
15
+ counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
16
+ vision_layers = tuple(counts)
17
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
18
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
19
+ vision_patch_size = None
20
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
21
+ image_resolution = output_width * 32
22
+
23
+ embed_dim = state_dict["text_projection"].shape[1]
24
+ context_length = state_dict["positional_embedding"].shape[0]
25
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
26
+ transformer_width = state_dict["ln_final.weight"].shape[0]
27
+ transformer_heads = transformer_width // 64
28
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
29
+ # print('name', name)
30
+ # if 'CS-' in name:
31
+ if design_details is not None:
32
+ model = AnomalyCLIP(
33
+ embed_dim,
34
+ image_resolution, vision_layers, vision_width, vision_patch_size,
35
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers, design_details = design_details
36
+ )
37
+ else:
38
+ model = CLIP(
39
+ embed_dim,
40
+ image_resolution, vision_layers, vision_width, vision_patch_size,
41
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
42
+ )
43
+
44
+ for key in ["input_resolution", "context_length", "vocab_size"]:
45
+ if key in state_dict:
46
+ del state_dict[key]
47
+
48
+ #convert_weights(model)
49
+ model.load_state_dict(state_dict)
50
+ return model.eval()
AnomalyCLIP_lib/constants.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
AnomalyCLIP_lib/model_load.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from typing import Union, List
6
+ from pkg_resources import packaging
7
+
8
+ import torch
9
+ from PIL import Image
10
+ from torchvision.transforms import Compose, Resize, ToTensor, Normalize
11
+ from tqdm import tqdm
12
+ import numpy as np
13
+
14
+ from .build_model import build_model
15
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
16
+ from torchvision.transforms import InterpolationMode
17
+
18
+ if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
19
+ warnings.warn("PyTorch version 1.7.1 or higher is recommended")
20
+
21
+
22
+ __all__ = ["available_models", "load",
23
+ "get_similarity_map", "compute_similarity"]
24
+ _tokenizer = _Tokenizer()
25
+
26
+ _MODELS = {
27
+ "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
28
+ }
29
+
30
+
31
+ def _download(
32
+ url: str,
33
+ cache_dir: Union[str, None] = None,
34
+ ):
35
+
36
+ if not cache_dir:
37
+ # cache_dir = os.path.expanduser("~/.cache/clip")
38
+ cache_dir = os.path.expanduser("/remote-home/iot_zhouqihang/root/.cache/clip")
39
+ os.makedirs(cache_dir, exist_ok=True)
40
+ filename = os.path.basename(url)
41
+
42
+ if 'openaipublic' in url:
43
+ expected_sha256 = url.split("/")[-2]
44
+ elif 'mlfoundations' in url:
45
+ expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
46
+ else:
47
+ expected_sha256 = ''
48
+
49
+ download_target = os.path.join(cache_dir, filename)
50
+
51
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
52
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
53
+
54
+ if os.path.isfile(download_target):
55
+ if expected_sha256:
56
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
57
+ return download_target
58
+ else:
59
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
60
+ else:
61
+ return download_target
62
+
63
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
64
+ with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
65
+ while True:
66
+ buffer = source.read(8192)
67
+ if not buffer:
68
+ break
69
+
70
+ output.write(buffer)
71
+ loop.update(len(buffer))
72
+
73
+ if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
74
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
75
+
76
+ return download_target
77
+
78
+
79
+ def _convert_image_to_rgb(image):
80
+ return image.convert("RGB")
81
+
82
+
83
+ def _transform(n_px):
84
+ return Compose([
85
+ Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC),
86
+ #CenterCrop(n_px), # rm center crop to explain whole image
87
+ _convert_image_to_rgb,
88
+ ToTensor(),
89
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
90
+ ])
91
+
92
+
93
+ def available_models() -> List[str]:
94
+ """Returns the names of available CLIP models"""
95
+ return list(_MODELS.keys())
96
+
97
+
98
+ def load_state_dict(checkpoint_path: str, map_location='cpu'):
99
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
100
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
101
+ state_dict = checkpoint['state_dict']
102
+ else:
103
+ state_dict = checkpoint
104
+ if next(iter(state_dict.items()))[0].startswith('module'):
105
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
106
+ return state_dict
107
+
108
+ def load_checkpoint(model, checkpoint_path, strict=True):
109
+ state_dict = load_state_dict(checkpoint_path)
110
+ # detect old format and make compatible with new format
111
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
112
+ state_dict = convert_to_custom_text_state_dict(state_dict)
113
+ resize_pos_embed(state_dict, model)
114
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
115
+ return incompatible_keys
116
+
117
+ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", design_details = None, jit: bool = False, download_root: str = None):
118
+ """Load a CLIP model
119
+
120
+ Parameters
121
+ ----------
122
+ name : str
123
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
124
+
125
+ device : Union[str, torch.device]
126
+ The device to put the loaded model
127
+
128
+ jit : bool
129
+ Whether to load the optimized JIT model or more hackable non-JIT model (default).
130
+
131
+ download_root: str
132
+ path to download the model files; by default, it uses "~/.cache/clip"
133
+
134
+ Returns
135
+ -------
136
+ model : torch.nn.Module
137
+ The CLIP model
138
+
139
+ preprocess : Callable[[PIL.Image], torch.Tensor]
140
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
141
+ """
142
+ print("name", name)
143
+ if name in _MODELS:
144
+ # model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
145
+ model_path = _download(_MODELS[name], download_root or os.path.expanduser("/remote-home/iot_zhouqihang/root/.cache/clip"))
146
+ elif os.path.isfile(name):
147
+ model_path = name
148
+ else:
149
+ raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
150
+
151
+ with open(model_path, 'rb') as opened_file:
152
+ try:
153
+ # loading JIT archive
154
+ model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
155
+ state_dict = None
156
+ except RuntimeError:
157
+ # loading saved state dict
158
+ if jit:
159
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
160
+ jit = False
161
+ state_dict = torch.load(opened_file, map_location="cpu")
162
+
163
+ if not jit:
164
+ model = build_model(name, state_dict or model.state_dict(), design_details).to(device)
165
+ if str(device) == "cpu":
166
+ model.float()
167
+ return model, _transform(model.visual.input_resolution)
168
+
169
+ # patch the device names
170
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
171
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
172
+
173
+ def patch_device(module):
174
+ try:
175
+ graphs = [module.graph] if hasattr(module, "graph") else []
176
+ except RuntimeError:
177
+ graphs = []
178
+
179
+ if hasattr(module, "forward1"):
180
+ graphs.append(module.forward1.graph)
181
+
182
+ for graph in graphs:
183
+ for node in graph.findAllNodes("prim::Constant"):
184
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
185
+ node.copyAttributes(device_node)
186
+
187
+ model.apply(patch_device)
188
+ patch_device(model.encode_image)
189
+ patch_device(model.encode_text)
190
+
191
+ # patch dtype to float32 on CPU
192
+ if str(device) == "cpu":
193
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
194
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
195
+ float_node = float_input.node()
196
+
197
+ def patch_float(module):
198
+ try:
199
+ graphs = [module.graph] if hasattr(module, "graph") else []
200
+ except RuntimeError:
201
+ graphs = []
202
+
203
+ if hasattr(module, "forward1"):
204
+ graphs.append(module.forward1.graph)
205
+
206
+ for graph in graphs:
207
+ for node in graph.findAllNodes("aten::to"):
208
+ inputs = list(node.inputs())
209
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
210
+ if inputs[i].node()["value"] == 5:
211
+ inputs[i].node().copyAttributes(float_node)
212
+
213
+ model.apply(patch_float)
214
+ patch_float(model.encode_image)
215
+ patch_float(model.encode_text)
216
+
217
+ model.float()
218
+
219
+ return model, _transform(model.input_resolution.item())
220
+
221
+
222
+ def get_similarity_map(sm, shape):
223
+ side = int(sm.shape[1] ** 0.5)
224
+ sm = sm.reshape(sm.shape[0], side, side, -1).permute(0, 3, 1, 2)
225
+ sm = torch.nn.functional.interpolate(sm, shape, mode='bilinear')
226
+ sm = sm.permute(0, 2, 3, 1)
227
+ return sm
228
+
229
+
230
+ def compute_similarity(image_features, text_features, t=2):
231
+ prob_1 = image_features[:, :1, :] @ text_features.t()
232
+ b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2]
233
+ feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c)
234
+ similarity = feats.sum(-1)
235
+ return (similarity/0.07).softmax(-1), prob_1
AnomalyCLIP_lib/simple_tokenizer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a signficant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe()):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74
+ self.encoder = dict(zip(vocab, range(len(vocab))))
75
+ self.decoder = {v: k for k, v in self.encoder.items()}
76
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
77
+ self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78
+ self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79
+
80
+ def bpe(self, token):
81
+ if token in self.cache:
82
+ return self.cache[token]
83
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84
+ pairs = get_pairs(word)
85
+
86
+ if not pairs:
87
+ return token+'</w>'
88
+
89
+ while True:
90
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91
+ if bigram not in self.bpe_ranks:
92
+ break
93
+ first, second = bigram
94
+ new_word = []
95
+ i = 0
96
+ while i < len(word):
97
+ try:
98
+ j = word.index(first, i)
99
+ new_word.extend(word[i:j])
100
+ i = j
101
+ except ValueError:  # `first` does not occur in the remainder of the word
102
+ new_word.extend(word[i:])
103
+ break
104
+
105
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
106
+ new_word.append(first+second)
107
+ i += 2
108
+ else:
109
+ new_word.append(word[i])
110
+ i += 1
111
+ new_word = tuple(new_word)
112
+ word = new_word
113
+ if len(word) == 1:
114
+ break
115
+ else:
116
+ pairs = get_pairs(word)
117
+ word = ' '.join(word)
118
+ self.cache[token] = word
119
+ return word
120
+
121
+ def encode(self, text):
122
+ bpe_tokens = []
123
+ text = whitespace_clean(basic_clean(text)).lower()
124
+ for token in re.findall(self.pat, text):
125
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127
+ return bpe_tokens
128
+
129
+ def decode(self, tokens):
130
+ text = ''.join([self.decoder[token] for token in tokens])
131
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132
+ return text
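For reference, a quick sanity check of the tokenizer above might look like the following sketch. It assumes the `ftfy` and `regex` packages are installed and that `bpe_simple_vocab_16e6.txt.gz` sits next to the module, as it does in this repository.
```python
# Illustrative only: encode/decode round trip with the SimpleTokenizer defined above.
from AnomalyCLIP_lib.simple_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()
token_ids = tokenizer.encode("a photo of a damaged object")
print(token_ids)                      # BPE token ids (ints)
print(tokenizer.decode(token_ids))    # roughly round-trips to the lower-cased input
```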
AnomalyCLIP_lib/transform.py ADDED
@@ -0,0 +1,133 @@
1
+ import warnings
2
+ from dataclasses import dataclass, asdict
3
+ from typing import Any, Dict, Optional, Sequence, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torchvision.transforms.functional as F
8
+
9
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
10
+ CenterCrop
11
+
12
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
13
+
14
+
15
+ @dataclass
16
+ class AugmentationCfg:
17
+ scale: Tuple[float, float] = (0.9, 1.0)
18
+ ratio: Optional[Tuple[float, float]] = None
19
+ color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None
20
+ interpolation: Optional[str] = None
21
+ re_prob: Optional[float] = None
22
+ re_count: Optional[int] = None
23
+ use_timm: bool = False
24
+
25
+
26
+ class ResizeMaxSize(nn.Module):
27
+
28
+ def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
29
+ super().__init__()
30
+ if not isinstance(max_size, int):
31
+ raise TypeError(f"Size should be int. Got {type(max_size)}")
32
+ self.max_size = max_size
33
+ self.interpolation = interpolation
34
+ self.fn = min if fn == 'min' else max
35
+ self.fill = fill
36
+
37
+ def forward(self, img):
38
+ if isinstance(img, torch.Tensor):
39
+ height, width = img.shape[:2]
40
+ else:
41
+ width, height = img.size
42
+ scale = self.max_size / float(max(height, width))
43
+ if scale != 1.0:
44
+ new_size = tuple(round(dim * scale) for dim in (height, width))
45
+ img = F.resize(img, new_size, self.interpolation)
46
+ pad_h = self.max_size - new_size[0]
47
+ pad_w = self.max_size - new_size[1]
48
+ img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
49
+ return img
50
+
51
+
52
+ def _convert_to_rgb(image):
53
+ return image.convert('RGB')
54
+
55
+
56
+ def image_transform(
57
+ image_size: int,
58
+ is_train: bool,
59
+ mean: Optional[Tuple[float, ...]] = None,
60
+ std: Optional[Tuple[float, ...]] = None,
61
+ resize_longest_max: bool = False,
62
+ fill_color: int = 0,
63
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
64
+ ):
65
+ mean = mean or OPENAI_DATASET_MEAN
66
+ if not isinstance(mean, (list, tuple)):
67
+ mean = (mean,) * 3
68
+
69
+ std = std or OPENAI_DATASET_STD
70
+ if not isinstance(std, (list, tuple)):
71
+ std = (std,) * 3
72
+
73
+ if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
74
+ # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
75
+ image_size = image_size[0]
76
+
77
+ if isinstance(aug_cfg, dict):
78
+ aug_cfg = AugmentationCfg(**aug_cfg)
79
+ else:
80
+ aug_cfg = aug_cfg or AugmentationCfg()
81
+ normalize = Normalize(mean=mean, std=std)
82
+ if is_train:
83
+ aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None}
84
+ use_timm = aug_cfg_dict.pop('use_timm', False)
85
+ if use_timm:
86
+ from timm.data import create_transform # timm can still be optional
87
+ if isinstance(image_size, (tuple, list)):
88
+ assert len(image_size) >= 2
89
+ input_size = (3,) + image_size[-2:]
90
+ else:
91
+ input_size = (3, image_size, image_size)
92
+ # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time
93
+ aug_cfg_dict.setdefault('interpolation', 'random')
94
+ aug_cfg_dict.setdefault('color_jitter', None) # disable by default
95
+ train_transform = create_transform(
96
+ input_size=input_size,
97
+ is_training=True,
98
+ hflip=0.,
99
+ mean=mean,
100
+ std=std,
101
+ re_mode='pixel',
102
+ **aug_cfg_dict,
103
+ )
104
+ else:
105
+ train_transform = Compose([
106
+ RandomResizedCrop(
107
+ image_size,
108
+ scale=aug_cfg_dict.pop('scale'),
109
+ interpolation=InterpolationMode.BICUBIC,
110
+ ),
111
+ _convert_to_rgb,
112
+ ToTensor(),
113
+ normalize,
114
+ ])
115
+ if aug_cfg_dict:
116
+ warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).')
117
+ return train_transform
118
+ else:
119
+ if resize_longest_max:
120
+ transforms = [
121
+ ResizeMaxSize(image_size, fill=fill_color)
122
+ ]
123
+ else:
124
+ transforms = [
125
+ Resize(image_size, interpolation=InterpolationMode.BICUBIC),
126
+ CenterCrop(image_size),
127
+ ]
128
+ transforms.extend([
129
+ _convert_to_rgb,
130
+ ToTensor(),
131
+ normalize,
132
+ ])
133
+ return Compose(transforms)
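As a usage sketch (not part of the commit), the evaluation-time preprocessing built by `image_transform()` above can be obtained as follows; with no `mean`/`std` given, it falls back to the OpenAI CLIP constants from `constants.py`.
```python
# Illustrative only: the default evaluation preprocessing pipeline.
from AnomalyCLIP_lib.transform import image_transform

preprocess = image_transform(image_size=224, is_train=False)   # a torchvision Compose
# Applying it to a PIL image would yield a normalized tensor of shape [3, 224, 224]:
# tensor = preprocess(Image.open("some_image.png"))
```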
Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ # -----------------------------------------------------------------------------
2
+ # A sample Dockerfile to help you replicate our test environment
3
+ # -----------------------------------------------------------------------------
4
+
5
+ FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-runtime
6
+ WORKDIR /app
7
+ COPY . .
8
+
9
+ # Install your python and apt requirements
10
+ RUN pip install -r requirements.txt
11
+ RUN apt-get update && apt-get install $(cat apt_requirements.txt) -y
12
+ RUN chmod +x run.sh
13
+
14
+ CMD ["python3", "runner.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Qihang Zhou
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,142 @@
1
+ # AnomalyCLIP (Train once and test others)
2
+ > [**ICLR 24**] [**AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly Detection**](https://arxiv.org/pdf/2310.18961.pdf)
3
+ >
4
+ > by [Qihang Zhou*](), [Guansong Pang*](https://www.guansongpang.com/), [Yu Tian](https://yutianyt.com/), [Shibo He](https://scholar.google.com/citations?hl=zh-CN&user=5GOcb4gAAAAJ&view_op=list_works&sortby=pubdate), [Jiming Chen](https://scholar.google.com/citations?user=zK9tvo8AAAAJ&hl=zh-CN).
5
+
6
+
7
+ ## Updates
8
+
9
+ - **03.19.2024**: Code has been released!
10
+ - **08.08.2024**: Updated the code for testing a single image.
11
+
12
+ ## Introduction
13
+ Zero-shot anomaly detection (ZSAD) requires detection models trained using auxiliary data to detect anomalies without any training samples in a target dataset. It is a crucial task when training data is not accessible due to various concerns, e.g., data privacy, yet it is challenging since the models need to generalize to anomalies across different domains where the appearance of foreground objects, abnormal regions, and background features, such as defects/tumors on different products/organs, can vary significantly. Recently, large pre-trained vision-language models (VLMs), such as CLIP,
14
+ have demonstrated strong zero-shot recognition ability in various vision tasks, including anomaly detection. However, their ZSAD performance is weak since the VLMs focus more on modeling the class semantics of the foreground objects rather than on the abnormality/normality in the images.
15
+ In this paper, we introduce a novel approach, namely AnomalyCLIP, to adapt CLIP for accurate ZSAD across different domains. The key insight of AnomalyCLIP is to learn object-agnostic text prompts that capture generic normality and abnormality in an image regardless of its foreground objects. This allows our model to focus on the abnormal image regions rather than the object semantics, enabling generalized normality and abnormality recognition on diverse types of objects. Large-scale experiments on 17 real-world anomaly detection datasets show that AnomalyCLIP achieves superior zero-shot performance in detecting and segmenting anomalies in datasets of highly diverse class semantics from various defect inspection and medical imaging domains. All experiments are conducted in PyTorch-2.0.0 with a single NVIDIA RTX 3090 24GB GPU.
16
+
17
+ ## Overview of AnomalyCLIP
18
+ ![overview](https://github.com/zqhang/AnomalyCLIP/assets/19222962/4ec3e5fc-9570-41f7-8067-6e7a515841be)
19
+
20
+
21
+ ## Analysis of different text prompt templates
22
+ ![analysis](./assets/analysis.png)
23
+
24
+
25
+ ## How to Run
26
+ ### Prepare your dataset
27
+ Download the dataset below:
28
+
29
+ * Industrial Domain:
30
+ [MVTec](https://www.mvtec.com/company/research/datasets/mvtec-ad), [VisA](https://github.com/amazon-science/spot-diff), [MPDD](https://github.com/stepanje/MPDD), [BTAD](http://avires.dimi.uniud.it/papers/btad/btad.zip), [SDD](https://www.vicos.si/resources/kolektorsdd/), [DAGM](https://www.kaggle.com/datasets/mhskjelvareid/dagm-2007-competition-dataset-optical-inspection), [DTD-Synthetic](https://drive.google.com/drive/folders/10OyPzvI3H6llCZBxKxFlKWt1Pw1tkMK1)
31
+
32
+ * Medical Domain:
33
+ [HeadCT](https://www.kaggle.com/datasets/felipekitamura/head-ct-hemorrhage), [BrainMRI](https://www.kaggle.com/datasets/navoneel/brain-mri-images-for-brain-tumor-detection), [Br35H](https://www.kaggle.com/datasets/ahmedhamada0/brain-tumor-detection), [COVID-19](https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database), [ISIC](https://isic-challenge-data.s3.amazonaws.com/2016/ISBI2016_ISIC_Part1_Test_Data.zip), [CVC-ColonDB](https://figshare.com/articles/figure/Polyp_DataSet_zip/21221579), [CVC-ClinicDB](https://figshare.com/articles/figure/Polyp_DataSet_zip/21221579), [Kvasir](https://figshare.com/articles/figure/Polyp_DataSet_zip/21221579), [Endo](https://drive.google.com/file/d/1LNpLkv5ZlEUzr_RPN5rdOHaqk0SkZa3m/view), [TN3K](https://github.com/haifangong/TRFE-Net-for-thyroid-nodule-segmentation?tab=readme-ov-file).
34
+
35
+ * Google Drive link (frequently requested dataset): [SDD](https://drive.google.com/drive/folders/1oqaxUZYi44jlLT4WtT6D5T6onPTNZXsu?usp=drive_link), [Br35H](https://drive.google.com/file/d/1l9XODMBm4X23K70LtpxAxgoaBbNzr4Nc/view?usp=drive_link), [COVID-19](https://drive.google.com/file/d/1ECwI8DJmhEtcVHatxCAdFqnSmXs35WFL/view?usp=drive_link)
36
+ ### Generate the dataset JSON
37
+ Take MVTec AD for example (with multiple anomaly categories)
38
+
39
+ Structure of MVTec Folder:
40
+ ```
41
+ mvtec/
42
+
43
+ ├── meta.json
44
+
45
+ ├── bottle/
46
+ │ ├── ground_truth/
47
+ │ │ ├── broken_large/
48
+ │ │ │ └── 000_mask.png
49
+ | | | └── ...
50
+ │ │ └── ...
51
+ │ └── test/
52
+ │ ├── broken_large/
53
+ │ │ └── 000.png
54
+ | | └── ...
55
+ │ └── ...
56
+
57
+ └── ...
58
+ ```
59
+
60
+ ```bash
61
+ cd generate_dataset_json
62
+ python mvtec.py
63
+ ```
64
+
65
+ Take SDD for example (with a single anomaly category)
66
+
67
+ Structure of SDD Folder:
68
+ ```
69
+ SDD/
70
+
71
+ ├── electrical_commutators/
72
+ │ └── test/
73
+ │ ├── defect/
74
+ │ │ └── kos01_Part5_0.png
75
+ | | └── ...
76
+ │ └── good/
77
+ │ └── kos01_Part0_0.png
78
+ │ └── ...
79
+
80
+ └── meta.json
81
+ ```
82
+
83
+ ```bash
84
+ cd generate_dataset_json
85
+ python SDD.py
86
+ ```
87
+ Select the corresponding script and run it (we provide scripts for all datasets reported in the AnomalyCLIP paper). The generated JSON stores all the information that AnomalyCLIP needs; a minimal sketch of its layout is shown below.
88
+
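The sketch below shows the `meta.json` layout that `dataset.py` expects. The field names are taken from `dataset.py`, and the example entry reuses the `bottle/broken_large` paths from the folder tree above; it is an illustrative minimum, not the exact output of `generate_dataset_json/mvtec.py`.
```python
# Minimal sketch of a meta.json entry (illustrative, not the script's exact output).
import json

meta = {
    "test": {
        "bottle": [
            {
                "img_path": "bottle/test/broken_large/000.png",
                "mask_path": "bottle/ground_truth/broken_large/000_mask.png",
                "cls_name": "bottle",
                "specie_name": "broken_large",
                "anomaly": 1,
            },
        ],
    },
}
with open("mvtec/meta.json", "w") as f:
    json.dump(meta, f, indent=4)
```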
89
+ ### Custom dataset (optional)
90
+ 1. Create a new JSON script in the folder [generate_dataset_json](https://github.com/zqhang/AnomalyCLIP/tree/main/generate_dataset_json) according to the folder structure of your own dataset.
91
+ 2. Add the related info of your dataset (i.e., dataset name and class names) to the script [dataset\.py](https://github.com/zqhang/AnomalyCLIP/blob/main/dataset.py)
92
+
93
+ ### Run AnomalyCLIP
94
+ * Quick start (use the pre-trained weights)
95
+ ```bash
96
+ bash test.sh
97
+ ```
98
+
99
+ * Train your own weights
100
+ ```bash
101
+ bash train.sh
102
+ ```
103
+
104
+
105
+ ## Main results (We test all datasets with a model trained once on MVTec AD; for MVTec AD itself, AnomalyCLIP is trained on VisA.)
106
+
107
+ ### Industrial dataset
108
+ ![industrial](./assets/Industrial.png)
109
+
110
+
111
+ ### Medical dataset
112
+ ![medical](./assets/medical.png)
113
+
114
+
115
+ ## Visualization
116
+
117
+ ![hazelnut](./assets/hazelnut.png)
118
+
119
+ ![capusle](./assets/capusle.png)
120
+
121
+ ![skin](./assets/skin.png)
122
+
123
+ ![brain](./assets/brain.png)
124
+
125
+
126
+ ## Our reproduction of WinCLIP is available [here](https://github.com/zqhang/WinCLIP-pytorch)
127
+
128
+
129
+ * We thank the authors of the following code repositories: [open_clip](https://github.com/mlfoundations/open_clip), [DualCoOp](https://github.com/sunxm2357/DualCoOp), [CLIP_Surgery](https://github.com/xmed-lab/CLIP_Surgery), and [VAND](https://github.com/ByChelsea/VAND-APRIL-GAN/tree/master).
130
+
131
+ ## BibTex Citation
132
+
133
+ If you find this paper and repository useful, please cite our paper.
134
+
135
+ ```
136
+ @inproceedings{zhou2023anomalyclip,
137
+ title={AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly Detection},
138
+ author={Zhou, Qihang and Pang, Guansong and Tian, Yu and He, Shibo and Chen, Jiming},
139
+ booktitle={The Twelfth International Conference on Learning Representations},
140
+ year={2023}
141
+ }
142
+ ```
checkpoints/9_12_4_multiscale/epoch_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a89d1ffe49d86995e936c8e91515efa878d4e1777c73888622091e89a8df9e5b
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_10.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7205c05df3319984b349686cbfd8cc01d3ac241a82f33943e9217cbb85604b0b
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_11.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40017b0588b3e41aea4cf3902b388bbee494201b4406583f0a9c96f90818a986
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_12.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef4bdfad5689797d48296eeceb57343aabba5ae5a2c7e57d4b9e225d2d254252
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_13.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4381596b44bbaa33e7b04b4a19a46582980f1ee8742414d71147c8be95ef90d7
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_14.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd2a3865c4cf1363b80f301da7dc181a54787e3c218cc1f3464650a5f749cb26
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_15.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ce202da3e6486a864b904fdfed5057de75846c5834e446fd1d2fe7f97acb44
3
+ size 22631975
checkpoints/9_12_4_multiscale/epoch_2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6bfcd2ed1725b3d58dd06d5d38f7ef6d3b9c49d817bb4714a16f3153c3d7450
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_3.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5af4c383158732845ac2ef195e5036e8528f187ed80173c8d993830a0abed64c
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ab9a9909711c89cac5f02f0c46c7baac82b09bfaca59a83271a50b195cad89f
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_5.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:317837a0ef5b46d2476c234d3fa77e8cfab7bbfa85711f5fe7eb7f50ea7151a0
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_6.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04379155c0df8d4e1194335427091e626df512a9747e47c1bbb7ee3a55708164
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c5a77a355c27266d6a9c7b6da4b3ee2c193596873d889822e68a797a2688b2
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_8.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92bfa088eccb2efb71b27c9703c0f21158903581efd7292f42938ad96940c82
3
+ size 22631493
checkpoints/9_12_4_multiscale/epoch_9.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43f0eca2d506b88370a06c94a6cd557360c7bcb179a4f3f24981230349a9581a
3
+ size 22631493
checkpoints/9_12_4_multiscale/log.txt ADDED
File without changes
checkpoints/9_12_4_multiscale_visa/epoch_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de5df7fc2ec18acb5709e65b1889d586974d365c39d1aa4df728336633e4ee70
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_10.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397255934bd313beeab2b610fa901f113e12342974687147cad78f502e5ae7e5
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_11.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:843fb9df1c46da89f6976a42d10d5fe34675ad48eccb365e3f43785f925c2ae9
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_12.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f69ad9ae4bcc5823fdd9ad56b51ec57cc641270280a1776c1014ea1969f282
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_13.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bf5fd9c269e3f68e81134f4361c3239ba14d5f2cd4e3564f93f5b59f616cd19
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_14.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969dbaaa1a986f17d79dfb81d2ce90443d0e9dd9f19db7fd9a9190f97cc8e3d4
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_15.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:415c5dcb52668b8c33fb9c1a351c686d632b919df5b384d63fa9ce7a2338ced4
3
+ size 22631975
checkpoints/9_12_4_multiscale_visa/epoch_2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c98c722977ac0fc42c1067a8038656c10466728f6e9d448aad9e3f6b3d5368b6
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_3.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e7a65d6b9ff057b5fa53bfc59bfa57a25619b5a5d9cd40ed37579e312ab4aa
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f56b0ed7bd9da05f77780a3c4318e038c258b99a02ad1455652cad146b3dded5
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_5.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2c44c082a19abde2993e80044466c1e45a620cc24aad39e85bd65ed60d3572d
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_6.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:402d63bca2150631fb09d8d1c7529712a4ee8eea29bd7746412eae99b4ec6dc5
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:081526236212ebc011ec53babaf8f0da7e25fbe92300aa7cc68eb41ca29b054f
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_8.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2587be72657ab30fc26bc5957e130ba7359ff53c32beb7984be517a818427c
3
+ size 22631493
checkpoints/9_12_4_multiscale_visa/epoch_9.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4850f209b34912c33718b86c13d2a01c340907d182236a8ef8903f35c80daec0
3
+ size 22631493
dataset.py ADDED
@@ -0,0 +1,50 @@
1
+ import torch.utils.data as data
2
+ import json
3
+ import random
4
+ from PIL import Image
5
+ import numpy as np
6
+ import torch
7
+ import os
8
+
9
+ class Dataset(data.Dataset):
10
+ def __init__(self, root, transform, target_transform, dataset_name, mode='test'):
11
+ self.root = root
12
+ self.transform = transform
13
+ self.target_transform = target_transform
14
+ self.data_all = []
15
+ meta_info = json.load(open(f'{self.root}/meta.json', 'r'))
16
+ name = self.root.split('/')[-1]
17
+ meta_info = meta_info[mode]
18
+
19
+ self.cls_names = list(meta_info.keys())
20
+ for cls_name in self.cls_names:
21
+ self.data_all.extend(meta_info[cls_name])
22
+ self.length = len(self.data_all)
23
+
24
+ self.obj_list = [folder for folder in os.listdir(root) if os.path.isdir(os.path.join(root, folder)) and not folder.startswith('.')]
25
+ self.class_name_map_class_id = {o: i for i, o in enumerate(self.obj_list)}
26
+
27
+ def __len__(self):
28
+ return self.length
29
+
30
+ def __getitem__(self, index):
31
+ data = self.data_all[index]
32
+ img_path, mask_path, cls_name, specie_name, anomaly = data['img_path'], data['mask_path'], data['cls_name'], \
33
+ data['specie_name'], data['anomaly']
34
+ img = Image.open(os.path.join(self.root, img_path))
35
+ if anomaly == 0:
36
+ img_mask = Image.fromarray(np.zeros((img.size[1], img.size[0]), dtype=np.uint8), mode='L')  # PIL size is (W, H); numpy expects (H, W)
37
+ else:
38
+ if os.path.isdir(os.path.join(self.root, mask_path)):
39
+ # the mask path may point to a directory when no pixel-level mask exists; use an empty mask so classification-only samples do not raise an error
40
+ img_mask = Image.fromarray(np.zeros((img.size[1], img.size[0]), dtype=np.uint8), mode='L')
41
+ else:
42
+ img_mask = np.array(Image.open(os.path.join(self.root, mask_path)).convert('L')) > 0
43
+ img_mask = Image.fromarray(img_mask.astype(np.uint8) * 255, mode='L')
44
+ # transforms
45
+ img = self.transform(img) if self.transform is not None else img
46
+ img_mask = self.target_transform(
47
+ img_mask) if self.target_transform is not None and img_mask is not None else img_mask
48
+ img_mask = [] if img_mask is None else img_mask
49
+ return {'img': img, 'img_mask': img_mask, 'cls_name': cls_name, 'anomaly': anomaly,
50
+ 'img_path': os.path.join(self.root, img_path), "cls_id": self.class_name_map_class_id[cls_name]}
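A minimal usage sketch for the `Dataset` class above (not part of the commit): the transforms are plain torchvision pipelines chosen only for illustration, and the data path is a placeholder for wherever the MVTec-style folder and its `meta.json` live.
```python
# Illustrative only: iterating an MVTec-style folder with the Dataset class above.
from torchvision import transforms
from dataset import Dataset

preprocess = transforms.Compose([transforms.Resize((518, 518)), transforms.ToTensor()])
mask_transform = transforms.Compose([transforms.Resize((518, 518)), transforms.ToTensor()])

test_data = Dataset(root="./data/mvtec", transform=preprocess,
                    target_transform=mask_transform, dataset_name="mvtec", mode="test")
sample = test_data[0]
print(sample["cls_name"], sample["anomaly"], sample["img"].shape)
```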
datasets/rayan_dataset.py ADDED
@@ -0,0 +1,127 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Do Not Alter This File!
3
+ # -----------------------------------------------------------------------------
4
+ # The following code is part of the logic used for loading and evaluating your
5
+ # output scores. Please DO NOT modify this section, as upon your submission,
6
+ # the whole evaluation logic will be overwritten by the original code.
7
+ # -----------------------------------------------------------------------------
8
+ # If you'd like to make modifications, you can create a completely new Dataset
9
+ # class or a child class that inherits from this one and use that with your
10
+ # data loader.
11
+ # -----------------------------------------------------------------------------
12
+
13
+ import os
14
+ from enum import Enum
15
+
16
+ import PIL
17
+ import torch
18
+ from torchvision import transforms
19
+
20
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
21
+ IMAGENET_STD = [0.229, 0.224, 0.225]
22
+
23
+
24
+ class DatasetSplit(Enum):
25
+ TRAIN = "train"
26
+ VAL = "val"
27
+ TEST = "test"
28
+
29
+
30
+ class RayanDataset(torch.utils.data.Dataset):
31
+ def __init__(
32
+ self,
33
+ source,
34
+ classname,
35
+ input_size=518,
36
+ output_size=224,
37
+ split=DatasetSplit.TEST,
38
+ external_transform=None,
39
+ **kwargs,
40
+ ):
41
+ super().__init__()
42
+ self.source = source
43
+ self.split = split
44
+ self.classnames_to_use = [classname]
45
+ self.imgpaths_per_class, self.data_to_iterate = self.get_image_data()
46
+
47
+ if external_transform is None:
48
+ self.transform_img = [
49
+ transforms.Resize((input_size, input_size)),
50
+ transforms.CenterCrop(input_size),
51
+ transforms.ToTensor(),
52
+ transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
53
+ ]
54
+ self.transform_img = transforms.Compose(self.transform_img)
55
+ else:
56
+ self.transform_img = external_transform
57
+
58
+ # Output size of the mask has to be of shape: 1×224×224
59
+ self.transform_mask = [
60
+ transforms.Resize((output_size, output_size)),
61
+ transforms.CenterCrop(output_size),
62
+ transforms.ToTensor(),
63
+ ]
64
+ self.transform_mask = transforms.Compose(self.transform_mask)
65
+ self.output_shape = (1, output_size, output_size)
66
+
67
+ def __getitem__(self, idx):
68
+ classname, anomaly, image_path, mask_path = self.data_to_iterate[idx]
69
+ image = PIL.Image.open(image_path).convert("RGB")
70
+ image = self.transform_img(image)
71
+
72
+ if self.split == DatasetSplit.TEST and mask_path is not None:
73
+ mask = PIL.Image.open(mask_path).convert("L")
74
+ mask = self.transform_mask(mask) > 0
75
+ else:
76
+ mask = torch.zeros([*self.output_shape])
77
+
78
+ return {
79
+ "image": image,
80
+ "mask": mask,
81
+ "is_anomaly": int(anomaly != "good"),
82
+ "image_path": image_path,
83
+ }
84
+
85
+ def __len__(self):
86
+ return len(self.data_to_iterate)
87
+
88
+ def get_image_data(self):
89
+ imgpaths_per_class = {}
90
+ maskpaths_per_class = {}
91
+
92
+ for classname in self.classnames_to_use:
93
+ classpath = os.path.join(self.source, classname, self.split.value)
94
+ maskpath = os.path.join(self.source, classname, "ground_truth")
95
+ anomaly_types = os.listdir(classpath)
96
+
97
+ imgpaths_per_class[classname] = {}
98
+ maskpaths_per_class[classname] = {}
99
+
100
+ for anomaly in anomaly_types:
101
+ anomaly_path = os.path.join(classpath, anomaly)
102
+ anomaly_files = sorted(os.listdir(anomaly_path))
103
+ imgpaths_per_class[classname][anomaly] = [
104
+ os.path.join(anomaly_path, x) for x in anomaly_files
105
+ ]
106
+
107
+ if self.split == DatasetSplit.TEST and anomaly != "good":
108
+ anomaly_mask_path = os.path.join(maskpath, anomaly)
109
+ anomaly_mask_files = sorted(os.listdir(anomaly_mask_path))
110
+ maskpaths_per_class[classname][anomaly] = [
111
+ os.path.join(anomaly_mask_path, x) for x in anomaly_mask_files
112
+ ]
113
+ else:
114
+ maskpaths_per_class[classname]["good"] = None
115
+
116
+ data_to_iterate = []
117
+ for classname in sorted(imgpaths_per_class.keys()):
118
+ for anomaly in sorted(imgpaths_per_class[classname].keys()):
119
+ for i, image_path in enumerate(imgpaths_per_class[classname][anomaly]):
120
+ data_tuple = [classname, anomaly, image_path]
121
+ if self.split == DatasetSplit.TEST and anomaly != "good":
122
+ data_tuple.append(maskpaths_per_class[classname][anomaly][i])
123
+ else:
124
+ data_tuple.append(None)
125
+ data_to_iterate.append(data_tuple)
126
+
127
+ return imgpaths_per_class, data_to_iterate
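Since this file must not be modified, here is a separate usage sketch: `RayanDataset` plugs directly into a standard `DataLoader`. The `source` path and class name below are placeholders consistent with the example path mentioned in `evaluation/base_eval.py`.
```python
# Illustrative only: loading test batches with the (unmodified) RayanDataset above.
import torch
from datasets.rayan_dataset import RayanDataset, DatasetSplit

dataset = RayanDataset(source="./data", classname="photovoltaic_module",
                       split=DatasetSplit.TEST)
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
batch = next(iter(loader))
print(batch["image"].shape, batch["mask"].shape, batch["is_anomaly"])
```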
docker-compose.yml ADDED
@@ -0,0 +1,21 @@
1
+ # -----------------------------------------------------------------------------
2
+ # A sample Docker Compose file to help you replicate our test environment
3
+ # -----------------------------------------------------------------------------
4
+
5
+ services:
6
+ zsad-service:
7
+ image: zsad-image:1
8
+ build:
9
+ context: .
10
+ container_name: zsad-container
11
+ volumes:
12
+ - ./shared_folder:/app/output
13
+ deploy:
14
+ resources:
15
+ reservations:
16
+ devices:
17
+ - driver: nvidia
18
+ count: all
19
+ capabilities: [gpu]
20
+
21
+ command: [ "python3", "runner.py" ]
evaluation/base_eval.py ADDED
@@ -0,0 +1,293 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Do Not Alter This File!
3
+ # -----------------------------------------------------------------------------
4
+ # The following code is part of the logic used for loading and evaluating your
5
+ # output scores. Please DO NOT modify this section, as upon your submission,
6
+ # the whole evaluation logic will be overwritten by the original code.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ import warnings
10
+ import os
11
+ from pathlib import Path
12
+ import csv
13
+ import json
14
+ import torch
15
+
16
+ import datasets.rayan_dataset as rayan_dataset
17
+ from evaluation.utils.metrics import compute_metrics
18
+
19
+ warnings.filterwarnings("ignore")
20
+
21
+
22
+ class BaseEval:
23
+ def __init__(self, cfg):
24
+ self.cfg = cfg
25
+ self.device = torch.device(
26
+ "cuda:{}".format(cfg["device"]) if torch.cuda.is_available() else "cpu"
27
+ )
28
+
29
+ self.path = cfg["datasets"]["data_path"]
30
+ self.dataset = cfg["datasets"]["dataset_name"]
31
+ self.save_csv = cfg["testing"]["save_csv"]
32
+ self.save_json = cfg["testing"]["save_json"]
33
+ self.categories = cfg["datasets"]["class_name"]
34
+ if isinstance(self.categories, str):
35
+ if self.categories.lower() == "all":
36
+ if self.dataset == "rayan_dataset":
37
+ self.categories = self.get_available_class_names(self.path)
38
+ else:
39
+ self.categories = [self.categories]
40
+ self.output_dir = cfg["testing"]["output_dir"]
41
+ os.makedirs(self.output_dir, exist_ok=True)
42
+ self.scores_dir = cfg["testing"]["output_scores_dir"]
43
+ self.class_name_mapping_dir = cfg["testing"]["class_name_mapping_dir"]
44
+
45
+ self.leaderboard_metric_weights = {
46
+ "image_auroc": 1.2,
47
+ "image_ap": 1.1,
48
+ "image_f1": 1.1,
49
+ "pixel_auroc": 1.0,
50
+ "pixel_aupro": 1.4,
51
+ "pixel_ap": 1.3,
52
+ "pixel_f1": 1.3,
53
+ }
54
+
55
+ def get_available_class_names(self, root_data_path):
56
+ all_items = os.listdir(root_data_path)
57
+ folder_names = [
58
+ item
59
+ for item in all_items
60
+ if os.path.isdir(os.path.join(root_data_path, item))
61
+ ]
62
+
63
+ return folder_names
64
+
65
+ def load_datasets(self, category):
66
+ dataset_classes = {
67
+ "rayan_dataset": rayan_dataset.RayanDataset,
68
+ }
69
+
70
+ dataset_splits = {
71
+ "rayan_dataset": rayan_dataset.DatasetSplit.TEST,
72
+ }
73
+
74
+ test_dataset = dataset_classes[self.dataset](
75
+ source=self.path,
76
+ split=dataset_splits[self.dataset],
77
+ classname=category,
78
+ )
79
+ return test_dataset
80
+
81
+ def get_category_metrics(self, category):
82
+ print(f"Loading scores of '{category}'")
83
+ gt_sp, pr_sp, gt_px, pr_px, _ = self.load_category_scores(category)
84
+
85
+ print(f"Computing metrics for '{category}'")
86
+ image_metric, pixel_metric = compute_metrics(gt_sp, pr_sp, gt_px, pr_px)
87
+
88
+ return image_metric, pixel_metric
89
+
90
+ def load_category_scores(self, category):
91
+ raise NotImplementedError()
92
+
93
+ def get_scores_path_for_image(self, image_path):
94
+ """example image_path: './data/photovoltaic_module/test/good/037.png'"""
95
+ path = Path(image_path)
96
+
97
+ category, split, anomaly_type = path.parts[-4:-1]
98
+ image_name = path.stem
99
+
100
+ return os.path.join(
101
+ self.scores_dir, category, split, anomaly_type, f"{image_name}_scores.json"
102
+ )
103
+
104
+ def calc_leaderboard_score(self, **metrics):
105
+ weighted_sum = 0
106
+ total_weight = 0
107
+ for key, weight in self.leaderboard_metric_weights.items():
108
+ metric = metrics.get(key)
109
+ weighted_sum += metric * weight
110
+ total_weight += weight
111
+
112
+ if total_weight == 0:
113
+ return 0
114
+
115
+ return weighted_sum / total_weight
116
+
117
+ def main(self):
118
+ image_auroc_list = []
119
+ image_f1_list = []
120
+ image_ap_list = []
121
+ pixel_auroc_list = []
122
+ pixel_f1_list = []
123
+ pixel_ap_list = []
124
+ pixel_aupro_list = []
125
+ leaderboard_score_list = []
126
+ for category in self.categories:
127
+ image_metric, pixel_metric = self.get_category_metrics(
128
+ category=category,
129
+ )
130
+ image_auroc, image_f1, image_ap = image_metric
131
+ pixel_auroc, pixel_f1, pixel_ap, pixel_aupro = pixel_metric
132
+ leaderboard_score = self.calc_leaderboard_score(
133
+ image_auroc=image_auroc,
134
+ image_f1=image_f1,
135
+ image_ap=image_ap,
136
+ pixel_auroc=pixel_auroc,
137
+ pixel_aupro=pixel_aupro,
138
+ pixel_f1=pixel_f1,
139
+ pixel_ap=pixel_ap,
140
+ )
141
+
142
+ image_auroc_list.append(image_auroc)
143
+ image_f1_list.append(image_f1)
144
+ image_ap_list.append(image_ap)
145
+ pixel_auroc_list.append(pixel_auroc)
146
+ pixel_f1_list.append(pixel_f1)
147
+ pixel_ap_list.append(pixel_ap)
148
+ pixel_aupro_list.append(pixel_aupro)
149
+ leaderboard_score_list.append(leaderboard_score)
150
+
151
+ print(category)
152
+ print(
153
+ "[image level] auroc:{}, f1:{}, ap:{}".format(
154
+ image_auroc * 100,
155
+ image_f1 * 100,
156
+ image_ap * 100,
157
+ )
158
+ )
159
+ print(
160
+ "[pixel level] auroc:{}, f1:{}, ap:{}, aupro:{}".format(
161
+ pixel_auroc * 100,
162
+ pixel_f1 * 100,
163
+ pixel_ap * 100,
164
+ pixel_aupro * 100,
165
+ )
166
+ )
167
+ print(
168
+ "leaderboard score:{}".format(
169
+ leaderboard_score * 100,
170
+ )
171
+ )
172
+
173
+ image_auroc_mean = sum(image_auroc_list) / len(image_auroc_list)
174
+ image_f1_mean = sum(image_f1_list) / len(image_f1_list)
175
+ image_ap_mean = sum(image_ap_list) / len(image_ap_list)
176
+ pixel_auroc_mean = sum(pixel_auroc_list) / len(pixel_auroc_list)
177
+ pixel_f1_mean = sum(pixel_f1_list) / len(pixel_f1_list)
178
+ pixel_ap_mean = sum(pixel_ap_list) / len(pixel_ap_list)
179
+ pixel_aupro_mean = sum(pixel_aupro_list) / len(pixel_aupro_list)
180
+ leaderboard_score_mean = sum(leaderboard_score_list) / len(
181
+ leaderboard_score_list
182
+ )
183
+
184
+ print("mean")
185
+ print(
186
+ "[image level] auroc:{}, f1:{}, ap:{}".format(
187
+ image_auroc_mean * 100, image_f1_mean * 100, image_ap_mean * 100
188
+ )
189
+ )
190
+ print(
191
+ "[pixel level] auroc:{}, f1:{}, ap:{}, aupro:{}".format(
192
+ pixel_auroc_mean * 100,
193
+ pixel_f1_mean * 100,
194
+ pixel_ap_mean * 100,
195
+ pixel_aupro_mean * 100,
196
+ )
197
+ )
198
+ print(
199
+ "leaderboard score:{}".format(
200
+ leaderboard_score_mean * 100,
201
+ )
202
+ )
203
+
204
+ # Save the final results as a csv file
205
+ if self.save_csv:
206
+ with open(self.class_name_mapping_dir, "r") as f:
207
+ class_name_mapping_dict = json.load(f)
208
+ csv_data = [
209
+ [
210
+ "Category",
211
+ "pixel_auroc",
212
+ "pixel_f1",
213
+ "pixel_ap",
214
+ "pixel_aupro",
215
+ "image_auroc",
216
+ "image_f1",
217
+ "image_ap",
218
+ "leaderboard_score",
219
+ ]
220
+ ]
221
+ for i, category in enumerate(self.categories):
222
+ csv_data.append(
223
+ [
224
+ class_name_mapping_dict[category],
225
+ pixel_auroc_list[i] * 100,
226
+ pixel_f1_list[i] * 100,
227
+ pixel_ap_list[i] * 100,
228
+ pixel_aupro_list[i] * 100,
229
+ image_auroc_list[i] * 100,
230
+ image_f1_list[i] * 100,
231
+ image_ap_list[i] * 100,
232
+ leaderboard_score_list[i] * 100,
233
+ ]
234
+ )
235
+ csv_data.append(
236
+ [
237
+ "mean",
238
+ pixel_auroc_mean * 100,
239
+ pixel_f1_mean * 100,
240
+ pixel_ap_mean * 100,
241
+ pixel_aupro_mean * 100,
242
+ image_auroc_mean * 100,
243
+ image_f1_mean * 100,
244
+ image_ap_mean * 100,
245
+ leaderboard_score_mean * 100,
246
+ ]
247
+ )
248
+
249
+ csv_file_path = os.path.join(self.output_dir, "results.csv")
250
+ with open(csv_file_path, mode="w", newline="") as file:
251
+ writer = csv.writer(file)
252
+ writer.writerows(csv_data)
253
+
254
+ # Save the final results as a json file
255
+ if self.save_json:
256
+ json_data = []
257
+ with open(self.class_name_mapping_dir, "r") as f:
258
+ class_name_mapping_dict = json.load(f)
259
+ for i, category in enumerate(self.categories):
260
+ json_data.append(
261
+ {
262
+ "Category": class_name_mapping_dict[category],
263
+ "pixel_auroc": pixel_auroc_list[i] * 100,
264
+ "pixel_f1": pixel_f1_list[i] * 100,
265
+ "pixel_ap": pixel_ap_list[i] * 100,
266
+ "pixel_aupro": pixel_aupro_list[i] * 100,
267
+ "image_auroc": image_auroc_list[i] * 100,
268
+ "image_f1": image_f1_list[i] * 100,
269
+ "image_ap": image_ap_list[i] * 100,
270
+ "leaderboard_score": leaderboard_score_list[i] * 100,
271
+ }
272
+ )
273
+ json_data.append(
274
+ {
275
+ "Category": "mean",
276
+ "pixel_auroc": pixel_auroc_mean * 100,
277
+ "pixel_f1": pixel_f1_mean * 100,
278
+ "pixel_ap": pixel_ap_mean * 100,
279
+ "pixel_aupro": pixel_aupro_mean * 100,
280
+ "image_auroc": image_auroc_mean * 100,
281
+ "image_f1": image_f1_mean * 100,
282
+ "image_ap": image_ap_mean * 100,
283
+ "leaderboard_score": leaderboard_score_mean * 100,
284
+ }
285
+ )
286
+
287
+ json_file_path = os.path.join(self.output_dir, "results.json")
288
+ with open(json_file_path, mode="w") as file:
289
+ final_json = {
290
+ "result": leaderboard_score_mean * 100,
291
+ "metadata": json_data,
292
+ }
293
+ json.dump(final_json, file, indent=4)
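For intuition (this file is also not to be modified), the leaderboard score defined in `calc_leaderboard_score` is a weighted mean of the seven metrics. A small worked example with made-up metric values:
```python
# Worked example with made-up numbers; the weights mirror leaderboard_metric_weights.
weights = {"image_auroc": 1.2, "image_ap": 1.1, "image_f1": 1.1,
           "pixel_auroc": 1.0, "pixel_aupro": 1.4, "pixel_ap": 1.3, "pixel_f1": 1.3}
metrics = {"image_auroc": 0.90, "image_ap": 0.95, "image_f1": 0.88,
           "pixel_auroc": 0.97, "pixel_aupro": 0.85, "pixel_ap": 0.40, "pixel_f1": 0.45}
score = sum(metrics[k] * w for k, w in weights.items()) / sum(weights.values())
print(round(score, 4))   # ~0.7569, reported as ~75.69 after the *100 scaling
```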
evaluation/class_name_mapping.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "pill": "industrial_01",
3
+ "photovoltaic_module": "industrial_02",
4
+ "capsules": "industrial_03"
5
+ }
evaluation/eval_main.py ADDED
@@ -0,0 +1,78 @@
1
+ # -----------------------------------------------------------------------------
2
+ # Do Not Alter This File!
3
+ # -----------------------------------------------------------------------------
4
+ # The following code is part of the logic used for loading and evaluating your
5
+ # output scores. Please DO NOT modify this section, as upon your submission,
6
+ # the whole evaluation logic will be overwritten by the original code.
7
+ # -----------------------------------------------------------------------------
8
+
9
+ import warnings
10
+ import argparse
11
+ import os
12
+ import sys
13
+
14
+ sys.path.append(os.getcwd())
15
+ from evaluation.json_score import JsonScoreEvaluator
16
+
17
+ warnings.filterwarnings("ignore")
18
+
19
+
20
+ def get_args():
21
+ parser = argparse.ArgumentParser(description="Rayan ZSAD Evaluation Code")
22
+ parser.add_argument("--data_path", type=str, default=None, help="dataset path")
23
+ parser.add_argument("--dataset_name", type=str, default=None, help="dataset name")
24
+ parser.add_argument("--class_name", type=str, default=None, help="category")
25
+ parser.add_argument("--device", type=int, default=None, help="gpu id")
26
+ parser.add_argument(
27
+ "--output_dir", type=str, default=None, help="save results path"
28
+ )
29
+ parser.add_argument(
30
+ "--output_scores_dir", type=str, default=None, help="save scores path"
31
+ )
32
+ parser.add_argument("--save_csv", type=str, default=None, help="save csv")
33
+ parser.add_argument("--save_json", type=str, default=None, help="save json")
34
+
35
+ parser.add_argument(
36
+ "--class_name_mapping_dir",
37
+ type=str,
38
+ default=None,
39
+ help="mapping from actual class names to class numbers",
40
+ )
41
+ args = parser.parse_args()
42
+ return args
43
+
44
+
45
+ def load_args(cfg, args):
46
+ cfg["datasets"]["data_path"] = args.data_path
47
+ assert os.path.exists(
48
+ cfg["datasets"]["data_path"]
49
+ ), f"The dataset path {cfg['datasets']['data_path']} does not exist."
50
+ cfg["datasets"]["dataset_name"] = args.dataset_name
51
+ cfg["datasets"]["class_name"] = args.class_name
52
+ cfg["device"] = args.device
53
+ if isinstance(cfg["device"], int):
54
+ cfg["device"] = str(cfg["device"])
55
+ cfg["testing"]["output_dir"] = args.output_dir
56
+ cfg["testing"]["output_scores_dir"] = args.output_scores_dir
57
+ os.makedirs(cfg["testing"]["output_scores_dir"], exist_ok=True)
58
+
59
+ cfg["testing"]["class_name_mapping_dir"] = args.class_name_mapping_dir
60
+ if args.save_csv.lower() == "true":
61
+ cfg["testing"]["save_csv"] = True
62
+ else:
63
+ cfg["testing"]["save_csv"] = False
64
+
65
+ if args.save_json.lower() == "true":
66
+ cfg["testing"]["save_json"] = True
67
+ else:
68
+ cfg["testing"]["save_json"] = False
69
+
70
+ return cfg
71
+
72
+
73
+ if __name__ == "__main__":
74
+ args = get_args()
75
+ cfg = load_args(cfg={"datasets": {}, "testing": {}, "models": {}}, args=args)
76
+ print(cfg)
77
+ model = JsonScoreEvaluator(cfg=cfg)
78
+ model.main()