Load CLIP model from transformers
- requirements.txt +1 -2
- src/nn.py +35 -179
- src/retrieval.py +2 -2
requirements.txt CHANGED
@@ -6,5 +6,4 @@ gradio==3.33.1
 gdown==4.4.0
 inflect==6.0.4
 nltk==3.8.1
-
-transformers==4.26.1
+transformers==4.29.2
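For context, transformers 4.29.2 is the release the Space now pins, and it provides the CLIPModel and CLIPProcessor classes that src/nn.py relies on below. A quick sanity-check sketch (not part of the Space; the printed projection dimension is 768 for ViT-L/14):

# Sketch: verify the pinned transformers release and the CLIP classes used in src/nn.py.
import transformers
from packaging import version
from transformers import CLIPModel, CLIPProcessor

assert version.parse(transformers.__version__) >= version.parse("4.29.2")

# same checkpoint id that src/nn.py hard-codes
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
print(model.config.projection_dim)  # 768 for ViT-L/14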
src/nn.py CHANGED
@@ -6,10 +6,9 @@ from typing import Optional
 import faiss
 import gdown
 import numpy as np
-import open_clip
 import torch
-from open_clip.transformer import Transformer
 from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
 
 from src.retrieval import ArrowMetadataProvider
 from src.transforms import TextCompose, default_vocabulary_transforms
@@ -28,73 +27,46 @@ class CaSED(torch.nn.Module):
     Args:
         index_name (str): Name of the faiss index to use.
         vocabulary_transforms (TextCompose): List of transforms to apply to the vocabulary.
-        model_name (str): Name of the CLIP model to use. Defaults to "ViT-L-14".
-        pretrained (str): Pretrained weights to use for the CLIP model. Defaults to "openai".
 
     Extra hparams:
         alpha (float): Weight for the average of the image and text predictions. Defaults to 0.5.
         artifact_dir (str): Path to the directory where the databases are stored. Defaults to
             "artifacts/".
         retrieval_num_results (int): Number of results to return. Defaults to 10.
-        vocabulary_prompt (str): Prompt to use for the vocabulary. Defaults to "{}".
-        tau (float): Temperature to use for the classifier. Defaults to 1.0.
     """
 
     def __init__(
        self,
        index_name: str = "ViT-L-14_CC12M",
        vocabulary_transforms: TextCompose = default_vocabulary_transforms(),
-       model_name: str = "ViT-L-14",
-       pretrained: str = "openai",
-       vocabulary_prompt: str = "{}",
        **kwargs,
     ):
         super().__init__()
-        self._prev_vocab_words = None
-        self._prev_used_prompts = None
-        self._prev_vocab_words_z = None
-
-        model, _, preprocess = open_clip.create_model_and_transforms(
-            model_name, pretrained=pretrained, device="cpu"
-        )
-        tokenizer = open_clip.get_tokenizer(model_name)
-        self.tokenizer = tokenizer
-        self.preprocess = preprocess
 
+        # load CLIP
+        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(DEVICE)
+        self.index_name = index_name
+        self.vocabulary_transforms = vocabulary_transforms
+        self.vision_encoder = model.vision_model
+        self.vision_proj = model.visual_projection
+        self.language_encoder = model.text_model
+        self.language_proj = model.text_projection
+        self.logit_scale = model.logit_scale.exp()
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+        # set hparams
         kwargs["alpha"] = kwargs.get("alpha", 0.5)
         kwargs["artifact_dir"] = kwargs.get("artifact_dir", "artifacts/")
         kwargs["retrieval_num_results"] = kwargs.get("retrieval_num_results", 10)
-        vocabulary_prompt = kwargs.get("vocabulary_prompt", "{}")
-        kwargs["vocabulary_prompts"] = [vocabulary_prompt]
-        kwargs["tau"] = kwargs.get("tau", 1.0)
         self.hparams = kwargs
 
-        language_encoder = LanguageTransformer(
-            model.transformer,
-            model.token_embedding,
-            model.positional_embedding,
-            model.ln_final,
-            model.text_projection,
-            model.attn_mask,
-        )
-        scale = model.logit_scale.exp().item()
-        classifier = NearestNeighboursClassifier(scale=scale, tau=self.hparams["tau"])
-
-        self.index_name = index_name
-        self.vocabulary_transforms = vocabulary_transforms
-        self.vision_encoder = model.visual
-        self.language_encoder = language_encoder
-        self.classifier = classifier
-
         # download databases
         self.prepare_data()
 
-        # load faiss indices
+        # load faiss indices and metadata providers
         indices_list_dir = Path(self.hparams["artifact_dir"]) / "models" / "retrieval"
         indices_fp = indices_list_dir / "indices.json"
         self.indices = json.load(open(indices_fp))
-
-        # load faiss indices and metadata providers
         self.resources = {}
         for name, index_fp in self.indices.items():
             text_index_fp = Path(index_fp) / "text.index"
@@ -107,7 +79,7 @@ class CaSED(torch.nn.Module):
 
             self.resources[name] = {
                 "device": DEVICE,
-                "model":
+                "model": "ViT-L-14",
                 "text_index": text_index,
                 "metadata_provider": metadata_provider,
             }
@@ -175,156 +147,40 @@ class CaSED(torch.nn.Module):
 
         return vocabularies
 
-    @torch.no_grad()
-    def encode_vocabulary(self, vocabulary: list, use_prompts: bool = False) -> torch.Tensor:
-        """Encode a vocabulary.
-
-        Args:
-            vocabulary (list): List of words.
-        """
-        # check if vocabulary has changed
-        if vocabulary == self._prev_vocab_words and use_prompts == self._prev_used_prompts:
-            return self._prev_vocab_words_z
-
-        # tokenize vocabulary
-        classes = [c.replace("_", " ") for c in vocabulary]
-        prompts = self.hparams["vocabulary_prompts"] if use_prompts else ["{}"]
-        texts_views = [[p.format(c) for c in classes] for p in prompts]
-        tokenized_texts_views = [
-            torch.cat([self.tokenizer(prompt) for prompt in class_prompts])
-            for class_prompts in texts_views
-        ]
-        tokenized_texts_views = torch.stack(tokenized_texts_views).to(DEVICE)
-
-        # encode vocabulary
-        T, C, _ = tokenized_texts_views.shape
-        texts_z_views = self.language_encoder(tokenized_texts_views.view(T * C, -1))
-        texts_z_views = texts_z_views.view(T, C, -1)
-        texts_z_views = texts_z_views / texts_z_views.norm(dim=-1, keepdim=True)
-
-        # cache vocabulary
-        self._prev_vocab_words = vocabulary
-        self._prev_used_prompts = use_prompts
-        self._prev_vocab_words_z = texts_z_views
-
-        return texts_z_views
-
     @torch.no_grad()
     def forward(self, image_fp: str, alpha: Optional[float] = None) -> torch.Tensor():
-        ...
-        vocabulary = self.query_index(image_z)
+        # forward the image
+        image = self.processor(images=Image.open(image_fp), return_tensors="pt")
+        image["pixel_values"] = image["pixel_values"].to(DEVICE)
+        image_z = self.vision_proj(self.vision_encoder(**image)[1])
 
         # generate a single text embedding from the unfiltered vocabulary
-        ...
+        vocabulary = self.query_index(image_z)
+        text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
+        text["input_ids"] = text["input_ids"][:, :77].to(DEVICE)
+        text["attention_mask"] = text["attention_mask"][:, :77].to(DEVICE)
+        text_z = self.language_encoder(**text)[1]
+        text_z = self.language_proj(text_z)
 
         # filter the vocabulary, embed it, and get its mean embedding
         vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
-        ...
+        text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
+        text = {k: v.to(DEVICE) for k, v in text.items()}
+        vocabulary_z = self.language_encoder(**text)[1]
+        vocabulary_z = self.language_proj(vocabulary_z)
+        vocabulary_z = vocabulary_z / vocabulary_z.norm(dim=-1, keepdim=True)
 
         # get the image and text predictions
-        ...
+        image_z = image_z / image_z.norm(dim=-1, keepdim=True)
+        text_z = text_z / text_z.norm(dim=-1, keepdim=True)
+        image_p = (torch.matmul(image_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
+        text_p = (torch.matmul(text_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
 
         # average the image and text predictions
         alpha = alpha or self.hparams["alpha"]
         sample_p = alpha * image_p + (1 - alpha) * text_p
 
         # get the scores
-        ...
-        scores = sample_p[0].tolist()
-
-        del image_z, unfiltered_vocabulary_z, text_z, vocabulary_z, mean_vocabulary_z
-        del image_p, text_p, sample_p
+        scores = sample_p[0].cpu().tolist()
 
         return vocabulary, scores
-
-
-class NearestNeighboursClassifier(torch.nn.Module):
-    """Nearest neighbours classifier.
-
-    It computes the similarity between the query and the supports using the
-    cosine similarity and then applies a softmax to obtain the logits.
-
-    Args:
-        scale (float): Scale for the logits of the query. Defaults to 1.0.
-        tau (float): Temperature for the softmax. Defaults to 1.0.
-    """
-
-    def __init__(self, scale: float = 1.0, tau: float = 1.0):
-        super().__init__()
-        self.scale = scale
-        self.tau = tau
-
-    def forward(self, query: torch.Tensor, supports: torch.Tensor):
-        query = query / query.norm(dim=-1, keepdim=True)
-        supports = supports / supports.norm(dim=-1, keepdim=True)
-
-        if supports.dim() == 2:
-            supports = supports.unsqueeze(0)
-
-        Q, _ = query.shape
-        N, C, _ = supports.shape
-
-        supports = supports.mean(dim=0)
-        supports = supports / supports.norm(dim=-1, keepdim=True)
-        similarity = self.scale * query @ supports.T
-        similarity = similarity / self.tau if self.tau != 1.0 else similarity
-        logits = similarity.softmax(dim=-1)
-
-        return logits
-
-
-class LanguageTransformer(torch.nn.Module):
-    """Language Transformer for CLIP.
-
-    Args:
-        transformer (Transformer): Transformer model.
-        token_embedding (torch.nn.Embedding): Token embedding.
-        positional_embedding (torch.nn.Parameter): Positional embedding.
-        ln_final (torch.nn.LayerNorm): Layer norm.
-        text_projection (torch.nn.Parameter): Text projection.
-    """
-
-    def __init__(
-        self,
-        model: Transformer,
-        token_embedding: torch.nn.Embedding,
-        positional_embedding: torch.nn.Parameter,
-        ln_final: torch.nn.LayerNorm,
-        text_projection: torch.nn.Parameter,
-        attn_mask: torch.Tensor,
-    ):
-        super().__init__()
-        self.transformer = model
-        self.token_embedding = token_embedding
-        self.positional_embedding = positional_embedding
-        self.ln_final = ln_final
-        self.text_projection = text_projection
-
-        self.register_buffer("attn_mask", attn_mask, persistent=False)
-
-    def forward(self, text: torch.Tensor) -> torch.Tensor:
-        cast_dtype = self.transformer.get_cast_dtype()
-
-        """Forward pass for the text encoder."""
-        x = self.token_embedding(text).to(cast_dtype)
-
-        x = x + self.positional_embedding.to(cast_dtype)
-        x = x.permute(1, 0, 2)
-        x = self.transformer(x, attn_mask=self.attn_mask)
-        x = x.permute(1, 0, 2)
-        x = self.ln_final(x)
-
-        # x.shape = [batch_size, n_ctx, transformer.width]
-        # take features from the eot embedding (eot_token is the highest number in each sequence)
-        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
-
-        return x
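For readers unfamiliar with the transformers CLIP API, here is a minimal, self-contained sketch of the encoding-and-scoring pattern the new forward() follows. The image path, candidate vocabulary, and alpha value are placeholders for illustration; in the Space the vocabulary comes from the faiss retrieval step.

# Sketch of the transformers-based CLIP scoring used by the new CaSED.forward().
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(DEVICE).eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

vocabulary = ["a photo of a dog", "a photo of a cat", "a photo of a car"]  # placeholder candidates
alpha = 0.5  # weight of the image prediction, matching the hparams default

with torch.no_grad():
    # image embedding: vision tower pooled output, then the visual projection
    image_inputs = processor(images=Image.open("example.jpg"), return_tensors="pt").to(DEVICE)
    image_z = model.visual_projection(model.vision_model(**image_inputs).pooler_output)

    # text embeddings: text tower pooled output, then the text projection (CLIP caps inputs at 77 tokens)
    text_inputs = processor(
        text=vocabulary, return_tensors="pt", padding=True, truncation=True, max_length=77
    ).to(DEVICE)
    vocabulary_z = model.text_projection(model.text_model(**text_inputs).pooler_output)

    # cosine similarity scaled by the learned logit scale, softmaxed over the candidates
    image_z = image_z / image_z.norm(dim=-1, keepdim=True)
    vocabulary_z = vocabulary_z / vocabulary_z.norm(dim=-1, keepdim=True)
    image_p = (image_z @ vocabulary_z.T * model.logit_scale.exp()).softmax(dim=-1)

    # CaSED additionally embeds the unfiltered retrieved vocabulary as a text-side query and blends:
    # sample_p = alpha * image_p + (1 - alpha) * text_p
    print(dict(zip(vocabulary, image_p[0].cpu().tolist())))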
src/retrieval.py CHANGED
@@ -11,8 +11,8 @@ class ArrowMetadataProvider:
     Code taken from: https://github.dev/rom1504/clip-retrieval
     """
 
-    def __init__(self, arrow_folder: ...
-        arrow_files = [str(a) for a in sorted(...
+    def __init__(self, arrow_folder: Path):
+        arrow_files = [str(a) for a in sorted(arrow_folder.glob("**/*")) if a.is_file()]
         self.table = pa.concat_tables(
             [
                 pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
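For reference, a small standalone sketch of how the updated ArrowMetadataProvider loads its metadata with pyarrow. The folder path below is an assumption for illustration; the Space resolves the real paths from artifacts/models/retrieval/indices.json.

# Sketch: read and concatenate the Arrow metadata files the way the fixed __init__ does.
from pathlib import Path

import pyarrow as pa


def load_arrow_table(arrow_folder: Path) -> pa.Table:
    # collect every file under the folder, then concatenate all record batches into one table
    arrow_files = [str(a) for a in sorted(arrow_folder.glob("**/*")) if a.is_file()]
    return pa.concat_tables(
        [
            pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
            for arrow_file in arrow_files
        ]
    )


table = load_arrow_table(Path("artifacts/models/retrieval/example_index/metadata"))  # hypothetical path
print(table.num_rows, table.schema.names)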