Use altndrr/cased model
- app.py +9 -6
- artifacts/models/databases/.gitkeep +0 -0
- artifacts/models/retrieval/indices.json +0 -3
- src/nn.py +0 -186
- src/retrieval.py +0 -30
- src/transforms.py +0 -506
app.py
CHANGED
@@ -2,8 +2,8 @@ from typing import Optional
 
 import gradio as gr
 import torch
-
-from
+from PIL import Image
+from transformers import AutoModel, CLIPProcessor
 
 PAPER_TITLE = "Vocabulary-free Image Classification"
 PAPER_DESCRIPTION = """
@@ -37,14 +37,17 @@ To assign a label to an image, we:
 """
 PAPER_URL = "https://arxiv.org/abs/2306.00917"
 
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 
 
 def vic(filename: str, alpha: Optional[float] = None):
-
-
+    images = processor(images=[Image.open(filename)], return_tensors="pt", padding=True)
+    outputs = model(images, alpha=alpha)
+    vocabulary = outputs["vocabularies"][0]
+    scores = outputs["scores"][0]
     confidences = dict(zip(vocabulary, scores))
 
     return confidences
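Note: the hunks above only show the new imports, the module-level model loading, and the body of vic(). Below is a minimal end-to-end sketch of the resulting app; the Gradio wiring (gr.Interface, gr.Image, gr.Label) and the launch call are assumptions, since they sit outside the changed lines, while the model and processor calls mirror the diff.

# Minimal sketch of the updated app; the Gradio interface wiring is assumed, not shown in the diff.
from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, CLIPProcessor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")


def vic(filename: str, alpha: Optional[float] = None):
    # preprocess the input image and let the Hub-hosted CaSED model build and score a vocabulary
    images = processor(images=[Image.open(filename)], return_tensors="pt", padding=True)
    outputs = model(images, alpha=alpha)
    vocabulary = outputs["vocabularies"][0]
    scores = outputs["scores"][0]
    confidences = dict(zip(vocabulary, scores))
    return confidences


# Assumed wiring: an image filepath in, a label with per-candidate confidences out.
demo = gr.Interface(fn=vic, inputs=gr.Image(type="filepath"), outputs=gr.Label(num_top_classes=5))

if __name__ == "__main__":
    demo.launch()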
artifacts/models/databases/.gitkeep
DELETED
File without changes
artifacts/models/retrieval/indices.json
DELETED
@@ -1,3 +0,0 @@
-{
-    "ViT-L-14_CC12M": "./artifacts/models/databases/cc12m/vit-l-14/"
-}
src/nn.py
DELETED
@@ -1,186 +0,0 @@
-import json
-import tarfile
-from pathlib import Path
-from typing import Optional
-
-import faiss
-import gdown
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPModel, CLIPProcessor
-
-from src.retrieval import ArrowMetadataProvider
-from src.transforms import TextCompose, default_vocabulary_transforms
-
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
-RETRIEVAL_DATABASES = {
-    "cc12m": "https://drive.google.com/uc?id=1HyM4mnKSxF0sqzAe-KZL8y-cQWRPiuXn&confirm=t",
-}
-
-
-class CaSED(torch.nn.Module):
-    """Torch module for Category Search from External Databases (CaSED).
-
-    Args:
-        index_name (str): Name of the faiss index to use.
-        vocabulary_transforms (TextCompose): List of transforms to apply to the vocabulary.
-
-    Extra hparams:
-        alpha (float): Weight for the average of the image and text predictions. Defaults to 0.5.
-        artifact_dir (str): Path to the directory where the databases are stored. Defaults to
-            "artifacts/".
-        retrieval_num_results (int): Number of results to return. Defaults to 10.
-    """
-
-    def __init__(
-        self,
-        index_name: str = "ViT-L-14_CC12M",
-        vocabulary_transforms: TextCompose = default_vocabulary_transforms(),
-        **kwargs,
-    ):
-        super().__init__()
-
-        # load CLIP
-        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(DEVICE)
-        self.index_name = index_name
-        self.vocabulary_transforms = vocabulary_transforms
-        self.vision_encoder = model.vision_model
-        self.vision_proj = model.visual_projection
-        self.language_encoder = model.text_model
-        self.language_proj = model.text_projection
-        self.logit_scale = model.logit_scale.exp()
-        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
-
-        # set hparams
-        kwargs["alpha"] = kwargs.get("alpha", 0.5)
-        kwargs["artifact_dir"] = kwargs.get("artifact_dir", "artifacts/")
-        kwargs["retrieval_num_results"] = kwargs.get("retrieval_num_results", 10)
-        self.hparams = kwargs
-
-        # download databases
-        self.prepare_data()
-
-        # load faiss indices and metadata providers
-        indices_list_dir = Path(self.hparams["artifact_dir"]) / "models" / "retrieval"
-        indices_fp = indices_list_dir / "indices.json"
-        self.indices = json.load(open(indices_fp))
-        self.resources = {}
-        for name, index_fp in self.indices.items():
-            text_index_fp = Path(index_fp) / "text.index"
-            metadata_fp = Path(index_fp) / "metadata/"
-
-            text_index = faiss.read_index(
-                str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
-            )
-            metadata_provider = ArrowMetadataProvider(metadata_fp)
-
-            self.resources[name] = {
-                "device": DEVICE,
-                "model": "ViT-L-14",
-                "text_index": text_index,
-                "metadata_provider": metadata_provider,
-            }
-
-    def prepare_data(self):
-        """Download data if needed."""
-        databases_path = Path(self.hparams["artifact_dir"]) / "models" / "databases"
-
-        for name, url in RETRIEVAL_DATABASES.items():
-            database_path = Path(databases_path, name)
-            if database_path.exists():
-                continue
-
-            # download data
-            target_path = Path(databases_path, name + ".tar.gz")
-            try:
-                gdown.download(url, str(target_path), quiet=False)
-                tar = tarfile.open(target_path, "r:gz")
-                tar.extractall(target_path.parent)
-                tar.close()
-                target_path.unlink()
-            except FileNotFoundError:
-                print(f"Could not download {url}.")
-                print(f"Please download it manually and place it in {target_path.parent}.")
-
-    @torch.no_grad()
-    def query_index(self, sample_z: torch.Tensor) -> torch.Tensor:
-        # get the index
-        resources = self.resources[self.index_name]
-        text_index = resources["text_index"]
-        metadata_provider = resources["metadata_provider"]
-
-        # query the index
-        sample_z = sample_z.squeeze(0)
-        sample_z = sample_z / sample_z.norm(dim=-1, keepdim=True)
-        query_input = sample_z.cpu().detach().numpy().tolist()
-        query = np.expand_dims(np.array(query_input).astype("float32"), 0)
-
-        distances, idxs, _ = text_index.search_and_reconstruct(
-            query, self.hparams["retrieval_num_results"]
-        )
-        results = idxs[0]
-        nb_results = np.where(results == -1)[0]
-        nb_results = nb_results[0] if len(nb_results) > 0 else len(results)
-        indices = results[:nb_results]
-        distances = distances[0][:nb_results]
-
-        if len(distances) == 0:
-            return []
-
-        # get the metadata
-        results = []
-        metadata = metadata_provider.get(indices[:20], ["caption"])
-        for key, (d, i) in enumerate(zip(distances, indices)):
-            output = {}
-            meta = None if key + 1 > len(metadata) else metadata[key]
-            if meta is not None:
-                output.update(meta)
-            output["id"] = i.item()
-            output["similarity"] = d.item()
-            results.append(output)
-
-        # get the captions only
-        vocabularies = [result["caption"] for result in results]
-
-        return vocabularies
-
-    @torch.no_grad()
-    def forward(self, image_fp: str, alpha: Optional[float] = None) -> torch.Tensor():
-        # forward the image
-        image = self.processor(images=Image.open(image_fp), return_tensors="pt")
-        image["pixel_values"] = image["pixel_values"].to(DEVICE)
-        image_z = self.vision_proj(self.vision_encoder(**image)[1])
-
-        # generate a single text embedding from the unfiltered vocabulary
-        vocabulary = self.query_index(image_z)
-        text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
-        text["input_ids"] = text["input_ids"][:, :77].to(DEVICE)
-        text["attention_mask"] = text["attention_mask"][:, :77].to(DEVICE)
-        text_z = self.language_encoder(**text)[1]
-        text_z = self.language_proj(text_z)
-
-        # filter the vocabulary, embed it, and get its mean embedding
-        vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
-        text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
-        text = {k: v.to(DEVICE) for k, v in text.items()}
-        vocabulary_z = self.language_encoder(**text)[1]
-        vocabulary_z = self.language_proj(vocabulary_z)
-        vocabulary_z = vocabulary_z / vocabulary_z.norm(dim=-1, keepdim=True)
-
-        # get the image and text predictions
-        image_z = image_z / image_z.norm(dim=-1, keepdim=True)
-        text_z = text_z / text_z.norm(dim=-1, keepdim=True)
-        image_p = (torch.matmul(image_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
-        text_p = (torch.matmul(text_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
-
-        # average the image and text predictions
-        alpha = alpha or self.hparams["alpha"]
-        sample_p = alpha * image_p + (1 - alpha) * text_p
-
-        # get the scores
-        scores = sample_p[0].cpu().tolist()
-
-        return vocabulary, scores
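For reference, the deleted module above was the local inference path that this commit replaces with the altndrr/cased Hub model. A rough sketch of how it was driven, inferred from the removed code (the image path is a placeholder; the old app.py call site is truncated in the hunk above):

# Hypothetical usage of the removed src/nn.py module, inferred from CaSED.forward().
from src.nn import CaSED

cased = CaSED()  # downloads and unpacks the cc12m retrieval database on first use
vocabulary, scores = cased("example.jpg", alpha=0.5)  # "example.jpg" is a placeholder path
confidences = dict(zip(vocabulary, scores))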
src/retrieval.py
DELETED
@@ -1,30 +0,0 @@
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-import pyarrow as pa
-
-
-class ArrowMetadataProvider:
-    """The arrow metadata provider provides metadata from contiguous ids using arrow.
-
-    Code taken from: https://github.dev/rom1504/clip-retrieval
-    """
-
-    def __init__(self, arrow_folder: Path):
-        arrow_files = [str(a) for a in sorted(arrow_folder.glob("**/*")) if a.is_file()]
-        self.table = pa.concat_tables(
-            [
-                pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
-                for arrow_file in arrow_files
-            ]
-        )
-
-    def get(self, ids: np.ndarray, cols: Optional[list] = None):
-        """Implement the get method from the arrow metadata provide, get metadata from ids."""
-        if cols is None:
-            cols = self.table.schema.names
-        else:
-            cols = list(set(self.table.schema.names) & set(cols))
-        t = pa.concat_tables([self.table[i:j] for i, j in zip(ids, ids + 1)])
-        return t.select(cols).to_pandas().to_dict("records")
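As context, query_index() in the deleted src/nn.py used this provider to map faiss result ids back to captions. A small sketch under the old artifact layout from indices.json (the ids below are illustrative; in practice they came from the faiss search):

# Illustrative lookup; paths follow the deleted indices.json entry for ViT-L-14_CC12M.
from pathlib import Path

import numpy as np

from src.retrieval import ArrowMetadataProvider

metadata_dir = Path("./artifacts/models/databases/cc12m/vit-l-14/metadata/")
provider = ArrowMetadataProvider(metadata_dir)
rows = provider.get(np.array([0, 1, 2]), ["caption"])  # -> [{"caption": "..."}, ...]
captions = [row["caption"] for row in rows]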
src/transforms.py
DELETED
@@ -1,506 +0,0 @@
-import re
-from abc import ABC, abstractmethod
-from typing import Any, Optional, Union, cast
-
-import inflect
-import nltk
-import numpy as np
-import PIL.Image
-import torch
-import torchvision.transforms as T
-import torchvision.transforms.functional as F
-from flair.data import Sentence
-from flair.models import SequenceTagger
-
-__all__ = [
-    "DynamicResize",
-    "DropFileExtensions",
-    "DropNonAlpha",
-    "DropShortWords",
-    "DropSpecialCharacters",
-    "DropTokens",
-    "DropURLs",
-    "DropWords",
-    "FilterPOS",
-    "FrequencyMinWordCount",
-    "FrequencyTopK",
-    "ReplaceSeparators",
-    "ToRGBTensor",
-    "ToLowercase",
-    "ToSingular",
-]
-
-
-class BaseTextTransform(ABC):
-    """Base class for string transforms."""
-
-    @abstractmethod
-    def __call__(self, text: str):
-        raise NotImplementedError
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}()"
-
-
-class DynamicResize(T.Resize):
-    """Resize the input PIL Image to the given size.
-
-    Extends the torchvision Resize transform to dynamically evaluate the second dimension of the
-    output size based on the aspect ratio of the first input image.
-    """
-
-    def forward(self, img):
-        if isinstance(self.size, int):
-            _, h, w = F.get_dimensions(img)
-            aspect_ratio = w / h
-            side = self.size
-
-            if aspect_ratio < 1.0:
-                self.size = int(side / aspect_ratio), side
-            else:
-                self.size = side, int(side * aspect_ratio)
-
-        return super().forward(img)
-
-
-class DropFileExtensions(BaseTextTransform):
-    """Remove file extensions from the input text."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove file extensions from.
-        """
-        text = re.sub(r"\.\w+", "", text)
-
-        return text
-
-
-class DropNonAlpha(BaseTextTransform):
-    """Remove non-alpha words from the input text."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove non-alpha words from.
-        """
-        text = re.sub(r"[^a-zA-Z\s]", "", text)
-
-        return text
-
-
-class DropShortWords(BaseTextTransform):
-    """Remove short words from the input text.
-
-    Args:
-        min_length (int): Minimum length of words to keep.
-    """
-
-    def __init__(self, min_length) -> None:
-        super().__init__()
-        self.min_length = min_length
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove short words from.
-        """
-        text = " ".join([word for word in text.split() if len(word) >= self.min_length])
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(min_length={self.min_length})"
-
-
-class DropSpecialCharacters(BaseTextTransform):
-    """Remove special characters from the input text.
-
-    Special characters are defined as any character that is not a word character, whitespace,
-    hyphen, period, apostrophe, or ampersand.
-    """
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove special characters from.
-        """
-        text = re.sub(r"[^\w\s\-\.\'\&]", "", text)
-
-        return text
-
-
-class DropTokens(BaseTextTransform):
-    """Remove tokens from the input text.
-
-    Tokens are defined as strings enclosed in angle brackets, e.g. <token>.
-    """
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove tokens from.
-        """
-        text = re.sub(r"<[^>]+>", "", text)
-
-        return text
-
-
-class DropURLs(BaseTextTransform):
-    """Remove URLs from the input text."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove URLs from.
-        """
-        text = re.sub(r"http\S+", "", text)
-
-        return text
-
-
-class DropWords(BaseTextTransform):
-    """Remove words from the input text.
-
-    It is case-insensitive and supports singular and plural forms of the words.
-    """
-
-    def __init__(self, words: list[str]) -> None:
-        super().__init__()
-        self.words = words
-        self.pattern = r"\b(?:{})\b".format("|".join(words))
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove words from.
-        """
-        text = re.sub(self.pattern, "", text, flags=re.IGNORECASE)
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(pattern={self.pattern})"
-
-
-class FilterPOS(BaseTextTransform):
-    """Filter words by POS tags.
-
-    Args:
-        tags (list): List of POS tags to remove.
-        engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
-        keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
-    """
-
-    def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
-        super().__init__()
-        self.tags = tags
-        self.engine = engine
-        self.keep_compound_nouns = keep_compound_nouns
-
-        if engine == "nltk":
-            nltk.download("averaged_perceptron_tagger", quiet=True)
-            nltk.download("punkt", quiet=True)
-            self.tagger = lambda x: nltk.pos_tag(nltk.word_tokenize(x))
-        elif engine == "flair":
-            self.tagger = SequenceTagger.load("flair/pos-english-fast").predict
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove words with specific POS tags from.
-        """
-        if self.engine == "nltk":
-            word_tags = self.tagger(text)
-            text = " ".join([word for word, tag in word_tags if tag not in self.tags])
-        elif self.engine == "flair":
-            sentence = Sentence(text)
-            self.tagger(sentence)
-            text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
-
-        if self.keep_compound_nouns:
-            compound_nouns = []
-
-            if self.engine == "nltk":
-                for i in range(len(word_tags) - 1):
-                    if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
-                        # if they are the same word, skip
-                        if word_tags[i][0] == word_tags[i + 1][0]:
-                            continue
-
-                        compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
-                        compound_nouns.append(compound_noun)
-            elif self.engine == "flair":
-                for i in range(len(sentence.tokens) - 1):
-                    if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
-                        # if they are the same word, skip
-                        if sentence.tokens[i].text == sentence.tokens[i + 1].text:
-                            continue
-
-                        compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
-                        compound_nouns.append(compound_noun)
-
-            text = " ".join([text, " ".join(compound_nouns)])
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(tags={self.tags}, engine={self.engine})"
-
-
-class FrequencyMinWordCount(BaseTextTransform):
-    """Keep only words that occur more than a minimum number of times in the input text.
-
-    If the threshold is too strong and no words pass the threshold, the threshold is reduced to
-    the most frequent word.
-
-    Args:
-        min_count (int): Minimum number of occurrences of a word to keep.
-    """
-
-    def __init__(self, min_count) -> None:
-        super().__init__()
-        self.min_count = min_count
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove infrequent words from.
-        """
-        if self.min_count <= 1:
-            return text
-
-        words = text.split()
-        word_counts = {word: words.count(word) for word in words}
-
-        # if nothing passes the threshold, reduce the threshold to the most frequent word
-        max_word_count = max(word_counts.values() or [0])
-        min_count = max_word_count if self.min_count > max_word_count else self.min_count
-
-        text = " ".join([word for word in words if word_counts[word] >= min_count])
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(min_count={self.min_count})"
-
-
-class FrequencyTopK(BaseTextTransform):
-    """Keep only the top k most frequent words in the input text.
-
-    In case of a tie, all words with the same count as the last word are kept.
-
-    Args:
-        top_k (int): Number of top words to keep.
-    """
-
-    def __init__(self, top_k: int) -> None:
-        super().__init__()
-        self.top_k = top_k
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove infrequent words from.
-        """
-        if self.top_k < 1:
-            return text
-
-        words = text.split()
-        word_counts = {word: words.count(word) for word in words}
-        top_words = sorted(word_counts, key=word_counts.get, reverse=True)
-
-        # in case of a tie, keep all words with the same count
-        top_words = top_words[: self.top_k]
-        top_words = [word for word in top_words if word_counts[word] == word_counts[top_words[-1]]]
-
-        text = " ".join([word for word in words if word in top_words])
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(top_k={self.top_k})"
-
-
-class ReplaceSeparators(BaseTextTransform):
-    """Replace underscores and dashes with spaces."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to replace separators in.
-        """
-        text = re.sub(r"[_\-]", " ", text)
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}()"
-
-
-class RemoveDuplicates(BaseTextTransform):
-    """Remove duplicate words from the input text."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove duplicate words from.
-        """
-        text = " ".join(list(set(text.split())))
-
-        return text
-
-
-class TextCompose:
-    """Compose several transforms together.
-
-    It differs from the torchvision.transforms.Compose class in that it applies the transforms to
-    a string instead of a PIL Image or Tensor. In addition, it automatically join the list of
-    input strings into a single string and splits the output string into a list of words.
-
-    Args:
-        transforms (list): List of transforms to compose.
-    """
-
-    def __init__(self, transforms: list[BaseTextTransform]) -> None:
-        self.transforms = transforms
-
-    def __call__(self, text: Union[str, list[str]]) -> Any:
-        if isinstance(text, list):
-            text = " ".join(text)
-
-        for t in self.transforms:
-            text = t(text)
-        return text.split()
-
-    def __repr__(self) -> str:
-        format_string = self.__class__.__name__ + "("
-        for t in self.transforms:
-            format_string += "\n"
-            format_string += f"    {t}"
-        format_string += "\n)"
-        return format_string
-
-
-class ToRGBTensor(T.ToTensor):
-    """Convert a `PIL Image` or `numpy.ndarray` to tensor.
-
-    Compared with the torchvision `ToTensor` transform, it converts images with a single channel to
-    RGB images. In addition, the conversion to tensor is done only if the input is not already a
-    tensor.
-    """
-
-    def __call__(self, pic: Union[PIL.Image.Image, np.ndarray, torch.Tensor]):
-        """
-        Args:
-            pic (PIL Image | numpy.ndarray | torch.Tensor): Image to be converted to tensor.
-        """
-        img = pic if isinstance(pic, torch.Tensor) else F.to_tensor(pic)
-        img = cast(torch.Tensor, img)
-
-        if img.shape[0] == 1:
-            img = img.repeat(3, 1, 1)
-
-        return img
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}()"
-
-
-class ToLowercase(BaseTextTransform):
-    """Convert text to lowercase."""
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to convert to lowercase.
-        """
-        text = text.lower()
-
-        return text
-
-
-class ToSingular(BaseTextTransform):
-    """Convert plural words to singular form."""
-
-    def __init__(self) -> None:
-        super().__init__()
-        self.transform = inflect.engine().singular_noun
-
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to convert to singular form.
-        """
-        words = text.split()
-        for i, word in enumerate(words):
-            if not word.endswith("s"):
-                continue
-
-            if word[-2:] in ["ss", "us", "is"]:
-                continue
-
-            if word[-3:] in ["ies", "oes"]:
-                continue
-
-            words[i] = self.transform(word) or word
-
-        text = " ".join(words)
-
-        return text
-
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}()"
-
-
-def default_preprocess(size: Optional[int] = None) -> T.Compose:
-    """Preprocess input images with preprocessing transforms.
-
-    Args:
-        size (int): Size to resize image to.
-    """
-    transforms = []
-    if size is not None:
-        transforms.append(DynamicResize(size, interpolation=T.InterpolationMode.BICUBIC))
-    transforms.append(ToRGBTensor())
-    transforms = T.Compose(transforms)
-
-    return transforms
-
-
-def default_vocabulary_transforms() -> TextCompose:
-    """Preprocess input text with preprocessing transforms."""
-    words_to_drop = [
-        "image",
-        "photo",
-        "picture",
-        "thumbnail",
-        "logo",
-        "symbol",
-        "clipart",
-        "portrait",
-        "painting",
-        "illustration",
-        "icon",
-        "profile",
-    ]
-    pos_tags = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS", "VBG", "VBN"]
-
-    transforms = []
-    transforms.append(DropTokens())
-    transforms.append(DropURLs())
-    transforms.append(DropSpecialCharacters())
-    transforms.append(DropFileExtensions())
-    transforms.append(ReplaceSeparators())
-    transforms.append(DropShortWords(min_length=3))
-    transforms.append(DropNonAlpha())
-    transforms.append(ToLowercase())
-    transforms.append(ToSingular())
-    transforms.append(DropWords(words=words_to_drop))
-    transforms.append(FrequencyMinWordCount(min_count=2))
-    transforms.append(FilterPOS(tags=pos_tags, engine="flair", keep_compound_nouns=False))
-    transforms.append(RemoveDuplicates())
-
-    transforms = TextCompose(transforms)
-
-    return transforms
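As context, the deleted pipeline above was what turned raw retrieved captions into candidate class names before this commit; with this change that step is handled by the altndrr/cased remote code, which returns the vocabulary directly. A brief sketch of how the removed pipeline was applied (the captions are made up for illustration):

# Illustrative use of the removed vocabulary pipeline; example captions are invented.
from src.transforms import default_vocabulary_transforms

vocabulary_transforms = default_vocabulary_transforms()
captions = [
    "a photo of a golden retriever puppy",
    "golden retriever dogs playing outside",
]
candidates = vocabulary_transforms(captions)  # roughly ["golden", "retriever"] after filtering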