Spaces:
Sleeping
Sleeping
from typing import List

import numpy as np
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer
class Embedding:
    """
    A class for encoding images and text using the CLIP model.

    The processor, tokenizer and model for ``openai/clip-vit-base-patch32``
    are loaded once, when the class body executes, and are kept as class
    attributes. Both encoders are classmethods, so they can be called
    directly on the class (``Embedding.encode_text("a photo of a cat")``)
    as well as on an instance.
    """

    # Run on the GPU when one is available, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    # NOTE(review): only the model is given cache_dir="./models"; the
    # processor and tokenizer use the default cache location. Confirm
    # whether that asymmetry is intentional.
    model = CLIPModel.from_pretrained(
        "openai/clip-vit-base-patch32",
        cache_dir="./models",
    )
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
    model.to(device)

    @classmethod
    def encode_image(cls, image: Image.Image) -> np.ndarray:
        """
        Encode an image with the CLIP processor and model.

        Args:
            image (Image.Image): The input image to be encoded.

        Returns:
            np.ndarray: The image features produced by
            ``CLIPModel.get_image_features``, moved to the CPU.
        """
        pixel_values = cls.processor(images=image, return_tensors="pt")[
            "pixel_values"
        ].to(cls.device)
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            features = cls.model.get_image_features(pixel_values)
        return features.detach().cpu().numpy()

    @classmethod
    def encode_text(cls, text: str) -> np.ndarray:
        """
        Encode text with the CLIP tokenizer and model.

        Args:
            text (str): The input text to be encoded.

        Returns:
            np.ndarray: The text features produced by
            ``CLIPModel.get_text_features``, moved to the CPU.
        """
        # CLIP's text encoder has a fixed maximum context length; truncate
        # longer inputs to the tokenizer's limit instead of letting the
        # forward pass fail on an over-long sequence.
        tokens = cls.tokenizer(text, return_tensors="pt", truncation=True).to(
            cls.device
        )
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            features = cls.model.get_text_features(**tokens)
        return features.detach().cpu().numpy()