from PIL import Image
import io
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
loaded_model = CLIPModel.from_pretrained(model_name)
loaded_processor = CLIPProcessor.from_pretrained(model_name)


def getTextEmbedding(text):
    # Preprocess the text
    inputs_text = loaded_processor(text=[text], return_tensors="pt", padding=True)

    # Forward pass through the model without tracking gradients
    with torch.no_grad():
        # Get the text features
        text_features = loaded_model.get_text_features(
            input_ids=inputs_text.input_ids,
            attention_mask=inputs_text.attention_mask,
        )

    # Convert the tensor to a NumPy array
    text_embedding = text_features.squeeze().numpy()
    return text_embedding


def getImageEmbedding(binary_image_data):
    # Load and preprocess the image
    image = Image.open(io.BytesIO(binary_image_data))
    inputs = loaded_processor(images=image, return_tensors="pt", padding=True)

    # Forward pass through the model without tracking gradients
    with torch.no_grad():
        # Get the image features
        image_features = loaded_model.get_image_features(pixel_values=inputs.pixel_values)

    # Convert the tensor to a NumPy array
    image_embedding = image_features.squeeze().numpy()
    return image_embedding
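

# A minimal usage sketch, not part of the original code: it assumes a local
# image file "example.jpg" (a hypothetical placeholder path) and an arbitrary
# query string. Both functions return vectors in the same CLIP embedding
# space, so cosine similarity between a text and an image embedding is a
# reasonable way to compare them.
if __name__ == "__main__":
    import numpy as np

    text_vec = getTextEmbedding("a photo of a cat")

    with open("example.jpg", "rb") as f:  # hypothetical image path
        image_vec = getImageEmbedding(f.read())

    # Cosine similarity between the text and image embeddings
    similarity = np.dot(text_vec, image_vec) / (
        np.linalg.norm(text_vec) * np.linalg.norm(image_vec)
    )
    print(f"cosine similarity: {similarity:.4f}")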