Gengzigang committed on
Commit 1ee9ed9 · 1 Parent(s): 56d841b
Files changed (1)
  1. README.md +33 -13
README.md CHANGED
@@ -31,26 +31,46 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
 
 ### Huggingface Version
 ```python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from transformers import AutoModel, AutoConfig, AutoTokenizer
+from eva_clip import create_model_and_transforms
+from llm2vec import LLM2Vec
 from PIL import Image
-from transformers import AutoModel
-from transformers import CLIPImageProcessor
 import torch
 
-image_path = "CLIP.png"
-model_name_or_path = "LLM2CLIP-EVA02-L-14-336" # or /path/to/local/LLM2CLIP-EVA02-L-14-336
-image_size = 336
 
-processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
-model = AutoModel.from_pretrained(
-    model_name_or_path,
-    torch_dtype=torch.float16,
-    trust_remote_code=True).to('cuda').eval()
+model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-L-14-336', force_custom_clip=True)
+ckpt = torch.load('LLM2CLIP-EVA02-L-14-336.pt')
+model.load_state_dict(ckpt)
+model = model.cuda().eval()
+
+llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
+config = AutoConfig.from_pretrained(
+    llm_model_name, trust_remote_code=True
+)
+llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
+l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+image_path = "CLIP.png"
+captions = ["a diagram", "a dog", "a cat"]
 
-image = Image.open(image_path)
-input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
+text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
 
 with torch.no_grad(), torch.cuda.amp.autocast():
-    outputs = model.get_image_features(input_pixels)
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text_features)
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+    print("Label probs:", text_probs)
 ```
 
 ## BibTeX & Citation