Gengzigang committed on
Commit 1ee9ed9 · 1 Parent(s): 56d841b
Files changed (1)
  1. README.md +33 -13
README.md CHANGED
@@ -31,26 +31,46 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
 
 ### Huggingface Version
 ```python
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from transformers import AutoModel, AutoConfig, AutoTokenizer
+from eva_clip import create_model_and_transforms
+from llm2vec import LLM2Vec
 from PIL import Image
-from transformers import AutoModel
-from transformers import CLIPImageProcessor
 import torch
 
-image_path = "CLIP.png"
-model_name_or_path = "LLM2CLIP-EVA02-L-14-336" # or /path/to/local/LLM2CLIP-EVA02-L-14-336
-image_size = 336
 
-processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
-model = AutoModel.from_pretrained(
-    model_name_or_path,
-    torch_dtype=torch.float16,
-    trust_remote_code=True).to('cuda').eval()
+model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-L-14-336', force_custom_clip=True)
+ckpt = torch.load('LLM2CLIP-EVA02-L-14-336.pt')
+model.load_state_dict(ckpt)
+model = model.cuda().eval()
+
+llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
+config = AutoConfig.from_pretrained(
+    llm_model_name, trust_remote_code=True
+)
+llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
+l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+image_path = "CLIP.png"
+captions = ["a diagram", "a dog", "a cat"]
 
-image = Image.open(image_path)
-input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
+text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
 
 with torch.no_grad(), torch.cuda.amp.autocast():
-    outputs = model.get_image_features(input_pixels)
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text_features)
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+    print("Label probs:", text_probs)
 ```
 
 ## BibTeX & Citation