Update evaluation scripts
evaluation/evaluate_mmvp_MetaCLIP_huge.py
CHANGED
@@ -7,8 +7,6 @@ import torch
 from tqdm import tqdm
 import json
 from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
-import argparse
-


 def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
@@ -51,8 +49,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -70,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,17 +134,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'MetaCLIP/metaclip-h14-fullcc2.5b'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
-
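Note: after these hunks, benchmark_model follows a single pattern: tokenize each statement, preprocess both images of a pair, and let logits_per_text decide which image the statement matches. Below is a minimal standalone sketch of that pattern. It assumes the public Hub checkpoint facebook/metaclip-h14-fullcc2.5b and placeholder image paths; the script itself points vision_tower_name at MetaCLIP/metaclip-h14-fullcc2.5b (presumably a local copy) and reads the MMVP-VLM images.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumed Hub id; the script loads a local MetaCLIP/metaclip-h14-fullcc2.5b folder instead.
name = "facebook/metaclip-h14-fullcc2.5b"
model = CLIPModel.from_pretrained(name).to(device).eval().float()
processor = CLIPImageProcessor.from_pretrained(name)
tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)

statement = "a photo of a dog facing left"      # hypothetical MMVP-style statement
img1 = Image.open("img1.jpg").convert("RGB")    # placeholder image paths
img2 = Image.open("img2.jpg").convert("RGB")

text = tokenizer(statement, truncation=True, max_length=77,
                 padding="max_length", return_tensors="pt")["input_ids"].to(device)
pix1 = processor.preprocess(img1, return_tensors="pt")["pixel_values"].to(device)
pix2 = processor.preprocess(img2, return_tensors="pt")["pixel_values"].to(device)
imgs = torch.cat((pix1, pix2), dim=0)           # shape (2, 3, H, W)

with torch.no_grad():
    out = model(input_ids=text, pixel_values=imgs)
    # logits_per_text: (1, 2) similarities of the statement to img1 and img2.
    probs = out.logits_per_text.softmax(dim=-1)

print("img1" if probs[0, 0] > 0.5 else "img2")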
evaluation/evaluate_mmvp_MetaCLIP_large.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -70,18 +68,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -92,7 +87,7 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

         img1_score1 = probs1[0][0]
         img1_score2 = probs2[0][0]
-
+
         pred1 = "img1" if img1_score1 > 0.5 else "img2"
         pred2 = "img1" if img1_score2 > 0.5 else "img2"

@@ -141,15 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = '
+    vision_tower_name = 'MetaCLIP/metaclip-l14-fullcc2.5b'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
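The pred1/pred2 hunk in this file shows how a pair is read: statement1 should select the first image and statement2 the second. The pair-level bookkeeping around those lines is not part of this diff, so the helper below is only a sketch of the rule those lines imply, with hypothetical names.

# Sketch of the pair rule implied by the pred1/pred2 lines; the surrounding
# accumulation code is assumed, not copied from the repository.
def pair_correct(img1_score1: float, img1_score2: float) -> bool:
    pred1 = "img1" if img1_score1 > 0.5 else "img2"   # statement1 is about img1
    pred2 = "img1" if img1_score2 > 0.5 else "img2"   # statement2 is about img2
    return pred1 == "img1" and pred2 == "img2"        # credit only if both match

# Example: statement1 prefers img1 (0.9) and statement2 prefers img2 (0.2) -> correct pair.
assert pair_correct(0.9, 0.2) and not pair_correct(0.9, 0.8)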
evaluation/evaluate_mmvp_OpenAICLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             padding="max_length",
             return_tensors="pt",
         )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
         imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
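The two OpenAI CLIP scripts differ only in the checkpoint and its input resolution. The local folder names filled in above suggest the public checkpoints openai/clip-vit-large-patch14 and openai/clip-vit-large-patch14-336; loading either one through the same three classes is a one-line swap. The Hub ids below are an assumption about the equivalents of the repository's local folders.

from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

# Assumed Hub equivalents of the local OpenAICLIP/clip-vit-large-patch14[-336] folders.
for name in ("openai/clip-vit-large-patch14", "openai/clip-vit-large-patch14-336"):
    model = CLIPModel.from_pretrained(name)
    processor = CLIPImageProcessor.from_pretrained(name)
    tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)
    # crop_size reflects the evaluation resolution: 224 for patch14, 336 for patch14-336.
    print(name, processor.crop_size)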
evaluation/evaluate_mmvp_OpenAICLIP_336.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             padding="max_length",
             return_tensors="pt",
         )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14-336'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
evaluation/evaluate_mmvp_SigLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-224'

     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
evaluation/evaluate_mmvp_SigLIP_384.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-384'

     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
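The two SigLIP scripts make the same edits but load SiglipModel and SiglipImageProcessor instead of the CLIP classes. SiglipModel exposes the same logits_per_image/logits_per_text fields, so the softmax comparison is reused unchanged; because SigLIP is trained with a pairwise sigmoid loss, an absolute match score would apply torch.sigmoid to the logits instead, though both readings pick the same image in a two-way comparison. A small sketch follows; the Hub id is an assumption, while the scripts point at local SigLIP/siglip-so400m-patch14-224 and -384 folders.

import torch
from transformers import SiglipModel, SiglipImageProcessor, SiglipTokenizer

name = "google/siglip-so400m-patch14-384"   # assumed public checkpoint
model = SiglipModel.from_pretrained(name).eval()
processor = SiglipImageProcessor.from_pretrained(name)
tokenizer = SiglipTokenizer.from_pretrained(name)

# Dummy text-to-image logits for one statement scored against (img1, img2).
logits_per_text = torch.tensor([[2.0, -1.0]])
print(logits_per_text.softmax(dim=-1))   # relative: which image fits the statement better
print(torch.sigmoid(logits_per_text))    # absolute per-pair match probabilities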