Update evaluation scripts
evaluation/evaluate_mmvp_MetaCLIP_huge.py
CHANGED
@@ -7,8 +7,6 @@ import torch
 from tqdm import tqdm
 import json
 from transformers import CLIPVisionModel, CLIPModel, CLIPImageProcessor, CLIPTokenizer
-import argparse
-


 def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
@@ -51,8 +49,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -70,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,17 +134,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'MetaCLIP/metaclip-h14-fullcc2.5b'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
-
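Note: after these hunks, benchmark_model follows a single pattern: tokenize each statement, preprocess both images of a pair, and let logits_per_text decide which image the statement matches. Below is a minimal standalone sketch of that pattern. It assumes the public Hub checkpoint facebook/metaclip-h14-fullcc2.5b and placeholder image paths; the script itself points vision_tower_name at MetaCLIP/metaclip-h14-fullcc2.5b (presumably a local copy) and reads the MMVP-VLM images.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumed Hub id; the script loads a local MetaCLIP/metaclip-h14-fullcc2.5b folder instead.
name = "facebook/metaclip-h14-fullcc2.5b"
model = CLIPModel.from_pretrained(name).to(device).eval().float()
processor = CLIPImageProcessor.from_pretrained(name)
tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)

statement = "a photo of a dog facing left"      # hypothetical MMVP-style statement
img1 = Image.open("img1.jpg").convert("RGB")    # placeholder image paths
img2 = Image.open("img2.jpg").convert("RGB")

text = tokenizer(statement, truncation=True, max_length=77,
                 padding="max_length", return_tensors="pt")["input_ids"].to(device)
pix1 = processor.preprocess(img1, return_tensors="pt")["pixel_values"].to(device)
pix2 = processor.preprocess(img2, return_tensors="pt")["pixel_values"].to(device)
imgs = torch.cat((pix1, pix2), dim=0)           # shape (2, 3, H, W)

with torch.no_grad():
    out = model(input_ids=text, pixel_values=imgs)
    # logits_per_text: (1, 2) similarities of the statement to img1 and img2.
    probs = out.logits_per_text.softmax(dim=-1)

print("img1" if probs[0, 0] > 0.5 else "img2")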
evaluation/evaluate_mmvp_MetaCLIP_large.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -70,18 +68,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -92,7 +87,7 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):

         img1_score1 = probs1[0][0]
         img1_score2 = probs2[0][0]
-
+
         pred1 = "img1" if img1_score1 > 0.5 else "img2"
         pred2 = "img1" if img1_score2 > 0.5 else "img2"

@@ -141,15 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = '
+    vision_tower_name = 'MetaCLIP/metaclip-l14-fullcc2.5b'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
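The pred1/pred2 hunk in this file shows how a pair is read: statement1 should select the first image and statement2 the second. The pair-level bookkeeping around those lines is not part of this diff, so the helper below is only a sketch of the rule those lines imply, with hypothetical names.

# Sketch of the pair rule implied by the pred1/pred2 lines; the surrounding
# accumulation code is assumed, not copied from the repository.
def pair_correct(img1_score1: float, img1_score2: float) -> bool:
    pred1 = "img1" if img1_score1 > 0.5 else "img2"   # statement1 is about img1
    pred2 = "img1" if img1_score2 > 0.5 else "img2"   # statement2 is about img2
    return pred1 == "img1" and pred2 == "img2"        # credit only if both match

# Example: statement1 prefers img1 (0.9) and statement2 prefers img2 (0.2) -> correct pair.
assert pair_correct(0.9, 0.2) and not pair_correct(0.9, 0.8)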
evaluation/evaluate_mmvp_OpenAICLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             padding="max_length",
             return_tensors="pt",
         )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
         imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
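The two OpenAI CLIP scripts differ only in the checkpoint and its input resolution. The local folder names filled in above suggest the public checkpoints openai/clip-vit-large-patch14 and openai/clip-vit-large-patch14-336; loading either one through the same three classes is a one-line swap. The Hub ids below are an assumption about the equivalents of the repository's local folders.

from transformers import CLIPModel, CLIPImageProcessor, CLIPTokenizer

# Assumed Hub equivalents of the local OpenAICLIP/clip-vit-large-patch14[-336] folders.
for name in ("openai/clip-vit-large-patch14", "openai/clip-vit-large-patch14-336"):
    model = CLIPModel.from_pretrained(name)
    processor = CLIPImageProcessor.from_pretrained(name)
    tokenizer = CLIPTokenizer.from_pretrained(name, max_length=77)
    # crop_size reflects the evaluation resolution: 224 for patch14, 336 for patch14-336.
    print(name, processor.crop_size)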
evaluation/evaluate_mmvp_OpenAICLIP_336.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -71,17 +69,14 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             padding="max_length",
             return_tensors="pt",
         )["input_ids"].to(device) # torch.Size([1, 77])
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -141,19 +136,14 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'OpenAICLIP/clip-vit-large-patch14-336'

     vision_tower = CLIPModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
     tokenizer = CLIPTokenizer.from_pretrained(vision_tower_name, max_length=77)
-    #processor = CLIPProcessor.from_pretrained(vision_tower_name)
-
-    #vision_tower.to(torch.float32)
-    # print(next(model.parameters()).device) # cuda:0

     results = official_evaluation(image_processor, tokenizer, vision_tower, vision_tower_name, BENCHMARK_DIR, device)
     print(results)
-
evaluation/evaluate_mmvp_SigLIP_224.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-224'

     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
evaluation/evaluate_mmvp_SigLIP_384.py
CHANGED
@@ -51,8 +51,6 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
         text1 = 'a photo of ' + statement1
         text2 = 'a photo of ' + statement2

-        #text1 = clip.tokenize([text1]).to(device)
-        #text2 = clip.tokenize([text2]).to(device)
         text1 = tokenizer(
             text1,
             truncation=True,
@@ -68,18 +66,15 @@ def benchmark_model(processor, tokenizer, model, benchmark_dir, device="cpu"):
             return_overflowing_tokens=False,
             padding="max_length",
             return_tensors="pt",
-        )["input_ids"].to(device)
-
-
-
-
-        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device) # torch.Size([1, 3, 224, 224])
-        imgs = torch.cat((img1, img2), dim=0) # torch.Size([2, 3, 224, 224])
+        )["input_ids"].to(device)
+
+        img1 = processor.preprocess(img1, return_tensors='pt')['pixel_values'].to(device)
+        img2 = processor.preprocess(img2, return_tensors='pt')['pixel_values'].to(device)
+        imgs = torch.cat((img1, img2), dim=0)

         with torch.no_grad():
             model.eval().float()
-
-            #logits_per_image2, logits_per_text2 = model(imgs, text2)
+
             outputs1 = model(input_ids=text1, pixel_values=imgs)
             logits_per_image1, logits_per_text1 = outputs1.logits_per_image, outputs1.logits_per_text
             outputs2 = model(input_ids=text2, pixel_values=imgs)
@@ -139,10 +134,10 @@ def official_evaluation(processor, tokenizer, clip_model, model_name, benchmark_

 if __name__ == "__main__":

-    BENCHMARK_DIR = '
+    BENCHMARK_DIR = 'YOUR_MMVP_VLM_PATH'

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    vision_tower_name = f'
+    vision_tower_name = f'SigLIP/siglip-so400m-patch14-384'

     vision_tower = SiglipModel.from_pretrained(vision_tower_name, device_map=device)
     image_processor = SiglipImageProcessor.from_pretrained(vision_tower_name)
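The two SigLIP scripts make the same edits but load SiglipModel and SiglipImageProcessor instead of the CLIP classes. SiglipModel exposes the same logits_per_image/logits_per_text fields, so the softmax comparison is reused unchanged; because SigLIP is trained with a pairwise sigmoid loss, an absolute match score would apply torch.sigmoid to the logits instead, though both readings pick the same image in a two-way comparison. A small sketch follows; the Hub id is an assumption, while the scripts point at local SigLIP/siglip-so400m-patch14-224 and -384 folders.

import torch
from transformers import SiglipModel, SiglipImageProcessor, SiglipTokenizer

name = "google/siglip-so400m-patch14-384"   # assumed public checkpoint
model = SiglipModel.from_pretrained(name).eval()
processor = SiglipImageProcessor.from_pretrained(name)
tokenizer = SiglipTokenizer.from_pretrained(name)

# Dummy text-to-image logits for one statement scored against (img1, img2).
logits_per_text = torch.tensor([[2.0, -1.0]])
print(logits_per_text.softmax(dim=-1))   # relative: which image fits the statement better
print(torch.sigmoid(logits_per_text))    # absolute per-pair match probabilities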