Update
app.py (CHANGED)
@@ -24,8 +24,9 @@ def load_model(model_name: str) -> Tuple[torch.nn.Module, AttentionExtract]:
     """Load a model from timm and prepare it for attention extraction."""
     timm.layers.set_fused_attn(False)
     model = create_model(model_name, pretrained=True)
+    model = model.cuda()  # Move the model to CUDA
     model.eval()
-    extractor = AttentionExtract(model, method='fx')
+    extractor = AttentionExtract(model, method='fx')
     return model, extractor
 
 @spaces.GPU
@@ -46,8 +47,8 @@ def process_image(
         is_training=False
     )
 
-    # Preprocess the image
-    tensor = transform(image).unsqueeze(0).
+    # Preprocess the image and move to CUDA
+    tensor = transform(image).unsqueeze(0).cuda()
 
     # Extract attention maps
     attention_maps = extractor(tensor)
@@ -67,12 +68,11 @@ def apply_mask(image: np.ndarray, mask: np.ndarray, color: Tuple[float, float, f
     return masked_image.astype(np.uint8)
 
 def rollout(attentions, discard_ratio, head_fusion, num_prefix_tokens=1):
-
-    result = torch.eye(attentions[0].size(-1)).to(
+    device = attentions[0].device
+    result = torch.eye(attentions[0].size(-1)).to(device)
     with torch.no_grad():
         for attention in attentions:
             if head_fusion.startswith('mean'):
-                # mean_std fusion doesn't appear to make sense with rollout
                 attention_heads_fused = attention.mean(dim=0)
             elif head_fusion == "max":
                 attention_heads_fused = attention.amax(dim=0)
@@ -87,14 +87,13 @@ def rollout(attentions, discard_ratio, head_fusion, num_prefix_tokens=1):
             indices = indices[indices >= num_prefix_tokens]
             flat[indices] = 0
 
-            I = torch.eye(attention_heads_fused.size(-1)).to(
+            I = torch.eye(attention_heads_fused.size(-1)).to(device)
             a = (attention_heads_fused + 1.0 * I) / 2
             a = a / a.sum(dim=-1)
             result = torch.matmul(a, result)
 
     # Look at the total attention between the prefix tokens (usually class tokens)
     # and the image patches
-    # FIXME this is token 0 vs non-prefix right now, need to cover other cases (> 1 prefix, no prefix, etc)
     mask = result[0, num_prefix_tokens:]
     width = int(mask.size(-1) ** 0.5)
     mask = mask.reshape(width, width).cpu().numpy()
@@ -110,7 +109,6 @@ def visualize_attention(
 ) -> Tuple[List[Image.Image], Image.Image]:
     """Visualize attention maps and rollout for the given image and model."""
     model, extractor = load_model(model_name)
-    model = model.to('cuda')
     attention_maps = process_image(image, model, extractor)
 
     num_prefix_tokens = getattr(model, 'num_prefix_tokens', 1)  # Default to 1 class token if not specified
@@ -150,7 +148,7 @@ def visualize_attention(
         # Interpolate to match image size
         attn_map = attn_map.unsqueeze(0).unsqueeze(0)
         attn_map = F.interpolate(attn_map, size=(image_np.shape[0], image_np.shape[1]), mode='bilinear', align_corners=False)
-        attn_map = attn_map.squeeze().
+        attn_map = attn_map.squeeze().cpu().numpy()  # Move to CPU before converting to numpy
 
         # Normalize attention map
         attn_map = (attn_map - attn_map.min()) / (attn_map.max() - attn_map.min())
@@ -179,6 +177,9 @@ def visualize_attention(
         visualizations.append(vis_image)
         plt.close(fig)
 
+    # Ensure tensors are on CPU before converting to numpy
+    attentions_for_rollout = [attn.cpu() for attn in attentions_for_rollout]
+
     # Calculate rollout
     rollout_mask = rollout(attentions_for_rollout, discard_ratio, head_fusion, num_prefix_tokens)
 
@@ -209,7 +210,6 @@ def visualize_attention(
 
     return visualizations, rollout_image
 
-
 # Create Gradio interface
 iface = gr.Interface(
     fn=visualize_attention,
@@ -231,5 +231,5 @@ iface = gr.Interface(
     description="Upload an image and select a timm model to visualize its attention maps."
 )
 
-# Launch the interface
-iface.launch(
+# Launch the interface
+iface.launch()
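
For reference, below is a minimal, self-contained sketch of the attention-rollout recurrence that the rollout() function in this diff implements: fuse the heads, mix in the identity matrix for the residual connection, row-normalize, and multiply the per-block matrices together. It is not the app's code; the inputs are synthetic (heads, tokens, tokens) maps, only mean head fusion is shown, rows are normalized with keepdim=True, and the discard_ratio pruning step is omitted. The final .cpu().numpy() call mirrors the device fix made in this commit.

import torch

def simple_rollout(attentions, num_prefix_tokens=1):
    device = attentions[0].device
    num_tokens = attentions[0].size(-1)
    result = torch.eye(num_tokens, device=device)
    with torch.no_grad():
        for attn in attentions:                                 # one (heads, N, N) map per block
            a = attn.mean(dim=0)                                # "mean" head fusion
            a = (a + torch.eye(num_tokens, device=device)) / 2  # fold in the residual connection
            a = a / a.sum(dim=-1, keepdim=True)                 # re-normalize each row
            result = a @ result                                 # accumulate attention across blocks
    # attention flowing from the first prefix (class) token to the patch tokens
    return result[0, num_prefix_tokens:]

# Fake data: 12 blocks, 6 heads, 197 tokens (1 class token + 14x14 patches)
fake_attn = [torch.softmax(torch.randn(6, 197, 197), dim=-1) for _ in range(12)]
mask = simple_rollout(fake_attn)
side = int(mask.numel() ** 0.5)
mask_np = mask.reshape(side, side).cpu().numpy()                # move to CPU before numpy, as in the fix
print(mask_np.shape)                                            # (14, 14)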