Added model files and updated config.json
- .gitignore +3 -0
- LICENSE +21 -0
- config.json +1 -0
- data_loader.py +47 -0
- graph_construction.py +138 -0
- hubconf.py +23 -0
- model_components.py +115 -0
- requirements.txt +10 -0
- sag_vit_model.py +107 -0
- tests/test_graph_construction.py +39 -0
- tests/test_model_components.py +53 -0
- tests/test_sag_vit_model.py +39 -0
- tests/test_train.py +54 -0
- train.py +196 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+data/
+__pycache__
+tests/__pycache__
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Shravan Venkatraman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
config.json
CHANGED
@@ -1,4 +1,5 @@
 {
+  "model_type": "sag-vit",
   "d_model": 64,
   "dim_feedforward": 64,
   "gcn_hidden": 128,
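Usage note: the keys in config.json mirror the keyword arguments of SAGViTClassifier, and "model_type" is Hub metadata rather than a constructor argument. A minimal sketch of config-driven instantiation, assuming the keys not shown in this hunk also match constructor names:

import json
from sag_vit_model import SAGViTClassifier

with open("config.json") as f:
    cfg = json.load(f)
cfg.pop("model_type", None)  # metadata only, not a constructor argument
model = SAGViTClassifier(**cfg)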
data_loader.py
ADDED
@@ -0,0 +1,47 @@
+import os
+from torch.utils.data import DataLoader, random_split
+from torchvision import datasets, transforms
+
+def get_dataloaders(data_dir="path/to/data/dir", batch_size=512, train_split=0.8, img_size=224, num_workers=4):
+    """
+    Returns training and validation dataloaders for an image classification dataset.
+
+    Parameters:
+    - data_dir (str): Path to the directory containing image data in a folder structure compatible with ImageFolder.
+    - batch_size (int): Number of samples per batch.
+    - train_split (float): Fraction of data to use for training. The remainder is used for validation.
+    - img_size (int): Target size to which all images are resized after validation.
+    - num_workers (int): Number of worker processes for data loading.
+
+    Image Size Validation:
+    - Minimum allowed image size: 49x49 pixels.
+    - If img_size is less than 49 pixels, a ValueError is raised.
+
+    Returns:
+    - train_dataloader (DataLoader): DataLoader for the training split.
+    - val_dataloader (DataLoader): DataLoader for the validation split.
+    """
+
+    # Check if the provided image size is valid
+    if img_size < 49:
+        raise ValueError(f"Image size must be at least 49x49 pixels, but got {img_size}x{img_size}.")
+
+    transform = transforms.Compose([
+        transforms.Resize((img_size, img_size)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+
+    # Load full dataset
+    full_dataset = datasets.ImageFolder(root=data_dir, transform=transform)
+
+    # Split into training and validation sets
+    train_size = int(train_split * len(full_dataset))
+    val_size = len(full_dataset) - train_size
+    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
+
+    # Create dataloaders
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
+    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
+
+    return train_dataloader, val_dataloader
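Usage note: a minimal sketch of get_dataloaders; the directory path below is a placeholder and must follow the ImageFolder layout (one subfolder per class).

from data_loader import get_dataloaders

train_loader, val_loader = get_dataloaders(
    data_dir="data/my_dataset",  # placeholder path, ImageFolder-style layout
    batch_size=32,
    train_split=0.8,
    img_size=224,
    num_workers=2,
)
images, labels = next(iter(train_loader))
print(images.shape)  # e.g. torch.Size([32, 3, 224, 224])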
graph_construction.py
ADDED
@@ -0,0 +1,138 @@
+import torch
+import networkx as nx
+from torch_geometric.utils import from_networkx
+
+####################################################################
+# These functions reflect the methods described in Section 3.1 and 3.2
+# of the SAG-ViT paper, where high-fidelity feature patches are extracted
+# from the CNN feature maps and organized into a graph structure.
+####################################################################
+
+def extract_patches(feature_map, patch_size=(4, 4)):
+    """
+    Extracts non-overlapping patches from a feature map to form nodes in a graph.
+
+    Parameters:
+    - feature_map (Tensor): The feature map from the CNN of shape (B, C, H', W').
+      H' and W' are reduced spatial dimensions after CNN feature extraction.
+    - patch_size (tuple): Spatial size (height, width) of each patch.
+
+    Returns:
+    - patches (Tensor): Tensor of shape (B, N, C, patch_h, patch_w), where N is the number of patches per image.
+    """
+    b, c, h, w = feature_map.size()
+    patch_h, patch_w = patch_size
+
+    # Unfold extracts sliding patches; here we align so that they are non-overlapping
+    patches = feature_map.unfold(2, patch_h, patch_h).unfold(3, patch_w, patch_w)
+
+    # Rearrange to have patches as separate units
+    patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
+    patches = patches.view(b, -1, c, patch_h, patch_w)
+    return patches
+
+def construct_graph_from_patch(patch_index, patch_shape, image_shape):
+    """
+    Constructs edges between patch nodes based on spatial adjacency (k-connectivity).
+    This follows the approach described in Section 3.2 of SAG-ViT, where patches
+    are arranged in a grid and connected to their spatial neighbors.
+
+    Parameters:
+    - patch_index (int): Index of the current patch node.
+    - patch_shape (tuple): (patch_height, patch_width).
+    - image_shape (tuple): (height, width) of the feature map.
+
+    Returns:
+    - G (nx.Graph): A graph with a single node and edges to its neighbors (to be composed globally).
+    """
+    G = nx.Graph()
+
+    # Compute grid dimensions (how many patches along height and width)
+    grid_height = image_shape[0] // patch_shape[0]
+    grid_width = image_shape[1] // patch_shape[1]
+
+    # Current node index in a flattened grid
+    current_node = patch_index
+
+    G.add_node(current_node)
+
+    # 8-neighborhood connectivity (up, down, left, right, diagonals)
+    neighbor_offsets = [(-1, 0), (1, 0), (0, -1), (0, 1),
+                        (-1, -1), (-1, 1), (1, -1), (1, 1)]
+
+    # Recover row, col from patch_index
+    row = current_node // grid_width
+    col = current_node % grid_width
+
+    for dr, dc in neighbor_offsets:
+        neighbor_row = row + dr
+        neighbor_col = col + dc
+        if 0 <= neighbor_row < grid_height and 0 <= neighbor_col < grid_width:
+            neighbor_node = neighbor_row * grid_width + neighbor_col
+            G.add_edge(current_node, neighbor_node)
+
+    return G
+
+def build_graph_from_patches(feature_map, patch_size=(4,4)):
+    """
+    Builds a global graph for each image in the batch, where each node corresponds
+    to a patch, and edges represent spatial adjacency. This graph captures local
+    spatial relationships of the patches, as outlined in Sections 3.1 and 3.2 of SAG-ViT.
+
+    Parameters:
+    - feature_map (Tensor): CNN output (B, C, H', W').
+    - patch_size (tuple): Size of each patch (patch_h, patch_w).
+
+    Returns:
+    - G_global_batch (list): A list of NetworkX graphs, one per image in the batch.
+    - patches (Tensor): The extracted patches (B, N, C, patch_h, patch_w).
+    """
+    patches = extract_patches(feature_map, patch_size)
+    batch_size = patches.size(0)
+
+    grid_height = feature_map.size(2) // patch_size[0]
+    grid_width = feature_map.size(3) // patch_size[1]
+    num_patches = grid_height * grid_width
+
+    G_global_batch = []
+    for batch_idx in range(batch_size):
+        G_global = nx.Graph()
+        # Construct a global graph by composing individual patch-based graphs
+        for patch_idx in range(num_patches):
+            G_patch = construct_graph_from_patch(
+                patch_index=patch_idx,
+                patch_shape=patch_size,
+                image_shape=(feature_map.size(2), feature_map.size(3))
+            )
+            G_global = nx.compose(G_global, G_patch)
+        G_global_batch.append(G_global)
+
+    return G_global_batch, patches
+
+def build_graph_data_from_patches(G_global_batch, patches):
+    """
+    Converts NetworkX graphs and associated patches into PyTorch Geometric Data objects.
+    Each node corresponds to a patch vectorized into a feature node embedding.
+
+    Parameters:
+    - G_global_batch (list): List of global graphs (one per image) in NetworkX form.
+    - patches (Tensor): (B, N, C, patch_h, patch_w) patch tensor.
+
+    Returns:
+    - data_list (list): List of PyTorch Geometric Data objects, where data.x are node features,
+      and data.edge_index is the adjacency from the constructed graph.
+    """
+    from_networkx_ = from_networkx  # local alias to avoid confusion
+
+    data_list = []
+    batch_size, num_patches, channels, patch_h, patch_w = patches.size()
+
+    for batch_idx, G_global in enumerate(G_global_batch):
+        # Flatten each patch into a feature vector
+        node_features = patches[batch_idx].view(num_patches, -1)
+
+        G_pygeom = from_networkx_(G_global)
+        G_pygeom.x = node_features
+        data_list.append(G_pygeom)
+
+    return data_list
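Usage note: the three functions chain together; a short sketch on a dummy feature map (same sizes as the unit tests use):

import torch
from graph_construction import build_graph_from_patches, build_graph_data_from_patches

feature_map = torch.randn(2, 16, 32, 32)   # (B, C, H', W')
graphs, patches = build_graph_from_patches(feature_map, patch_size=(4, 4))
data_list = build_graph_data_from_patches(graphs, patches)

print(patches.shape)         # (2, 64, 16, 4, 4): an 8x8 grid of 4x4 patches per image
print(data_list[0].x.shape)  # (64, 256): each node is a flattened 16*4*4 patch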
hubconf.py
ADDED
@@ -0,0 +1,23 @@
+dependencies = ['torch']
+
+from sag_vit_model import SAGViTClassifier
+import torch
+
+def SAGViT(pretrained=False, **kwargs):
+    """
+    SAG-ViT model endpoint.
+    Args:
+        pretrained (bool): If True, loads pretrained weights.
+        **kwargs: Additional arguments for the model.
+    Returns:
+        model (nn.Module): The SAG-ViT model as proposed in the
+        paper: SAG-ViT: A Scale-Aware, High-Fidelity Patching
+        Approach with Graph Attention for Vision Transformers.
+        https://doi.org/10.48550/arXiv.2411.09420
+    """
+    model = SAGViTClassifier(**kwargs)
+    if pretrained:
+        checkpoint = ''
+        state_dict = torch.hub.load_state_dict_from_url(checkpoint, progress=True)
+        model.load_state_dict(state_dict)
+    return model
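Usage note: with hubconf.py at the repository root, the entry point can be loaded through torch.hub. The GitHub path below is a placeholder, and pretrained=True is not usable yet because the checkpoint URL in hubconf.py is still empty.

import torch

# "<github-user>/SAG-ViT" is a hypothetical repo path, not confirmed by this commit.
model = torch.hub.load("<github-user>/SAG-ViT", "SAGViT", pretrained=False, num_classes=10)
model.eval()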
model_components.py
ADDED
@@ -0,0 +1,115 @@
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch_geometric.nn import GATConv, global_mean_pool
+
+from torchvision import models
+
+###############################################################
+# These modules correspond to core building blocks of SAG-ViT:
+# 1. A CNN feature extractor for high-fidelity multi-scale feature maps.
+# 2. A Graph Attention Network (GAT) to refine patch embeddings.
+# 3. A Transformer Encoder to capture global long-range dependencies.
+# 4. An MLP classifier head.
+###############################################################
+
+class EfficientNetV2FeatureExtractor(nn.Module):
+    """
+    Extracts multi-scale, spatially-rich, and semantically-meaningful feature maps
+    from images using a pre-trained EfficientNetV2-S model. This corresponds
+    to Section 3.1, where a CNN backbone (EfficientNetV2-S) is used to produce rich
+    feature maps that preserve semantic information at multiple scales.
+    """
+    def __init__(self, pretrained=False):
+        super(EfficientNetV2FeatureExtractor, self).__init__()
+
+        # Load EfficientNetV2-S with pretrained weights
+        efficientnet = models.efficientnet_v2_s(
+            weights="IMAGENET1K_V1" if pretrained else None
+        )
+
+        # Extract layers up to the last block before downsampling below 16x16
+        self.extractor = nn.Sequential(*list(efficientnet.features.children())[:-2])
+
+
+    def forward(self, x):
+        """
+        Forward pass through the CNN backbone.
+
+        Input:
+        - x (Tensor): Input images of shape (B, 3, H, W)
+
+        Output:
+        - features (Tensor): Extracted feature map of shape (B, C, H', W'),
+          where H' and W' are reduced spatial dimensions.
+        """
+        features = self.extractor(x)
+        return features
+
+class GATGNN(nn.Module):
+    """
+    A Graph Attention Network (GAT) that processes patch-graph embeddings.
+    This module corresponds to the Graph Attention stage (Section 3.3),
+    refining local relationships between patches in a learned manner.
+    """
+    def __init__(self, in_channels, hidden_channels, out_channels, heads=8):
+        super(GATGNN, self).__init__()
+        # GAT layers:
+        # First layer maps raw patch embeddings to a higher-level representation.
+        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
+        # Second layer produces final node embeddings with a single head.
+        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)
+        self.pool = global_mean_pool
+
+    def forward(self, data):
+        """
+        Input:
+        - data (PyG Data): Contains x (node features), edge_index (graph edges), and batch indexing.
+
+        Output:
+        - x (Tensor): Aggregated graph-level embedding after mean pooling.
+        """
+        x, edge_index, batch = data.x, data.edge_index, data.batch
+        x = F.elu(self.conv1(x, edge_index))
+        x = self.conv2(x, edge_index)
+        x = self.pool(x, batch)
+        return x
+
+class TransformerEncoder(nn.Module):
+    """
+    A Transformer encoder to capture long-range dependencies among patch embeddings.
+    Integrates global dependencies after GAT processing, as per Section 3.3.
+    """
+    def __init__(self, d_model, nhead, num_layers, dim_feedforward):
+        super(TransformerEncoder, self).__init__()
+        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+
+    def forward(self, x):
+        """
+        Input:
+        - x (Tensor): Sequence of patch embeddings with shape (B, N, D).
+
+        Output:
+        - (Tensor): Transformed embeddings with global relationships integrated (B, N, D).
+        """
+        # The Transformer expects (N, B, D), so transpose first
+        x = x.transpose(0, 1)  # (N, B, D)
+        x = self.transformer_encoder(x)
+        x = x.transpose(0, 1)  # (B, N, D)
+        return x
+
+class MLPBlock(nn.Module):
+    """
+    An MLP classification head to map final global embeddings to classification logits.
+    """
+    def __init__(self, in_features, hidden_features, out_features):
+        super(MLPBlock, self).__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(in_features, hidden_features),
+            nn.ReLU(),
+            nn.Linear(hidden_features, out_features)
+        )
+
+    def forward(self, x):
+        return self.mlp(x)
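Shape note: a sketch, under the assumption that torchvision's EfficientNetV2-S keeps its current stage layout. For a 224x224 input the truncated extractor should emit roughly a (B, 160, 14, 14) map, and flattening one 4x4 patch then gives 160 * 4 * 4 = 2560 features, which is where the in_channels=2560 default in sag_vit_model.py comes from.

import torch
from model_components import EfficientNetV2FeatureExtractor

extractor = EfficientNetV2FeatureExtractor()
extractor.eval()
with torch.no_grad():
    fmap = extractor(torch.randn(1, 3, 224, 224))
print(fmap.shape)            # expected around torch.Size([1, 160, 14, 14])
print(fmap.size(1) * 4 * 4)  # expected 2560, matching the GAT in_channels default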
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+numpy==1.26.4
+pandas==2.2.3
+matplotlib==3.7.5
+seaborn==0.12.2
+tqdm==4.66.4
+scikit-learn==1.2.2
+torch==2.4.0
+torch-geometric==2.6.1
+torchvision==0.19.0
+networkx==3.3
sag_vit_model.py
ADDED
@@ -0,0 +1,107 @@
+import torch
+from torch import nn
+from huggingface_hub import PyTorchModelHubMixin
+
+from torch_geometric.data import Batch
+from model_components import EfficientNetV2FeatureExtractor, GATGNN, TransformerEncoder, MLPBlock
+from graph_construction import build_graph_from_patches, build_graph_data_from_patches
+
+###############################################################################
+# SAG-ViT Model:
+# This class combines:
+# 1) CNN backbone to produce high-fidelity feature maps (Section 3.1),
+# 2) Graph construction and GAT to refine local patch embeddings (Section 3.2 and 3.3),
+# 3) A Transformer encoder to capture global relationships (Section 3.3),
+# 4) A final MLP classifier.
+###############################################################################
+
+class SAGViTClassifier(nn.Module, PyTorchModelHubMixin):
+    """
+    SAG-ViT: Scale-Aware Graph Attention Vision Transformer
+
+    This model integrates the following steps:
+    - Extract multi-scale features from images using a CNN backbone (EfficientNetv2 here).
+    - Partition the feature map into patches and build a graph where each node is a patch.
+    - Use a Graph Attention Network (GAT) to refine patch embeddings based on local spatial relationships.
+    - Utilize a Transformer encoder to model long-range dependencies and integrate multi-scale information.
+    - Finally, classify the resulting representation into desired classes.
+
+    Inputs:
+    - x (Tensor): Input images (B, 3, H, W)
+
+    Outputs:
+    - out (Tensor): Classification logits (B, num_classes)
+    """
+    def __init__(
+        self,
+        patch_size=(4,4),
+        num_classes=10,
+        d_model=64,
+        nhead=4,
+        num_layers=2,
+        dim_feedforward=64,
+        hidden_mlp_features=64,
+        in_channels=2560,  # Derived from patch dimensions and CNN output channels
+        gcn_hidden=128,
+        gcn_out=64
+    ):
+        super(SAGViTClassifier, self).__init__()
+
+        # CNN feature extractor (frozen pre-trained EfficientNetv2)
+        self.cnn = EfficientNetV2FeatureExtractor()
+
+        # Graph Attention Network to process patch embeddings
+        self.gcn = GATGNN(in_channels=in_channels, hidden_channels=gcn_hidden, out_channels=gcn_out)
+
+        # Learnable positional embedding for Transformer input
+        self.positional_embedding = nn.Parameter(torch.randn(1, 1, d_model))
+        # Extra embedding token (similar to a class token) to summarize global info
+        self.extra_embedding = nn.Parameter(torch.randn(1, d_model))
+
+        # Transformer encoder to capture long-range global dependencies
+        self.transformer_encoder = TransformerEncoder(d_model, nhead, num_layers, dim_feedforward)
+
+        # MLP classification head
+        self.mlp = MLPBlock(d_model, hidden_mlp_features, num_classes)
+
+        self.patch_size = patch_size
+
+    def forward(self, x):
+        # Step 1: High-fidelity feature extraction from CNN
+        feature_map = self.cnn(x)
+
+        # Step 2: Build graphs from patches
+        G_global_batch, patches = build_graph_from_patches(feature_map, self.patch_size)
+
+        # Step 3: Convert to PyG Data format and batch
+        data_list = build_graph_data_from_patches(G_global_batch, patches)
+        device = x.device
+        batch = Batch.from_data_list(data_list).to(device)
+
+        # Step 4: GAT stage
+        x_gcn = self.gcn(batch)
+
+        # Step 5: Prepare the GAT output for the Transformer.
+        # The GAT's global mean pooling has already aggregated the patch nodes per image.
+        B = x.size(0)
+        D = x_gcn.size(-1)
+        # So x_gcn is (B, D) rather than (B, N, D).
+        # We need a sequence dimension for the Transformer, so we treat each
+        # image-level embedding as one "patch token" plus an extra token below:
+        patch_embeddings = x_gcn.unsqueeze(1)  # (B, 1, D)
+
+        # Add positional embedding
+        patch_embeddings = patch_embeddings + self.positional_embedding  # (B, 1, D)
+
+        # Add an extra learnable embedding (like a CLS token)
+        patch_embeddings = torch.cat([patch_embeddings, self.extra_embedding.unsqueeze(0).expand(B, -1, -1)], dim=1)  # (B, 2, D)
+
+        # Step 6: Transformer encoder
+        x_trans = self.transformer_encoder(patch_embeddings)
+
+        # Step 7: Global pooling (here we just take the mean)
+        x_pooled = x_trans.mean(dim=1)  # (B, D)
+
+        # Classification
+        out = self.mlp(x_pooled)
+        return out
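Usage note: an end-to-end sketch with random weights and a dummy batch; because the class mixes in PyTorchModelHubMixin, from_pretrained also becomes available once a checkpoint is published.

import torch
from sag_vit_model import SAGViTClassifier

model = SAGViTClassifier(num_classes=10)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(2, 3, 224, 224))
print(logits.shape)  # torch.Size([2, 10])

# Once a checkpoint exists on the Hub (see train.py), loading would look like:
# model = SAGViTClassifier.from_pretrained("shravvvv/SAG-ViT")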
tests/test_graph_construction.py
ADDED
@@ -0,0 +1,39 @@
+import unittest
+import torch
+import networkx as nx
+from graph_construction import extract_patches, build_graph_from_patches, build_graph_data_from_patches
+
+class TestGraphConstruction(unittest.TestCase):
+    def test_extract_patches_shape(self):
+        # Create a dummy feature map: B=2, C=16, H=32, W=32
+        feature_map = torch.randn(2, 16, 32, 32)
+        patches = extract_patches(feature_map, patch_size=(4,4))
+        # Check dimensions: after extraction,
+        # number_of_patches = (H/4)*(W/4) = 8*8=64 per image, total 2*64=128
+        self.assertEqual(patches.shape, (2, 64, 16, 4, 4))
+
+    def test_build_graph_from_patches_graph_structure(self):
+        feature_map = torch.randn(1, 16, 32, 32)
+        G_batch, patches = build_graph_from_patches(feature_map, patch_size=(4,4))
+        # 1 image => G_batch[0] is the graph
+        G = G_batch[0]
+        # We have 64 patches
+        self.assertEqual(len(G.nodes), 64)
+        # Check if edges exist (8-neighborhood).
+        # Interior nodes should have edges to neighbors.
+        # Just check a random node in the middle
+        node_index = 9  # assuming row=1, col=1 in an 8x8 grid
+        self.assertTrue(len(list(G.neighbors(node_index))) > 0)
+
+    def test_build_graph_data_from_patches_conversion(self):
+        feature_map = torch.randn(2, 16, 32, 32)
+        G_batch, patches = build_graph_from_patches(feature_map, patch_size=(4,4))
+        data_list = build_graph_data_from_patches(G_batch, patches)
+        self.assertEqual(len(data_list), 2)
+        # Check node feature shape
+        self.assertEqual(data_list[0].x.shape[1], 16*4*4)  # C * patch_h * patch_w = 16*4*4=256
+        # Check edges are present
+        self.assertTrue(data_list[0].edge_index.shape[1] > 0)
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_model_components.py
ADDED
@@ -0,0 +1,53 @@
+import unittest
+import torch
+from model_components import EfficientNetV2FeatureExtractor, GATGNN, TransformerEncoder, MLPBlock
+from torch_geometric.data import Data
+
+class TestModelComponents(unittest.TestCase):
+    def test_efficientnetv2_extractor_output_shape(self):
+        model = EfficientNetV2FeatureExtractor()
+        model.eval()
+        x = torch.randn(2, 3, 224, 224)
+        with torch.no_grad():
+            features = model(x)
+        # Check output shape - depends on the chosen EfficientNetV2 intermediate layer
+        # Example: roughly (2, 160, 14, 14) for the truncated EfficientNetV2-S backbone
+        self.assertEqual(features.size(0), 2)
+        self.assertTrue(features.size(1) > 0)
+        self.assertTrue(features.size(2) > 0)
+        self.assertTrue(features.size(3) > 0)
+
+    def test_gatgnn_forward(self):
+        # Graph with 4 nodes, each node feature dim=256
+        x = torch.randn(4, 256)
+        edge_index = torch.tensor([[0,1,1,2],[1,0,2,3]], dtype=torch.long)
+        batch = torch.tensor([0,0,0,0])
+        data = Data(x=x, edge_index=edge_index, batch=batch)
+
+        gnn = GATGNN(in_channels=256, hidden_channels=64, out_channels=32)
+        output = gnn(data)
+        # After pooling: should be (batch_size, out_channels) = (1,32)
+        self.assertEqual(output.shape, (1, 32))
+
+    def test_transformer_encoder(self):
+        # (B, N, D) = (2, 10, 64)
+        x = torch.randn(2, 10, 64)
+        encoder = TransformerEncoder(d_model=64, nhead=4, num_layers=2, dim_feedforward=64)
+        out = encoder(x)
+        # same shape as input
+        self.assertEqual(out.shape, (2, 10, 64))
+
+    def test_mlp_block(self):
+        mlp = MLPBlock(in_features=64, hidden_features=128, out_features=10)
+        x = torch.randn(2, 64)
+        out = mlp(x)
+        self.assertEqual(out.shape, (2,10))
+
+    def test_efficientnetv2_freeze(self):
+        # Ensure params are frozen
+        model = EfficientNetV2FeatureExtractor()
+        for param in model.parameters():
+            self.assertFalse(param.requires_grad)
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_sag_vit_model.py
ADDED
@@ -0,0 +1,39 @@
+import unittest
+import torch
+from sag_vit_model import SAGViTClassifier
+
+class TestSAGViTModel(unittest.TestCase):
+    def test_forward_pass(self):
+        model = SAGViTClassifier(
+            patch_size=(4,4),
+            num_classes=10,  # smaller num classes for test
+            d_model=64,
+            nhead=4,
+            num_layers=2,
+            dim_feedforward=64,
+            hidden_mlp_features=64,
+            in_channels=2560,  # from patch dimension example
+            gcn_hidden=128,
+            gcn_out=64
+        )
+        model.eval()
+        x = torch.randn(2, 3, 224, 224)
+        with torch.no_grad():
+            out = model(x)
+        # Check output shape: (B, num_classes) = (2,10)
+        self.assertEqual(out.shape, (2,10))
+
+    def test_empty_input(self):
+        model = SAGViTClassifier()
+        # Passing an empty tensor should fail gracefully
+        with self.assertRaises(Exception):
+            model(torch.empty(0,3,224,224))
+
+    def test_invalid_input_dimensions(self):
+        model = SAGViTClassifier()
+        # Incorrect dimension (e.g., missing channel)
+        with self.assertRaises(RuntimeError):
+            model(torch.randn(2, 224, 224))  # no channel dimension
+
+if __name__ == '__main__':
+    unittest.main()
tests/test_train.py
ADDED
@@ -0,0 +1,54 @@
+import unittest
+from unittest.mock import MagicMock, patch
+import torch
+import torch.nn as nn
+from train import train_model
+from sag_vit_model import SAGViTClassifier
+
+class TestTrain(unittest.TestCase):
+    @patch("train.optim.Adam")
+    def test_train_model_loop(self, mock_adam):
+        # Mock the optimizer
+        mock_optimizer = MagicMock()
+        mock_adam.return_value = mock_optimizer
+
+        # Mock dataloaders with a small dummy dataset
+        # Just one batch with a couple of samples
+        train_dataloader = [ (torch.randn(2,3,224,224), torch.tensor([0,1])) ]
+        val_dataloader = [ (torch.randn(2,3,224,224), torch.tensor([0,1])) ]
+
+        model = SAGViTClassifier(num_classes=2)
+
+        criterion = nn.CrossEntropyLoss()
+        device = torch.device("cpu")
+
+        # Test a single epoch of training
+        history = train_model(model, "TestModel", train_dataloader, val_dataloader,
+                              num_epochs=1, criterion=criterion, optimizer=mock_optimizer, device=device, patience=2, verbose=False)
+
+        # Check if history is properly recorded
+        self.assertIn("train_loss", history)
+        self.assertIn("val_loss", history)
+        self.assertGreaterEqual(len(history["train_loss"]), 1)
+        self.assertGreaterEqual(len(history["val_loss"]), 1)
+
+    def test_early_stopping(self):
+        # Mocking dataloaders where validation loss doesn't improve
+        model = SAGViTClassifier(num_classes=2)
+        criterion = nn.CrossEntropyLoss()
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+        device = torch.device("cpu")
+
+        # create a scenario where val loss won't improve:
+        # first epoch normal, second epoch slightly worse
+        train_dataloader = [ (torch.randn(2,3,224,224), torch.tensor([0,1])) ]
+        val_dataloader = [ (torch.randn(2,3,224,224), torch.tensor([0,1])) ]
+
+        history = train_model(model, "TestModelEarlyStop", train_dataloader, val_dataloader,
+                              num_epochs=5, criterion=criterion, optimizer=optimizer, device=device, patience=1, verbose=False)
+
+        # Should have triggered early stopping before all 5 epochs
+        self.assertLessEqual(len(history["train_loss"]), 5)
+
+if __name__ == '__main__':
+    unittest.main()
train.py
ADDED
@@ -0,0 +1,196 @@
+import os
+import torch
+from torch import nn, optim
+from tqdm import tqdm
+from huggingface_hub import HfApi
+import numpy as np
+from sklearn.metrics import (precision_score, recall_score, f1_score,
+                             roc_auc_score, cohen_kappa_score, matthews_corrcoef,
+                             confusion_matrix)
+
+from sag_vit_model import SAGViTClassifier
+from data_loader import get_dataloaders
+
+#####################################################################
+# This file provides the training loop and metric computation. It uses
+# the SAG-ViT model defined in sag_vit_model.py, and the data from data_loader.py.
+# The training loop is adapted to implement early stopping and track various metrics.
+#####################################################################
+
+def train_model(model, model_name, train_loader, val_loader, num_epochs, criterion, optimizer, device, patience=8, verbose=True):
+    """
+    Trains the SAG-ViT model and evaluates it on the validation set.
+    Implements early stopping based on validation loss.
+
+    Parameters:
+    - model (nn.Module): The SAG-ViT model.
+    - model_name (str): A name to identify the model (used for saving checkpoints).
+    - train_loader, val_loader: DataLoaders for training and validation.
+    - num_epochs (int): Maximum number of epochs.
+    - criterion (nn.Module): Loss function.
+    - optimizer (torch.optim.Optimizer): Optimization algorithm.
+    - device (torch.device): Device to run the computations on (CPU/GPU).
+    - patience (int): Early stopping patience.
+
+    Returns:
+    - history (dict): Dictionary containing training and validation metrics per epoch.
+    """
+
+    history = {
+        'train_loss': [], 'train_acc': [], 'train_prec': [], 'train_rec': [], 'train_f1': [],
+        'train_auc': [], 'train_mcc': [], 'train_cohen_kappa': [], 'train_confusion_matrix': [],
+        'val_loss': [], 'val_acc': [], 'val_prec': [], 'val_rec': [], 'val_f1': [],
+        'val_auc': [], 'val_mcc': [], 'val_cohen_kappa': [], 'val_confusion_matrix': []
+    }
+
+    best_val_loss = float('inf')
+    patience_counter = 0
+    best_model_state = None
+
+    for epoch in range(num_epochs):
+        print(f'Epoch {epoch+1}/{num_epochs}')
+        model.train()
+
+        train_loss_total, correct, total = 0, 0, 0
+        all_preds, all_labels, all_probs = [], [], []
+
+        # Training loop
+        for batch_idx, (X, y) in enumerate(tqdm(train_loader)):
+            inputs, labels = X.to(device), y.to(device)
+            optimizer.zero_grad()
+
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+
+            train_loss_total += loss.item()
+
+            probs = torch.softmax(outputs, dim=1)
+            _, preds = torch.max(outputs, 1)
+            correct += (preds == labels).sum().item()
+            total += labels.size(0)
+
+            all_preds.extend(preds.cpu().numpy())
+            all_labels.extend(labels.cpu().numpy())
+            all_probs.extend(probs.detach().cpu().numpy())
+
+        # Compute training metrics
+        train_acc = correct / total
+        train_prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
+        train_rec = recall_score(all_labels, all_preds, average='macro')
+        train_f1 = f1_score(all_labels, all_preds, average='macro')
+        train_cohen_kappa = cohen_kappa_score(all_labels, all_preds)
+        train_mcc = matthews_corrcoef(all_labels, all_preds)
+        train_confusion = confusion_matrix(all_labels, all_preds)
+
+        history['train_loss'].append(train_loss_total / len(train_loader))
+        history['train_acc'].append(train_acc)
+        history['train_prec'].append(train_prec)
+        history['train_rec'].append(train_rec)
+        history['train_f1'].append(train_f1)
+        history['train_cohen_kappa'].append(train_cohen_kappa)
+        history['train_mcc'].append(train_mcc)
+        history['train_confusion_matrix'].append(train_confusion)
+
+        # Validation
+        model.eval()
+        val_loss_total, correct, total = 0, 0, 0
+        all_preds, all_labels, all_probs = [], [], []
+
+        with torch.no_grad():
+            for batch_idx, (X, y) in enumerate(tqdm(val_loader)):
+                inputs, labels = X.to(device), y.to(device)
+                outputs = model(inputs)
+                loss = criterion(outputs, labels)
+
+                val_loss_total += loss.item()
+                probs = torch.softmax(outputs, dim=1)
+                _, preds = torch.max(outputs, 1)
+                correct += (preds == labels).sum().item()
+                total += labels.size(0)
+
+                all_preds.extend(preds.cpu().numpy())
+                all_labels.extend(labels.cpu().numpy())
+                all_probs.extend(probs.detach().cpu().numpy())
+
+        # Compute validation metrics
+        val_acc = correct / total
+        val_prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
+        val_rec = recall_score(all_labels, all_preds, average='macro')
+        val_f1 = f1_score(all_labels, all_preds, average='macro')
+        val_cohen_kappa = cohen_kappa_score(all_labels, all_preds)
+        val_mcc = matthews_corrcoef(all_labels, all_preds)
+        val_confusion = confusion_matrix(all_labels, all_preds)
+
+        history['val_loss'].append(val_loss_total / len(val_loader))
+        history['val_acc'].append(val_acc)
+        history['val_prec'].append(val_prec)
+        history['val_rec'].append(val_rec)
+        history['val_f1'].append(val_f1)
+        history['val_cohen_kappa'].append(val_cohen_kappa)
+        history['val_mcc'].append(val_mcc)
+        history['val_confusion_matrix'].append(val_confusion)
+
+        # Print epoch summary
+        if verbose:
+            print(f"Train Loss: {history['train_loss'][-1]:.4f}, Train Acc: {history['train_acc'][-1]:.4f}, "
+                  f"Val Loss: {history['val_loss'][-1]:.4f}, Val Acc: {history['val_acc'][-1]:.4f}")
+
+        # Early stopping
+        current_val_loss = history['val_loss'][-1]
+        if current_val_loss < best_val_loss:
+            best_val_loss = current_val_loss
+            best_model_state = model.state_dict()
+            patience_counter = 0
+        else:
+            patience_counter += 1
+            print(f"Patience counter: {patience_counter}/{patience}")
+            if patience_counter >= patience:
+                print("Early stopping triggered.")
+                model.load_state_dict(best_model_state)
+                torch.save(model.state_dict(), f'{model_name}.pth')
+                return history
+
+    model.load_state_dict(best_model_state)
+    torch.save(model.state_dict(), f'{model_name}.pth')
+
+    return history
+
+
+if __name__ == "__main__":
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"Training on device: {device}")
+    data_dir = "data/PlantVillage"  # "path/to/data/dir"
+    num_classes = len(os.listdir(data_dir))
+    train_loader, val_loader = get_dataloaders(data_dir=data_dir, img_size=224, batch_size=32)  # Minimum image size should be at least (49, 49)
+
+    model = SAGViTClassifier(num_classes=num_classes).to(device)
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.0001)
+    num_epochs = 100
+
+    history = train_model(
+        model,
+        'SAG-ViT',
+        train_loader,
+        val_loader,
+        num_epochs,
+        criterion,
+        optimizer,
+        device
+    )
+
+    # You may save history to a CSV or analyze it further as needed.
+    # Example:
+    # import pandas as pd
+    # history_df = pd.DataFrame(history)
+    # history_df.to_csv("training_history.csv", index=False)
+
+    # Load the saved model back (best practice before pushing)
+    model.load_state_dict(torch.load("SAG-ViT.pth"))
+    model.eval()
+
+    # Push the model to the Hugging Face Hub
+    model.push_to_hub("shravvvv/SAG-ViT", commit_message="Initial model push", private=True, trust_remote_code=True)
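Note: the history dict reserves 'train_auc' and 'val_auc' keys and the loop already collects class probabilities in all_probs, but AUC is never computed in this version. A hedged sketch of how those keys could be populated with scikit-learn's one-vs-rest multi-class AUC:

import numpy as np
from sklearn.metrics import roc_auc_score

def compute_auc(all_labels, all_probs):
    # Macro-averaged one-vs-rest AUC over the collected softmax probabilities.
    try:
        return roc_auc_score(all_labels, np.asarray(all_probs), multi_class="ovr", average="macro")
    except ValueError:
        return float("nan")  # e.g. a class absent from this epoch's labels

# Inside train_model, after the existing metric blocks, one could then append:
# history['train_auc'].append(compute_auc(all_labels, all_probs))
# history['val_auc'].append(compute_auc(all_labels, all_probs))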