import torch
import torch.nn as nn


class FourDimensionalTransformer(nn.Module):
    """Transformer-style encoder over 4x4 spatial tokens plus learnable extra tokens,
    with a shared GRUCell applying a recurrent, brain-inspired update after every
    attention layer, followed by a linear classification head."""

    def __init__(self, num_layers=16, embed_dim=7, num_heads=1,
                 num_extra_tokens=16, num_classes=10):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_extra_tokens = num_extra_tokens

        # Input embedding layer to map the input to the desired embedding dimension.
        self.embedding = nn.Conv2d(3, embed_dim, kernel_size=1)

        # Learnable extra tokens (to augment the spatial tokens).
        self.extra_tokens = nn.Parameter(torch.randn(num_extra_tokens, embed_dim))

        # Build a stack of self-attention layers with layer normalization.
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
            for _ in range(num_layers)
        ])
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(embed_dim)
            for _ in range(num_layers)
        ])

        # GRU cell for recurrent updating, mimicking working memory or recurrent feedback.
        self.gru = nn.GRUCell(embed_dim, embed_dim)

        # Final classification head, applied to the flattened token sequence.
        # The 16 corresponds to the 4 * 4 spatial tokens of the expected 4x4 input.
        self.fc = nn.Linear((16 + num_extra_tokens) * embed_dim, num_classes)

    def forward(self, x):
        # x: [batch, 3, height=4, width=4]
        batch_size = x.size(0)

        # Embed the input: [batch, 3, height, width] -> [batch, embed_dim, height, width]
        x = self.embedding(x)

        # Flatten spatial dimensions: [batch, embed_dim, height, width] -> [batch, embed_dim, height * width],
        # then permute to [sequence_length, batch, embed_dim] for attention.
        x = x.view(batch_size, self.embed_dim, -1).permute(2, 0, 1)  # [height * width, batch, embed_dim]

        # Expand the extra tokens from [num_extra_tokens, embed_dim] to
        # [num_extra_tokens, batch, embed_dim] and concatenate along the sequence dim.
        extra_tokens = self.extra_tokens.unsqueeze(1).expand(-1, batch_size, -1)
        x = torch.cat([x, extra_tokens], dim=0)  # [height * width + num_extra_tokens, batch, embed_dim]

        # Process through the transformer layers with recurrent GRU updates.
        for attn, norm in zip(self.attention_layers, self.layer_norms):
            residual = x
            attn_out, _ = attn(x, x, x)
            # Residual connection and layer normalization.
            x = norm(residual + attn_out)

            # --- Brain-inspired recurrent update ---
            # Flatten the token and batch dimensions so the GRUCell processes all
            # tokens in parallel, using the same x_flat as both input and hidden state.
            seq_len, batch, embed_dim = x.shape
            x_flat = x.view(seq_len * batch, embed_dim)
            x_updated_flat = self.gru(x_flat, x_flat)
            x = x_updated_flat.view(seq_len, batch, embed_dim)
            # --- End recurrent update ---

        # Rearrange back to [batch, sequence_length, embed_dim] and flatten.
        x = x.permute(1, 0, 2).contiguous()
        x = x.view(batch_size, -1)

        # Final fully connected layer (classification head).
        out = self.fc(x)
        return out


# Example usage:
input_tensor = torch.rand(2, 3, 4, 4)  # [batch=2, channels=3, height=4, width=4]
model = FourDimensionalTransformer(num_layers=16, embed_dim=7, num_heads=1,
                                    num_extra_tokens=16, num_classes=10)
output = model(input_tensor)
print("Output shape:", output.shape)
print("Output:", output)
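
# Optional sanity check (a minimal sketch added for illustration, not part of the
# original script): with the defaults, the sequence reaching the classifier head is
# 4*4 spatial tokens + 16 extra tokens = 32 tokens of dimension 7, so self.fc expects
# 32 * 7 = 224 input features and the output should have shape [batch, num_classes].
with torch.no_grad():
    check = model(torch.rand(2, 3, 4, 4))
assert check.shape == (2, 10), f"unexpected output shape: {check.shape}"
print("Sanity check passed:", check.shape)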