import torch
import torch.nn as nn


class FourDimensionalTransformer(nn.Module):
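    """Small attention-based classifier for 4x4 RGB inputs.

    The input is embedded with a 1x1 convolution, flattened into 16 spatial
    tokens, and augmented with learnable extra tokens. A stack of
    self-attention layers, each followed by a GRUCell-based recurrent update,
    processes the token sequence, and a linear head classifies the flattened
    result. Note that embed_dim must be divisible by num_heads, and the
    classification head assumes 4x4 spatial inputs (16 spatial tokens).
    """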
def __init__(self, num_layers=16, embed_dim=7, num_heads=1, num_extra_tokens=16, num_classes=10):
super(FourDimensionalTransformer, self).__init__()
self.embed_dim = embed_dim
self.num_extra_tokens = num_extra_tokens
# Input embedding layer to map the input to the desired embedding dimension.
self.embedding = nn.Conv2d(3, embed_dim, kernel_size=1)
# Learnable extra tokens (to augment the spatial tokens).
self.extra_tokens = nn.Parameter(torch.randn(num_extra_tokens, embed_dim))
# Build a stack of self-attention layers with layer normalization.
self.attention_layers = nn.ModuleList([
nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
for _ in range(num_layers)
])
self.layer_norms = nn.ModuleList([
nn.LayerNorm(embed_dim)
for _ in range(num_layers)
])
        # GRU cell used for a recurrent update after each attention layer,
        # loosely mimicking working memory or recurrent feedback.
self.gru = nn.GRUCell(embed_dim, embed_dim)
        # Final classification head applied to the flattened token sequence.
        # The 16 here is the number of spatial tokens (4 x 4), so this head
        # assumes 4x4 spatial inputs, as in the example usage below.
        self.fc = nn.Linear((16 + num_extra_tokens) * embed_dim, num_classes)

def forward(self, x):
# x: [batch, 3, height=4, width=4]
batch_size = x.size(0)
# Embed the input: [batch, 3, height, width] -> [batch, embed_dim, height, width]
x = self.embedding(x)
# Flatten spatial dimensions: [batch, embed_dim, height, width] -> [batch, embed_dim, height * width]
# Then permute to [sequence_length, batch, embed_dim] for attention.
x = x.view(batch_size, self.embed_dim, -1).permute(2, 0, 1) # [height * width, batch, embed_dim]
# Expand and concatenate extra tokens: extra_tokens [num_extra_tokens, embed_dim]
# becomes [num_extra_tokens, batch, embed_dim] and concatenated along sequence dim.
extra_tokens = self.extra_tokens.unsqueeze(1).expand(-1, batch_size, -1)
x = torch.cat([x, extra_tokens], dim=0) # [height * width + num_extra_tokens, batch, embed_dim]
# Process through the transformer layers with recurrent GRU updates.
for attn, norm in zip(self.attention_layers, self.layer_norms):
residual = x
attn_out, _ = attn(x, x, x)
# Residual connection and layer normalization.
x = norm(residual + attn_out)
# --- Brain-inspired recurrent update ---
# Reshape tokens to apply GRUCell in parallel.
seq_len, batch, embed_dim = x.shape
x_flat = x.view(seq_len * batch, embed_dim)
# Use the same x_flat as both input and hidden state.
x_updated_flat = self.gru(x_flat, x_flat)
x = x_updated_flat.view(seq_len, batch, embed_dim)
# --- End recurrent update ---
# Rearrange back to [batch, sequence_length, embed_dim] and flatten.
x = x.permute(1, 0, 2).contiguous()
x = x.view(batch_size, -1)
# Final fully connected layer (classification head).
out = self.fc(x)
return out


# Example usage:
input_tensor = torch.rand(2, 3, 4, 4) # [batch=2, channels=3, height=4, width=4]
model = FourDimensionalTransformer(num_layers=16, embed_dim=7, num_heads=1, num_extra_tokens=16, num_classes=10)
output = model(input_tensor)
print("Output shape:", output.shape)
print("Output:", output)