import torch
import torch.nn as nn


class FourDimensionalTransformer(nn.Module):
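    """Transformer-style classifier: 1x1-conv pixel embeddings plus learned extra
    tokens, a stack of residual self-attention blocks, a token-wise GRUCell update,
    and a linear head over the flattened token sequence."""
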
    def __init__(self, num_layers=16, embed_dim=7, num_heads=1, num_extra_tokens=16, num_classes=10):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_extra_tokens = num_extra_tokens

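        # 1x1 convolution: project the 3 input channels to embed_dim channels at every spatial position.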
        self.embedding = nn.Conv2d(3, embed_dim, kernel_size=1)

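        # Learnable extra tokens that are appended to the spatial token sequence.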
        self.extra_tokens = nn.Parameter(torch.randn(num_extra_tokens, embed_dim))

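        # Stack of self-attention layers, each paired with its own LayerNorm.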
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
            for _ in range(num_layers)
        ])
        self.layer_norms = nn.ModuleList([
            nn.LayerNorm(embed_dim)
            for _ in range(num_layers)
        ])

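        # GRU cell applied token-wise after the attention stack.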
        self.gru = nn.GRUCell(embed_dim, embed_dim)

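        # Classification head over the flattened token sequence. The hard-coded 16 is the
        # number of spatial tokens, so this head assumes a 4x4 input like the demo below.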
        self.fc = nn.Linear((16 + num_extra_tokens) * embed_dim, num_classes)
    def forward(self, x):
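        # x: (batch, 3, height, width)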
        batch_size = x.size(0)

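        # (batch, 3, H, W) -> (batch, embed_dim, H, W)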
        x = self.embedding(x)

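        # Flatten the spatial grid into a token sequence: (seq_len, batch, embed_dim).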
        x = x.view(batch_size, self.embed_dim, -1).permute(2, 0, 1)

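        # Broadcast the learnable extra tokens across the batch and append them to the sequence.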
        extra_tokens = self.extra_tokens.unsqueeze(1).expand(-1, batch_size, -1)
        x = torch.cat([x, extra_tokens], dim=0)

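        # Residual self-attention blocks with post-norm: attend, add the residual, then normalize.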
        for attn, norm in zip(self.attention_layers, self.layer_norms):
            residual = x
            attn_out, _ = attn(x, x, x)
            x = norm(residual + attn_out)

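        # Token-wise GRU update, using each token embedding as both input and hidden state.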
        seq_len, batch, embed_dim = x.shape
        x_flat = x.view(seq_len * batch, embed_dim)
        x_updated_flat = self.gru(x_flat, x_flat)
        x = x_updated_flat.view(seq_len, batch, embed_dim)

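        # Concatenate all token embeddings per sample and classify.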
        x = x.permute(1, 0, 2).contiguous()
        x = x.view(batch_size, -1)
        out = self.fc(x)
        return out


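# Quick smoke test on a random batch of two 4x4 RGB inputs.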
input_tensor = torch.rand(2, 3, 4, 4)
model = FourDimensionalTransformer(num_layers=16, embed_dim=7, num_heads=1, num_extra_tokens=16, num_classes=10)
output = model(input_tensor)

print("Output shape:", output.shape)
print("Output:", output)