File size: 5,521 Bytes
4e1467d d97c361 4e1467d 0b6a10a 4e1467d 1bcfe48 c13ef0b 4e1467d c13ef0b 0b6a10a 4e1467d 0b6a10a 405f5b1 c13ef0b 4e1467d d97c361 0b6a10a 405f5b1 c13ef0b 4e1467d 405f5b1 c13ef0b 405f5b1 c13ef0b 405f5b1 4e1467d 405f5b1 0b6a10a 4e1467d c13ef0b 0b6a10a c13ef0b 405f5b1 c13ef0b 405f5b1 c13ef0b 405f5b1 2896dec 4e1467d 405f5b1 4e1467d 0b6a10a 4e1467d 0b6a10a d97c361 0b6a10a 4e1467d 0b6a10a 4e1467d 0b6a10a d97c361 0b6a10a d97c361 0b6a10a d97c361 0b6a10a d97c361 0b6a10a 405f5b1 1bcfe48 0b6a10a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import torch as t
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import wandb
from fancy_einsum import einsum
from einops import rearrange, repeat, reduce
from utils import OsSoluConfig
# TODO: Add hooks to the model.
# TODO: Add support for mixing dense and sparse attention.
class OsSoluModel(nn.Module):
    """An open-source implementation of a SoLU-based transformer.

    This is a GPT-style architecture model where the nonlinearity in the MLP
    block is replaced with SoLU(x) = x * softmax(x). The token embedding
    matrix is reused as the unembedding matrix (tied weights).
    """

    def __init__(self, config: OsSoluConfig) -> None:
        """Build the embeddings, the stack of transformer blocks and the final layer norm.

        Args:
            config: Model hyperparameters. This constructor reads
                ``max_positional_embeddings``, ``vocab_size``, ``d_model``,
                ``dropout``, ``num_blocks`` and ``ln_eps``.
        """
        super().__init__()
        self.config = config
        self.embed_positions = nn.Embedding(config.max_positional_embeddings, config.d_model)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        self.dropout = nn.Dropout(config.dropout)
        self.transformer_blocks = nn.ModuleList([GPT2Block(config) for _ in range(config.num_blocks)])
        self.final_ln = nn.LayerNorm(config.d_model, config.ln_eps)

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Map token ids to next-token logits.

        Args:
            x: Integer token ids of shape (batch, seq).

        Returns:
            Logits of shape (batch, seq, vocab).
        """
        # Learned absolute position embeddings for positions 0..seq-1; they
        # broadcast over the batch dimension when added to the token embeddings.
        positional_embeddings = self.embed_positions(t.arange(x.size(1), device=x.device))
        token_embeddings = self.embed_tokens(x)
        out = self.dropout(positional_embeddings + token_embeddings)
        for block in self.transformer_blocks:
            out = block(out)
        # Normalise the residual stream before projecting back to the vocabulary.
        out = self.final_ln(out)
        # Unembedding is not separate, so we just einsum with token embedding weights.
        out = einsum("vocab hidden, batch seq hidden -> batch seq vocab", self.embed_tokens.weight, out)
        return out
class SoLU(nn.Module):
    """Module form of SoLU(x) = x * softmax(x), so it can be dropped into a model as a layer."""

    def __init__(self):
        super().__init__()

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Return ``x`` gated elementwise by its own softmax over the last dimension."""
        gate = t.softmax(x, dim=-1)
        return x * gate
class GPT2Block(nn.Module):
    """One residual transformer block: an attention sublayer followed by an MLP sublayer."""

    def __init__(self, config: OsSoluConfig) -> None:
        """Create the block's submodules from ``config``.

        ``config.self_attention_type == "unidirectional"`` selects
        ``UnidirectionalAttention``; any other value selects ``RotaryAttention``.
        ``config.nonlinearity == "solu"`` selects ``SoLU``; any other value
        selects ``nn.ReLU``.
        """
        super().__init__()
        self.config = config
        width = config.d_model
        eps = config.ln_eps

        self.layer_norm1 = nn.LayerNorm(width, eps)

        if config.self_attention_type == "unidirectional":
            self.attention = UnidirectionalAttention(config)
        else:
            self.attention = RotaryAttention(config)

        if config.nonlinearity == "solu":
            activation = SoLU()
        else:
            activation = nn.ReLU()

        # The MLP carries its own layer norm as the first stage; the hidden
        # layer is four times as wide as the model dimension.
        self.MLP = nn.Sequential(
            nn.LayerNorm(width, eps),
            nn.Linear(width, 4 * width),
            activation,
            nn.Linear(4 * width, width),
            nn.Dropout(config.dropout),
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Apply the attention sublayer and then the MLP sublayer, each with a residual connection."""
        after_attention = x + self.attention(self.layer_norm1(x))
        return after_attention + self.MLP(after_attention)
class UnidirectionalAttention(nn.Module):
    """Multi-head causal self-attention: each position attends only to itself and earlier positions."""

    def __init__(self, config: OsSoluConfig) -> None:
        """Create the Q/K/V/output projections.

        Args:
            config: Model hyperparameters; ``num_heads`` and ``d_model`` are read.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        super().__init__()
        if config.d_model % config.num_heads != 0:
            # Fail here rather than with an opaque reshape error in the first forward pass.
            raise ValueError(
                f"d_model ({config.d_model}) must be divisible by num_heads ({config.num_heads})"
            )
        self.num_heads = config.num_heads
        self.d_model = config.d_model
        self.project_q = nn.Linear(config.d_model, config.d_model)
        self.project_k = nn.Linear(config.d_model, config.d_model)
        self.project_v = nn.Linear(config.d_model, config.d_model)
        self.project_out = nn.Linear(config.d_model, config.d_model)
        # Score written into masked (future) positions before the softmax.
        self.LARGE_NEGATIVE_VALUE = -1e5

    def hidden_to_heads(self, tensor: t.Tensor) -> t.Tensor:
        """Split the hidden dimension into heads: (b, s, nh * hs) -> (b, nh, s, hs)."""
        return rearrange(tensor, "b s (nh hs) -> b nh s hs", nh=self.num_heads)

    def compute_pre_softmax_attn_pattern(self, x: t.Tensor) -> t.Tensor:
        """Return the raw (unscaled, unmasked) query-key dot products.

        Args:
            x: Input of shape (batch, seq, d_model).

        Returns:
            Tensor of shape (batch, num_heads, seq_q, seq_k).
        """
        Q = self.hidden_to_heads(self.project_q(x))
        K = self.hidden_to_heads(self.project_k(x))
        attention_pattern = einsum(
            "batch num_heads seqlen_q head_size, "
            "batch num_heads seqlen_k head_size ->"
            "batch num_heads seqlen_q seqlen_k",
            Q, K)
        return attention_pattern

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Apply causal multi-head self-attention.

        Args:
            x: Input of shape (batch, seq, d_model).

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        seqlen = x.size(1)
        head_size = self.d_model // self.num_heads

        # Scale with a plain Python float: no per-call tensor allocation and no
        # reliance on integer-tensor sqrt promotion.
        attention_pattern = self.compute_pre_softmax_attn_pattern(x) / head_size ** 0.5

        # Masking attention. Since GPT is unidirectional, it should only attend to previous tokens.
        # future_mask[q, k] is True where key position k lies after query position q.
        positions = t.arange(seqlen, device=x.device)
        future_mask = positions.unsqueeze(1) < positions.unsqueeze(0)
        # Clamp the fill value to what the score dtype can represent
        # (-1e5 is out of range for float16).
        fill_value = max(self.LARGE_NEGATIVE_VALUE, t.finfo(attention_pattern.dtype).min)
        # Out-of-place fill; the (seq, seq) mask broadcasts over batch and heads.
        attention_pattern = attention_pattern.masked_fill(future_mask, fill_value)

        attention_score = attention_pattern.softmax(dim=-1)
        V = self.hidden_to_heads(self.project_v(x))
        out = einsum(
            "batch num_heads seqlen_q seqlen_k,"
            "batch num_heads seqlen_k head_size ->"
            "batch num_heads seqlen_q head_size",
            attention_score, V)
        # Merge the heads back into a single hidden dimension.
        out = rearrange(out, "b nh s hs -> b s (nh hs)")
        return self.project_out(out)
class RotaryAttention(nn.Module):
    """Placeholder for rotary self-attention; the forward pass is not implemented yet."""

    def __init__(self, config: OsSoluConfig) -> None:
        """Store ``config``; no submodules are created yet."""
        super().__init__()
        self.config = config

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Not implemented.

        Raises:
            NotImplementedError: Always. Raising here avoids silently returning
                ``None``, which would only fail later inside the caller's
                arithmetic.
        """
        # TODO: implement rotary self-attention
        raise NotImplementedError("RotaryAttention.forward is not implemented yet")
class LayerNorm(nn.Module):
    """Placeholder for a custom layer norm; the forward pass is not implemented yet."""

    def __init__(self, config: OsSoluConfig) -> None:
        """Store ``config``; no parameters are created yet."""
        super().__init__()
        self.config = config

    def forward(self, x: t.Tensor) -> t.Tensor:
        """Not implemented.

        Raises:
            NotImplementedError: Always. Raising here avoids silently returning
                ``None`` to the caller.
        """
        # TODO: implement layernorm with hooks on normalisation only.
        raise NotImplementedError("LayerNorm.forward is not implemented yet")