Spaces:
Sleeping
Sleeping
# ------------------------------------------------------------------------
# Copyright (c) 2023-present, BAAI. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ------------------------------------------------------------------------
"""Text decoder.""" | |
try: | |
from flash_attn import flash_attn_func | |
from flash_attn import flash_attn_with_kvcache | |
from flash_attn.layers.rotary import apply_rotary_emb | |
except ImportError: | |
flash_attn_func = None | |
flash_attn_with_kvcache = None | |
apply_rotary_emb = None | |
import torch | |
from torch import nn | |
class TransformerCache(nn.Module):
    """Transformer cache module.

    Holds every cached tensor in ``cache_dict`` (a plain ``dict``), under
    these keys:

    * ``"seq_lens"``: int32 vector filled with the current start position.
    * ``"cos"`` / ``"sin"``: full rotary tables of shape
      ``(seq_len, dim // 2)``.
    * ``"seq_cos"`` / ``"seq_sin"``: slices of the rotary tables selected by
      the latest ``set_seq`` call.
    * ``f"{id(mixer)}_k"`` / ``f"{id(mixer)}_v"``: key/value buffers for one
      attention layer, keyed by the layer object's ``id``.
    """

    def __init__(self, device=None, dtype=None):
        """Create an empty cache whose tensors will use ``device``/``dtype``."""
        super(TransformerCache, self).__init__()
        self.device = device
        self.dtype = dtype
        self.start_pos = 0
        self.cache_dict = {}

    def init_seq(self, max_batch_size):
        """Allocate the zero-filled int32 ``"seq_lens"`` vector."""
        seq_lens = torch.zeros(max_batch_size, dtype=torch.int32, device=self.device)
        self.cache_dict["seq_lens"] = seq_lens

    def init_rotary(self, seq_len, dim, theta=10000.0):
        """Precompute the rotary cos/sin tables.

        Args:
            seq_len: Number of positions (rows) in the tables.
            dim: Rotary dimension; the tables have ``dim // 2`` columns.
            theta: Base of the frequency schedule.
        """
        # Column vector of positions 0..seq_len-1.
        grid = torch.arange(seq_len, dtype=torch.float32).unsqueeze_(-1)
        # theta ** (2i / dim) for i in [0, dim // 2); inverted just below.
        freq = torch.pow(theta, torch.arange(0, dim, 2)[: dim // 2].float().div_(dim))
        broadcast_freq = grid.mul(freq.reciprocal_().unsqueeze_(0))
        cache_cos = broadcast_freq.cos().view((-1, dim // 2))
        cache_sin = broadcast_freq.sin().view((-1, dim // 2))
        self.cache_dict["cos"] = cache_cos.to(self.device, self.dtype)
        self.cache_dict["sin"] = cache_sin.to(self.device, self.dtype)

    def init_kv(self, mixer, kv_size):
        """Allocate zero-filled key/value buffers of shape ``kv_size`` for ``mixer``."""
        cache_k = torch.zeros(*kv_size, dtype=self.dtype, device=self.device)
        cache_v = torch.zeros(*kv_size, dtype=self.dtype, device=self.device)
        self.cache_dict[f"{id(mixer)}_k"] = cache_k
        self.cache_dict[f"{id(mixer)}_v"] = cache_v

    def set_seq(self, start_pos=0, end_pos=None):
        """Select the position window used by the next forward pass.

        Args:
            start_pos: First position of the window; also written into every
                entry of ``"seq_lens"`` when that vector exists.
            end_pos: Exclusive end of the window. ``None`` selects everything
                from ``start_pos`` to the end of the rotary tables.
        """
        self.start_pos = start_pos
        if "seq_lens" in self.cache_dict:
            self.cache_dict["seq_lens"].fill_(start_pos)
        if "cos" in self.cache_dict:
            # Re-slice on every call. Skipping this when ``end_pos`` is None
            # would leave the previous call's window in place, so the rotary
            # phases would no longer match the ``start_pos`` stored above.
            window = slice(start_pos, end_pos)
            self.cache_dict["seq_cos"] = self.cache_dict["cos"][window]
            self.cache_dict["seq_sin"] = self.cache_dict["sin"][window]

    def forward_rotary(self, q, k, inplace=False):
        """Apply rotary embedding to ``q`` and ``k``.

        Prefers the ``"seq_cos"``/``"seq_sin"`` window and falls back to the
        full tables. Returns ``q`` and ``k`` untouched when no table exists.
        """
        cos = self.cache_dict.get("seq_cos", self.cache_dict.get("cos", None))
        sin = self.cache_dict.get("seq_sin", self.cache_dict.get("sin", None))
        if cos is None or sin is None:
            return q, k
        # NOTE(review): ``apply_rotary_emb`` comes from flash_attn and is None
        # when that package failed to import - confirm it is installed.
        q = apply_rotary_emb(q, cos, sin, interleaved=True, inplace=inplace)
        k = apply_rotary_emb(k, cos, sin, interleaved=True, inplace=inplace)
        return q, k

    def forward_flash(self, mixer, q, k, v):
        """Run causal flash attention for ``mixer``.

        Without key/value buffers for ``mixer`` this calls ``flash_attn_func``,
        using ``mixer.dropout.p`` as dropout when ``mixer`` is in training mode
        and 0 otherwise. With buffers it calls ``flash_attn_with_kvcache``,
        passing the first ``q.shape[0]`` entries of ``"seq_lens"`` as
        ``cache_seqlens``.
        """
        cache_k = self.cache_dict.get(f"{id(mixer)}_k", None)
        cache_v = self.cache_dict.get(f"{id(mixer)}_v", None)
        flash_args = {"softmax_scale": mixer.scale, "causal": True}
        if cache_k is None or cache_v is None:
            flash_args["dropout_p"] = mixer.dropout.p if mixer.training else 0
            return flash_attn_func(q, k, v, **flash_args)
        # NOTE(review): presumably the kernel writes ``k``/``v`` into the cache
        # at the offsets in ``cache_seqlens`` - confirm against flash_attn docs.
        flash_args["cache_seqlens"] = self.cache_dict["seq_lens"][: q.shape[0]]
        return flash_attn_with_kvcache(q, cache_k, cache_v, k, v, **flash_args)
class Attention(nn.Module):
    """Self-Attention layer.

    Projects the input to packed q/k/v with one linear layer, then delegates
    rotary embedding and the attention kernel to ``self.cache``. ``self.cache``
    starts as a bare ``nn.Module`` placeholder that has neither
    ``forward_rotary`` nor ``forward_flash``; it has to be replaced with a
    real cache object before ``forward`` is called.
    """

    def __init__(self, dim, num_heads, bias=True):
        """Build the projections.

        Args:
            dim: Input and output feature size.
            num_heads: Number of attention heads; must divide ``dim``.
            bias: Whether both linear layers carry a bias.

        Raises:
            ValueError: If ``dim`` is not divisible by ``num_heads``.
        """
        super(Attention, self).__init__()
        if dim % num_heads != 0:
            # Otherwise 3 * num_heads * head_dim != 3 * dim, and the
            # ``view(-1, ...)`` in ``forward`` either fails late or silently
            # folds the batch axis into the wrong shape.
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})."
            )
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)
        self.head_dim = dim // num_heads
        self.num_heads = num_heads
        self.scale = self.head_dim**-0.5
        self.dropout = nn.Dropout(0.1, inplace=False)
        self.cache = nn.Module()

    def forward(self, x):
        """Return the attention output for ``x``.

        ``x.size(1)`` is used as the sequence axis of the reshaped q/k/v.
        """
        # Packed projection -> (-1, seq, 3, heads, head_dim), split on axis 2.
        qkv_shape = (-1, x.size(1), 3, self.num_heads, self.head_dim)
        q, k, v = self.qkv(x).view(qkv_shape).unbind(dim=2)
        q, k = self.cache.forward_rotary(q, k, inplace=True)
        o = self.cache.forward_flash(self, q, k, v)
        # Merge the trailing (heads, head_dim) axes before the output projection.
        return self.proj(o.flatten(2))
class MLP(nn.Module):
    """Feed-forward network: linear, GELU, linear."""

    def __init__(self, dim, mlp_dim, bias=True):
        """Create ``fc1`` (dim -> mlp_dim), ``fc2`` (mlp_dim -> dim) and the GELU."""
        super().__init__()
        self.fc1 = nn.Linear(dim, mlp_dim, bias=bias)
        self.fc2 = nn.Linear(mlp_dim, dim, bias=bias)
        self.activation = nn.GELU()

    def forward(self, x):
        """Expand with ``fc1``, apply the activation, project back with ``fc2``."""
        hidden = self.fc1(x)
        hidden = self.activation(hidden)
        return self.fc2(hidden)
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual add."""

    def __init__(self, dim, num_heads, mlp_dim, bias=True):
        """Create the attention, MLP, two layer norms and a shared dropout."""
        super().__init__()
        self.attn = Attention(dim, num_heads, bias=bias)
        self.mlp = MLP(dim, mlp_dim, bias=bias)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(0.1, inplace=True)

    def forward(self, x):
        """Apply both residual branches and return the result."""
        # The dropout is in-place and the residual is accumulated into the
        # branch output with ``add_``, so each branch reuses its own tensor.
        attn_out = self.dropout(self.attn(self.norm1(x)))
        x = attn_out.add_(x)
        mlp_out = self.dropout(self.mlp(self.norm2(x)))
        return mlp_out.add_(x)
class Transformer(nn.Module):
    """Causal transformer decoder.

    ``forward`` reads ``self.cache``, which ``__init__`` does not create; it
    must be assigned before the first call.
    """

    def __init__(self, depth, dim, num_heads, mlp_dim, vocab_size):
        """Build the token embedding, ``depth`` blocks, final norm and vocab head."""
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.vocab_size = vocab_size
        self.tok_embeddings = nn.Embedding(vocab_size, dim)
        self.blocks = nn.ModuleList(Block(dim, num_heads, mlp_dim) for _ in range(depth))
        self.norm = nn.LayerNorm(dim)
        self.text_proj = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, prompts, tokens, start_pos=0):
        """Return float logits over the vocabulary for the text positions.

        Args:
            prompts: Prompt embeddings, concatenated in front of the token
                embeddings along axis 1 when ``start_pos`` is not positive.
            tokens: Token ids fed to ``tok_embeddings``.
            start_pos: Text position at which this call starts.
        """
        prompt_len = prompts.size(1)
        continuing = start_pos > 0
        if continuing:
            # The prompt was consumed by an earlier call: offset the window
            # by its length and keep every output position.
            start_pos = start_pos + prompt_len
            end_pos = start_pos + tokens.size(1)
            first_text = 0
        else:
            # First call: the prompt is part of this sequence, and its
            # positions are dropped from the output.
            end_pos = start_pos + tokens.size(1) + prompt_len
            first_text = prompt_len
        self.cache.set_seq(start_pos, end_pos)
        x = self.tok_embeddings(tokens)
        if not continuing:
            x = torch.cat([prompts, x], dim=1)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x[:, first_text:])
        return self.text_proj(x).float()
class TextDecoder(nn.Module):
    """Module to decode texts.

    Wraps a linear prompt encoder and a causal ``Transformer``. Call
    ``reset_cache`` before the first forward pass: it creates the cache
    object that the transformer and its attention layers read.
    """

    def __init__(
        self,
        depth,
        embed_dim,
        num_heads,
        mlp_ratio,
        prompt_embed_dim,
        max_seq_len,
        vocab_size,
    ):
        """Build the prompt encoder and the transformer.

        Args:
            depth: Number of transformer blocks.
            embed_dim: Transformer feature size.
            num_heads: Number of attention heads.
            mlp_ratio: MLP hidden size as a multiple of ``embed_dim``; the
                product is truncated to an int.
            prompt_embed_dim: Feature size of the incoming prompt tokens.
            max_seq_len: Default sequence length used by ``reset_cache``.
            vocab_size: Vocabulary size of the embedding and output head.
        """
        super(TextDecoder, self).__init__()
        self.max_seq_len = max_seq_len
        self.max_text_len = self.max_seq_len - 1
        self.encoder = nn.Linear(prompt_embed_dim, embed_dim, bias=False)
        self.transformer = Transformer(
            depth=depth,
            dim=embed_dim,
            # nn.Linear needs integer sizes; a float ratio such as 4.0 would
            # otherwise produce a float hidden size.
            mlp_dim=int(embed_dim * mlp_ratio),
            num_heads=num_heads,
            vocab_size=vocab_size,
        )

    def reset_cache(self, max_batch_size=1, max_seq_len=None):
        """Create a fresh cache and attach it to the transformer and its layers.

        Args:
            max_batch_size: Batch capacity of the sequence-length vector and
                of the key/value buffers.
            max_seq_len: Length of the rotary tables and key/value buffers;
                defaults to ``self.max_seq_len``.
        """
        device, dtype = self.encoder.weight.device, self.encoder.weight.dtype
        max_seq_len = self.max_seq_len if max_seq_len is None else max_seq_len
        num_heads, head_dim = self.transformer.num_heads, self.transformer.head_dim
        cache = TransformerCache(device=device, dtype=dtype)
        self.transformer.cache = cache
        cache.init_seq(max_batch_size)
        cache.init_rotary(max_seq_len, head_dim, theta=10000.0)
        kv_cache_size = (max_batch_size, max_seq_len, num_heads, head_dim)
        # Key/value buffers are allocated only outside training mode.
        use_kv_cache = not self.training
        for blk in self.transformer.blocks:
            # Writing through ``__dict__`` bypasses ``nn.Module.__setattr__``,
            # so the shared cache is not registered as a submodule of every
            # attention layer.
            blk.attn.__dict__["cache"] = cache
            if use_kv_cache:
                cache.init_kv(blk.attn, kv_cache_size)

    def get_prompts(self, prompt_tokens):
        """Project ``prompt_tokens`` with the linear encoder."""
        return self.encoder(prompt_tokens)

    def get_outputs(self, inputs, start_pos=0):
        """Run the transformer on ``inputs["prompts"]`` and ``inputs["tokens"]``.

        Returns:
            A dict with the transformer output under ``"text_pred"``.
        """
        return {"text_pred": self.transformer(inputs["prompts"], inputs["tokens"], start_pos)}

    def forward(self, inputs, start_pos=0):
        """Alias for ``get_outputs``."""
        return self.get_outputs(inputs, start_pos)