import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import triton

from .triton_flash_blocksparse_attn import (
    get_local_strided_sparse_attention_op,
    _get_sparse_attn_mask,
    blocksparse_flash_attn_padded_fwd,
    blocksparse_flash_attn_varlen_fwd,
)


Layout = Tuple[torch.LongTensor, torch.LongTensor]


def create_sparse_attn_mask(
    n_heads: int,
    max_seq_len: int,
    max_seq_len_k: int,
    dtype: torch.dtype,
    device: torch.device,
    BLOCK: int,
    local_blocks: int,
    vert_stride: int,
    homo_head: bool,
    return_dense: bool
) -> Tuple[Layout, torch.Tensor]:
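    """Build the block-sparse attention layout and block-level mask.

    Returns the per-head layout (a pair of block-index tensors consumed by the Triton
    kernels) and the dense 0/1 block mask describing the local + vertical-stride
    sparsity pattern.
    """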
    layout, block_sparse_pattern, _ = _get_sparse_attn_mask(
        n_heads=n_heads,
        q_len=max_seq_len,
        N_CTX=max_seq_len_k,
        dtype=dtype,
        device=device,
        BLOCK=BLOCK,
        local_blocks=local_blocks,
        vert_stride=vert_stride,
        homo_head=homo_head,
        return_dense=return_dense
    )
    return layout, block_sparse_pattern


class BlockSparseAttentionLayer(nn.Module):
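    """Block-sparse attention with a local + vertical-stride sparsity pattern.

    The layer dispatches to Triton flash-attention kernels and supports three paths:
    a dense-batch path (training and inference), a padded batched-inference path, and
    a variable-length (packed) inference path.
    """
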
    def __init__(
        self,
        n_heads: int,
        max_seq_len: int,
        sparse_block_size: int,
        local_blocks: int,
        vert_stride: int,
        kernel_block_size: Optional[int] = None,
        homo_head: bool = False,
        active_head_range: Optional[Tuple[int, int]] = None
    ) -> None:
        super().__init__()

        self.n_heads = n_heads
        self.max_seq_len = max_seq_len
        self.sparse_block_size = sparse_block_size
        self.kernel_block_size = kernel_block_size or sparse_block_size
        self.local_blocks = local_blocks
        self.vert_stride = vert_stride
        self.homo_head = homo_head
        self.active_head_range = active_head_range

        # Internal Parameters used by the layer
        self._sparse_block_mask = None
        self._sparse_layout = None
        self._dtype = None
        self._device = None

        # TODO(bapatra): Ideally, I'd want to keep all the code for
        # forward to be handled here, and not branch for training and inference.
        # However, that refactor would need a lot of testing. For now, using the
        # training op as is, and will refactor again later.
    
    def prune_blocksparse_layout_to_heads(self, h_start: int, h_end: int) -> None:
        # Restrict the cached block mask and the per-head layout index tensors to the
        # active slice of heads. The layout is rebuilt as a new tuple since tuples do
        # not support item assignment.
        self._sparse_block_mask = self._sparse_block_mask[h_start: h_end]
        self._sparse_layout = (
            self._sparse_layout[0][h_start: h_end],
            self._sparse_layout[1][h_start: h_end],
        )
    
    def _initialize_internals(
        self,
        dtype: torch.dtype,
        device: torch.device
    ) -> None:
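        """Build the sparse layout and block mask for the given dtype and device.

        Optionally prunes them to ``active_head_range`` and expands the block mask to
        the (possibly smaller) kernel block granularity.
        """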
        self._dtype, self._device = dtype, device
        self._sparse_layout, self._sparse_block_mask = create_sparse_attn_mask(
            n_heads=self.n_heads,
            max_seq_len=self.max_seq_len,
            max_seq_len_k=self.max_seq_len,
            dtype=dtype,
            device=device,
            BLOCK=self.sparse_block_size,
            local_blocks=self.local_blocks,
            vert_stride=self.vert_stride,
            homo_head=self.homo_head,
            return_dense=False,
        )
        if (not self.homo_head) and (self.active_head_range is not None):
            assert len(self.active_head_range) == 2, "\"active_head_range\" should be a (start, end) tuple of head indices."
            h_start, h_end = self.active_head_range
            self.prune_blocksparse_layout_to_heads(h_start=h_start, h_end=h_end)

        assert self.sparse_block_size % self.kernel_block_size == 0, f"The sparse block size ({self.sparse_block_size}) must be a multiple of the kernel block size ({self.kernel_block_size})."
        assert self.kernel_block_size >= 16 and math.log2(self.kernel_block_size) % 1 == 0, f"The kernel block size must be a power of 2 and at least 16, but {self.kernel_block_size} was given."
        if self.sparse_block_size // self.kernel_block_size > 1:
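            # Each coarse sparse block is expanded into a (_mul x _mul) grid of
            # kernel-sized blocks via a Kronecker product with an all-ones block,
            # and the causal constraint is then re-applied at the finer granularity.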
            _mul = self.sparse_block_size // self.kernel_block_size
            # need to consider if block_m and block_n are different
            self._sparse_block_mask = torch.kron(self._sparse_block_mask, self._sparse_block_mask.new_ones(_mul, _mul))
            num_sparse_blocks = self._sparse_block_mask.size(-1)
            block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
            self._sparse_block_mask *= block_causal_mask.type_as(self._sparse_block_mask)


    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        sm_scale: float,
        *,
        # Arguments related to padded (batched) inference
        left_paddings: Optional[torch.LongTensor] = None,
        seqlens: Optional[torch.LongTensor] = None,
        # Arguments related to variable-length inference
        cu_seqlens_k: Optional[torch.LongTensor] = None,
        cu_seqlens_q: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
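        """Compute block-sparse attention.

        Dispatch rules:
          * no padding / variable-length arguments: dense-batch path via the Triton op
            (the only path that supports autograd);
          * ``k``/``v`` are 3-dim and ``cu_seqlens_k`` is given: variable-length (packed) inference;
          * ``k``/``v`` are 4-dim and ``left_paddings`` or ``seqlens`` is given: padded batched inference.
        """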

        if left_paddings is None and seqlens is None and cu_seqlens_k is None and cu_seqlens_q is None:
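            # Fully dense batch (no padding or variable-length metadata): use the
            # local + strided block-sparse op built for this configuration;
            # inference=not self.training keeps the backward pass available in training.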
            blocksparse_op = get_local_strided_sparse_attention_op(
                n_heads=self.n_heads,
                max_seq_len=self.max_seq_len,
                sparse_block_size=self.sparse_block_size,
                kernel_block_size=self.kernel_block_size,
                local_blocks=self.local_blocks,
                vert_stride=self.vert_stride,
                homo_head=self.homo_head,
                device=q.device,
                inference=not self.training
            )
            return blocksparse_op(q, k, v, sm_scale)

        assert not torch.is_grad_enabled(), "Variable-length / batched inference is not supported during training. Please run it inside a torch.no_grad() context."
        # Lazily (re)build the sparse layout and block mask if they have not yet been
        # created for this dtype/device.
        if self._sparse_block_mask is None or (self._dtype != q.dtype) or (self._device != q.device):
            self._initialize_internals(dtype=q.dtype, device=q.device)
        
        if k.dim() == 3:
            assert cu_seqlens_k is not None
            return blocksparse_flash_attn_varlen_fwd(
                q=q,
                k=k,
                v=v,
                cu_seqlens_k=cu_seqlens_k,
                cu_seqlens_q=cu_seqlens_q,
                sm_scale=sm_scale,
                sparse_layout=self._sparse_layout,
                block_size=self.kernel_block_size,
                max_seqlen=self.max_seq_len,
            )
        if k.dim() == 4:
            assert not (left_paddings is None and seqlens is None), "Either left_paddings or seqlens must be provided for batched inference."
            return blocksparse_flash_attn_padded_fwd(
                q=q,
                k=k,
                v=v,
                sm_scale=sm_scale,
                sparse_layout=self._sparse_layout,
                left_paddings=left_paddings,
                seqlens=seqlens,
                block_size=self.kernel_block_size,
                max_seqlen=self.max_seq_len,
            )
        raise ValueError('q/k/v must be either 3-dim for variable-length input or 4-dim for fixed-length (padded) input.')
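

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API. Assumptions: a CUDA device
    # with Triton installed, fp16 tensors, and q/k/v shaped
    # (batch, n_heads, seq_len, head_dim) for the dense path.
    n_heads, seq_len, head_dim = 8, 1024, 64
    layer = BlockSparseAttentionLayer(
        n_heads=n_heads,
        max_seq_len=seq_len,
        sparse_block_size=64,
        local_blocks=4,
        vert_stride=8,
    ).eval()
    q = torch.randn(1, n_heads, seq_len, head_dim, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)
    with torch.no_grad():
        out = layer(q, k, v, sm_scale=1.0 / math.sqrt(head_dim))
    print(out.shape)  # expected: torch.Size([1, 8, 1024, 64])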