radna committed on
Commit
82b99e4
1 Parent(s): 2a742fb

Update triton_flash_atn.py

Files changed (1)
  1. triton_flash_atn.py +184 -493
triton_flash_atn.py CHANGED
@@ -25,27 +25,17 @@ import triton.language as tl
  # TORCH_HAS_FP8E4B8 = hasattr(torch, 'float8_e4m3fnuz')
 
  # AMD E5M2B16
- TORCH_HAS_FP8E5B16 = hasattr(torch, "float8_e5m2fnuz")
+ TORCH_HAS_FP8E5B16 = hasattr(torch, 'float8_e5m2fnuz')
 
 
  @triton.jit
- def _attn_fwd_inner(
- acc,
- l_i,
- m_i,
- q,
- K_block_ptr,
- V_block_ptr,
- start_m,
- BLOCK_M: tl.constexpr,
- BLOCK_DMODEL: tl.constexpr,
- BLOCK_N: tl.constexpr,
- STAGE: tl.constexpr,
- offs_m: tl.constexpr,
- offs_n: tl.constexpr,
- N_CTX,
- pre_load_v: tl.constexpr,
- ):
+ def _attn_fwd_inner(acc, l_i, m_i, q,
+ K_block_ptr, V_block_ptr,
+ start_m,
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr,
+ STAGE: tl.constexpr, offs_m: tl.constexpr, offs_n: tl.constexpr,
+ N_CTX,
+ pre_load_v: tl.constexpr):
  # range of values handled by this stage
  if STAGE == 1:
  lo, hi = 0, start_m * BLOCK_M
@@ -93,119 +83,37 @@ def _attn_fwd_inner
  # re-tuning.
  @triton.autotune(
  configs=[
- triton.Config(
- {
- "BLOCK_M": 64,
- "BLOCK_N": 16,
- "waves_per_eu": 2,
- "slice_k_tile": 0,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=2,
- ),
- triton.Config(
- {
- "BLOCK_M": 64,
- "BLOCK_N": 16,
- "waves_per_eu": 2,
- "slice_k_tile": 32,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=2,
- ),
- triton.Config(
- {
- "BLOCK_M": 32,
- "BLOCK_N": 32,
- "waves_per_eu": 2,
- "slice_k_tile": 0,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=1,
- ),
- triton.Config(
- {
- "BLOCK_M": 32,
- "BLOCK_N": 32,
- "waves_per_eu": 2,
- "slice_k_tile": 32,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=1,
- ),
- triton.Config(
- {
- "BLOCK_M": 64,
- "BLOCK_N": 32,
- "waves_per_eu": 2,
- "slice_k_tile": 0,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=2,
- ),
- triton.Config(
- {
- "BLOCK_M": 32,
- "BLOCK_N": 16,
- "waves_per_eu": 3,
- "slice_k_tile": 0,
- "pre_load_v": True,
- },
- num_stages=1,
- num_warps=1,
- ),
- triton.Config(
- {
- "BLOCK_M": 32,
- "BLOCK_N": 16,
- "waves_per_eu": 3,
- "slice_k_tile": 0,
- "pre_load_v": False,
- },
- num_stages=1,
- num_warps=1,
- ),
+ triton.Config({'BLOCK_M': 64, 'BLOCK_N': 16, 'waves_per_eu': 2,
+ 'slice_k_tile': 0, 'pre_load_v': False}, num_stages=1, num_warps=2),
+ triton.Config({'BLOCK_M': 64, 'BLOCK_N': 16, 'waves_per_eu': 2,
+ 'slice_k_tile': 32, 'pre_load_v': False}, num_stages=1, num_warps=2),
+ triton.Config({'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 2,
+ 'slice_k_tile': 0, 'pre_load_v': False}, num_stages=1, num_warps=1),
+ triton.Config({'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 2,
+ 'slice_k_tile': 32, 'pre_load_v': False}, num_stages=1, num_warps=1),
+ triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'waves_per_eu': 2,
+ 'slice_k_tile': 0, 'pre_load_v': False}, num_stages=1, num_warps=2),
+ triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 3,
+ 'slice_k_tile': 0, 'pre_load_v': True}, num_stages=1, num_warps=1),
+ triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 3,
+ 'slice_k_tile': 0, 'pre_load_v': False}, num_stages=1, num_warps=1),
  ],
- key=["Z", "H", "N_CTX", "STAGE", "BLOCK_DMODEL"],
+ key=['Z', 'H', 'N_CTX', 'STAGE', 'BLOCK_DMODEL'],
  )
  @triton.jit
- def _attn_fwd(
- Q,
- K,
- V,
- sm_scale,
- M,
- Out,
- stride_qz,
- stride_qh,
- stride_qm,
- stride_qk,
- stride_kz,
- stride_kh,
- stride_kn,
- stride_kk,
- stride_vz,
- stride_vh,
- stride_vk,
- stride_vn,
- stride_oz,
- stride_oh,
- stride_om,
- stride_on,
- Z,
- H,
- N_CTX,
- BLOCK_DMODEL: tl.constexpr,
- STAGE: tl.constexpr,
- BLOCK_M: tl.constexpr,
- BLOCK_N: tl.constexpr,
- pre_load_v: tl.constexpr,
- ):
+ def _attn_fwd(Q, K, V, sm_scale, M, Out,
+ stride_qz, stride_qh, stride_qm, stride_qk,
+ stride_kz, stride_kh, stride_kn, stride_kk,
+ stride_vz, stride_vh, stride_vk, stride_vn,
+ stride_oz, stride_oh, stride_om, stride_on,
+ Z, H,
+ N_CTX,
+ BLOCK_DMODEL: tl.constexpr,
+ STAGE: tl.constexpr,
+ BLOCK_M: tl.constexpr,
+ BLOCK_N: tl.constexpr,
+ pre_load_v: tl.constexpr,
+ ):
  start_m = tl.program_id(0)
  off_hz = tl.program_id(1)
  qvk_offset = off_hz * stride_qh
@@ -261,45 +169,23 @@ def _attn_fwd(
  # For causal = True, STAGE = 3 and _attn_fwd_inner gets 1 as its STAGE
  # For causal = False, STAGE = 1, and _attn_fwd_inner gets 3 as its STAGE
  if STAGE & 1:
- acc, l_i, m_i = _attn_fwd_inner(
- acc,
- l_i,
- m_i,
- q,
- K_block_ptr,
- V_block_ptr,
- start_m,
- BLOCK_M,
- BLOCK_DMODEL,
- BLOCK_N,
- 4 - STAGE,
- offs_m,
- offs_n,
- N_CTX,
- pre_load_v,
- )
+ acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_block_ptr, V_block_ptr,
+ start_m,
+ BLOCK_M, BLOCK_DMODEL, BLOCK_N,
+ 4 - STAGE, offs_m, offs_n, N_CTX,
+ pre_load_v,
+ )
  # stage 2: on-band
  if STAGE & 2:
  # barrier makes it easier for compielr to schedule the
  # two loops independently
  tl.debug_barrier()
- acc, l_i, m_i = _attn_fwd_inner(
- acc,
- l_i,
- m_i,
- q,
- K_block_ptr,
- V_block_ptr,
- start_m,
- BLOCK_M,
- BLOCK_DMODEL,
- BLOCK_N,
- 2,
- offs_m,
- offs_n,
- N_CTX,
- pre_load_v,
- )
+ acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_block_ptr, V_block_ptr,
+ start_m,
+ BLOCK_M, BLOCK_DMODEL, BLOCK_N,
+ 2, offs_m, offs_n, N_CTX,
+ pre_load_v,
+ )
  # epilogue
  # write back m
  acc = acc / l_i[:, None]
@@ -309,46 +195,36 @@ def _attn_fwd(
 
 
  @triton.jit
- def _attn_bwd_preprocess(
- O, DO, Delta, Z, H, N_CTX, BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr
- ):
+ def _attn_bwd_preprocess(O, DO,
+ Delta,
+ Z, H, N_CTX,
+ BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr
+ ):
  off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
  off_hz = tl.program_id(1)
  off_n = tl.arange(0, D_HEAD)
- o = tl.load(O + off_hz * D_HEAD * N_CTX + off_m[:, None] * D_HEAD + off_n[None, :])
- do = tl.load(
- DO + off_hz * D_HEAD * N_CTX + off_m[:, None] * D_HEAD + off_n[None, :]
- ).to(tl.float32)
+ o = tl.load(O + off_hz * D_HEAD * N_CTX +
+ off_m[:, None] * D_HEAD + off_n[None, :])
+ do = tl.load(DO + off_hz * D_HEAD * N_CTX +
+ off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32)
  delta = tl.sum(o * do, axis=1)
  tl.store(Delta + off_hz * N_CTX + off_m, delta)
 
 
  # The main inner-loop logic for computing dK and dV.
  @triton.jit
- def _attn_bwd_dkdv(
- dk,
- dv,
- Q,
- k,
- v,
- sm_scale,
- DO,
- M,
- D,
- # shared by Q/K/V/DO.
- stride_tok,
- stride_d,
- H,
- N_CTX,
- BLOCK_M1: tl.constexpr,
- BLOCK_N1: tl.constexpr,
- BLOCK_DMODEL: tl.constexpr,
- # Filled in by the wrapper.
- start_n,
- start_m,
- num_steps,
- MASK: tl.constexpr,
- ):
+ def _attn_bwd_dkdv(dk, dv,
+ Q, k, v, sm_scale,
+ DO,
+ M, D,
+ # shared by Q/K/V/DO.
+ stride_tok, stride_d,
+ H, N_CTX, BLOCK_M1: tl.constexpr,
+ BLOCK_N1: tl.constexpr,
+ BLOCK_DMODEL: tl.constexpr,
+ # Filled in by the wrapper.
+ start_n, start_m, num_steps,
+ MASK: tl.constexpr):
  offs_m = start_m + tl.arange(0, BLOCK_M1)
  offs_n = start_n + tl.arange(0, BLOCK_N1)
  offs_k = tl.arange(0, BLOCK_DMODEL)
@@ -358,7 +234,7 @@ def _attn_bwd_dkdv(
  strides=(stride_d, stride_tok),
  offsets=(0, start_m),
  block_shape=(BLOCK_DMODEL, BLOCK_M1),
- order=(0, 1),
+ order=(0, 1)
  )
  DO_block_ptr = tl.make_block_ptr(
  base=DO,
@@ -366,7 +242,7 @@ def _attn_bwd_dkdv(
  strides=(stride_tok, stride_d),
  offsets=(start_m, 0),
  block_shape=(BLOCK_M1, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
  # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
  tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
@@ -381,7 +257,7 @@ def _attn_bwd_dkdv(
  pT = tl.math.exp2(qkT - m[None, :])
  # Autoregressive masking.
  if MASK:
- mask = offs_m[None, :] >= offs_n[:, None]
+ mask = (offs_m[None, :] >= offs_n[:, None])
  pT = tl.where(mask, pT, 0.0)
  do = tl.load(DO_block_ptr)
  # Compute dV.
@@ -404,28 +280,17 @@ def _attn_bwd_dkdv(
 
  # the main inner-loop logic for computing dQ
  @triton.jit
- def _attn_bwd_dq(
- dq,
- q,
- K,
- V,
- do,
- m,
- D,
- # shared by Q/K/V/DO.
- stride_tok,
- stride_d,
- H,
- N_CTX,
- BLOCK_M2: tl.constexpr,
- BLOCK_N2: tl.constexpr,
- BLOCK_DMODEL: tl.constexpr,
- # Filled in by the wrapper.
- start_m,
- start_n,
- num_steps,
- MASK: tl.constexpr,
- ):
+ def _attn_bwd_dq(dq, q, K, V,
+ do, m, D,
+ # shared by Q/K/V/DO.
+ stride_tok, stride_d,
+ H, N_CTX,
+ BLOCK_M2: tl.constexpr,
+ BLOCK_N2: tl.constexpr,
+ BLOCK_DMODEL: tl.constexpr,
+ # Filled in by the wrapper.
+ start_m, start_n, num_steps,
+ MASK: tl.constexpr):
  offs_m = start_m + tl.arange(0, BLOCK_M2)
  offs_n = start_n + tl.arange(0, BLOCK_N2)
  offs_k = tl.arange(0, BLOCK_DMODEL)
@@ -435,7 +300,7 @@ def _attn_bwd_dq(
  strides=(stride_d, stride_tok),
  offsets=(0, start_n),
  block_shape=(BLOCK_DMODEL, BLOCK_N2),
- order=(0, 1),
+ order=(0, 1)
  )
  VT_block_ptr = tl.make_block_ptr(
  base=V,
@@ -443,7 +308,7 @@ def _attn_bwd_dq(
  strides=(stride_d, stride_tok),
  offsets=(0, start_n),
  block_shape=(BLOCK_DMODEL, BLOCK_N2),
- order=(0, 1),
+ order=(0, 1)
  )
  # D (= delta) is pre-divided by ds_scale.
  Di = tl.load(D + offs_m)
@@ -458,7 +323,7 @@ def _attn_bwd_dq(
  # Autoregressive masking.
  if MASK:
  offs_n = curr_n + tl.arange(0, BLOCK_N2)
- mask = offs_m[:, None] >= offs_n[None, :]
+ mask = (offs_m[:, None] >= offs_n[None, :])
  p = tl.where(mask, p, 0.0)
  # Compute dP and dS.
  vT = tl.load(VT_block_ptr)
@@ -477,135 +342,42 @@ def _attn_bwd_dq(
 
  @triton.autotune(
  configs=[
- triton.Config(
- {
- "BLOCK_M1": 32,
- "BLOCK_N1": 64,
- "BLOCK_M2": 64,
- "BLOCK_N2": 32,
- "BLK_SLICE_FACTOR": 1,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 32,
- "BLOCK_N1": 64,
- "BLOCK_M2": 64,
- "BLOCK_N2": 32,
- "BLK_SLICE_FACTOR": 2,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 64,
- "BLOCK_N1": 128,
- "BLOCK_M2": 128,
- "BLOCK_N2": 64,
- "BLK_SLICE_FACTOR": 1,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 64,
- "BLOCK_N1": 128,
- "BLOCK_M2": 128,
- "BLOCK_N2": 64,
- "BLK_SLICE_FACTOR": 2,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 64,
- "BLOCK_N1": 64,
- "BLOCK_M2": 64,
- "BLOCK_N2": 64,
- "BLK_SLICE_FACTOR": 1,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 64,
- "BLOCK_N1": 64,
- "BLOCK_M2": 64,
- "BLOCK_N2": 64,
- "BLK_SLICE_FACTOR": 2,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 32,
- "BLOCK_N1": 128,
- "BLOCK_M2": 128,
- "BLOCK_N2": 32,
- "BLK_SLICE_FACTOR": 1,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 32,
- "BLOCK_N1": 128,
- "BLOCK_M2": 128,
- "BLOCK_N2": 32,
- "BLK_SLICE_FACTOR": 2,
- },
- num_stages=1,
- num_warps=4,
- ),
- triton.Config(
- {
- "BLOCK_M1": 32,
- "BLOCK_N1": 128,
- "BLOCK_M2": 128,
- "BLOCK_N2": 32,
- "BLK_SLICE_FACTOR": 2,
- },
- num_stages=1,
- num_warps=8,
- ),
+ triton.Config({'BLOCK_M1': 32, 'BLOCK_N1': 64, 'BLOCK_M2': 64, 'BLOCK_N2': 32, 'BLK_SLICE_FACTOR': 1},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 32, 'BLOCK_N1': 64, 'BLOCK_M2': 64, 'BLOCK_N2': 32, 'BLK_SLICE_FACTOR': 2},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'BLK_SLICE_FACTOR': 1},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 64, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 64, 'BLK_SLICE_FACTOR': 2},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 64, 'BLOCK_N1': 64, 'BLOCK_M2': 64, 'BLOCK_N2': 64, 'BLK_SLICE_FACTOR': 1},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 64, 'BLOCK_N1': 64, 'BLOCK_M2': 64, 'BLOCK_N2': 64, 'BLK_SLICE_FACTOR': 2},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 32, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 32, 'BLK_SLICE_FACTOR': 1},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 32, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 32, 'BLK_SLICE_FACTOR': 2},
+ num_stages=1, num_warps=4),
+ triton.Config({'BLOCK_M1': 32, 'BLOCK_N1': 128, 'BLOCK_M2': 128, 'BLOCK_N2': 32, 'BLK_SLICE_FACTOR': 2},
+ num_stages=1, num_warps=8),
  ],
- key=["H", "N_CTX", "BLOCK_DMODEL"],
+ key=['H', 'N_CTX', 'BLOCK_DMODEL'],
  )
  @triton.jit
- def _attn_bwd(
- Q,
- K,
- V,
- sm_scale,
- DO,
- DQ,
- DK,
- DV,
- M,
- D,
- # shared by Q/K/V/DO.
- stride_z,
- stride_h,
- stride_tok,
- stride_d,
- # H = 16, N_CTX = 1024
- H,
- N_CTX,
- BLOCK_DMODEL: tl.constexpr,
- BLOCK_M1: tl.constexpr,
- BLOCK_N1: tl.constexpr,
- BLOCK_M2: tl.constexpr,
- BLOCK_N2: tl.constexpr,
- BLK_SLICE_FACTOR: tl.constexpr,
- ):
+ def _attn_bwd(Q, K, V, sm_scale,
+ DO,
+ DQ, DK, DV,
+ M, D,
+ # shared by Q/K/V/DO.
+ stride_z, stride_h, stride_tok, stride_d,
+ # H = 16, N_CTX = 1024
+ H, N_CTX,
+ BLOCK_DMODEL: tl.constexpr,
+ BLOCK_M1: tl.constexpr,
+ BLOCK_N1: tl.constexpr,
+ BLOCK_M2: tl.constexpr,
+ BLOCK_N2: tl.constexpr,
+ BLK_SLICE_FACTOR: tl.constexpr):
  LN2: tl.constexpr = 0.6931471824645996 # = ln(2)
 
  bhid = tl.program_id(2)
@@ -661,54 +433,31 @@ def _attn_bwd(
 
  num_steps = BLOCK_N1 // MASK_BLOCK_M1
 
- dk, dv = _attn_bwd_dkdv(
- dk,
- dv,
- Q,
- k,
- v,
- sm_scale,
- DO,
- M,
- D,
- stride_tok,
- stride_d,
- H,
- N_CTX,
- MASK_BLOCK_M1,
- BLOCK_N1,
- BLOCK_DMODEL,
- start_n,
- start_m,
- num_steps,
- MASK=True,
- )
+ dk, dv = _attn_bwd_dkdv(dk, dv,
+ Q, k, v, sm_scale,
+ DO,
+ M, D,
+ stride_tok, stride_d,
+ H, N_CTX,
+ MASK_BLOCK_M1, BLOCK_N1, BLOCK_DMODEL,
+ start_n, start_m, num_steps,
+ MASK=True
+ )
 
  start_m += num_steps * MASK_BLOCK_M1
  num_steps = (N_CTX - start_m) // BLOCK_M1
 
  # Compute dK and dV for non-masked blocks.
  dk, dv = _attn_bwd_dkdv(
- dk,
- dv,
- Q,
- k,
- v,
- sm_scale,
+ dk, dv,
+ Q, k, v, sm_scale,
  DO,
- M,
- D,
- stride_tok,
- stride_d,
- H,
- N_CTX,
- BLOCK_M1,
- BLOCK_N1,
- BLOCK_DMODEL,
- start_n,
- start_m,
- num_steps,
- MASK=False,
+ M, D,
+ stride_tok, stride_d,
+ H, N_CTX,
+ BLOCK_M1, BLOCK_N1, BLOCK_DMODEL,
+ start_n, start_m, num_steps,
+ MASK=False
  )
 
  DV_block_ptrs = tl.make_block_ptr(
@@ -717,7 +466,7 @@ def _attn_bwd(
  strides=(stride_tok, stride_d),
  offsets=(start_n, 0),
  block_shape=(BLOCK_N1, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
  tl.store(DV_block_ptrs, dv.to(tl.float16))
 
@@ -729,7 +478,7 @@ def _attn_bwd(
  strides=(stride_tok, stride_d),
  offsets=(start_n, 0),
  block_shape=(BLOCK_N1, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
  tl.store(DK_block_ptrs, dk.to(tl.float16))
 
@@ -746,7 +495,7 @@ def _attn_bwd(
  strides=(stride_tok, stride_d),
  offsets=(start_m, 0),
  block_shape=(BLOCK_M2, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
 
  DO_block_ptr = tl.make_block_ptr(
@@ -755,7 +504,7 @@ def _attn_bwd(
  strides=(stride_tok, stride_d),
  offsets=(start_m, 0),
  block_shape=(BLOCK_M2, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
  q = tl.load(Q_block_ptr)
  do = tl.load(DO_block_ptr)
@@ -770,49 +519,25 @@ def _attn_bwd(
  # not due to anything important. I just wanted to reuse the loop
  # structure for dK & dV above as much as possible.
  num_steps = BLOCK_M2 // MASK_BLOCK_N2
- dq = _attn_bwd_dq(
- dq,
- q,
- K,
- V,
- do,
- m,
- D,
- stride_tok,
- stride_d,
- H,
- N_CTX,
- BLOCK_M2,
- MASK_BLOCK_N2,
- BLOCK_DMODEL,
- start_m,
- end_n - num_steps * MASK_BLOCK_N2,
- num_steps,
- MASK=True,
- )
+ dq = _attn_bwd_dq(dq, q, K, V,
+ do, m, D,
+ stride_tok, stride_d,
+ H, N_CTX,
+ BLOCK_M2, MASK_BLOCK_N2, BLOCK_DMODEL,
+ start_m, end_n - num_steps * MASK_BLOCK_N2, num_steps,
+ MASK=True
+ )
  end_n -= num_steps * MASK_BLOCK_N2
  # stage 2
  num_steps = end_n // BLOCK_N2
- dq = _attn_bwd_dq(
- dq,
- q,
- K,
- V,
- do,
- m,
- D,
- stride_tok,
- stride_d,
- H,
- N_CTX,
- BLOCK_M2,
- BLOCK_N2,
- BLOCK_DMODEL,
- start_m,
- end_n - num_steps * BLOCK_N2,
- num_steps,
- MASK=False,
- )
+ dq = _attn_bwd_dq(dq, q, K, V,
+ do, m, D,
+ stride_tok, stride_d,
+ H, N_CTX,
+ BLOCK_M2, BLOCK_N2, BLOCK_DMODEL,
+ start_m, end_n - num_steps * BLOCK_N2, num_steps,
+ MASK=False
+ )
  # Write back dQ.
  DQ_block_ptr = tl.make_block_ptr(
  base=DQ,
@@ -820,7 +545,7 @@ def _attn_bwd(
  strides=(stride_tok, stride_d),
  offsets=(start_m, 0),
  block_shape=(BLOCK_M2, BLOCK_DMODEL),
- order=(1, 0),
+ order=(1, 0)
  )
  dq *= LN2
  tl.store(DQ_block_ptr, dq.to(tl.float16))
@@ -849,41 +574,20 @@ class _attention(torch.autograd.Function):
  num_stages = 7 if Lk >= 64 else 3
  stage = 3 if causal else 1
 
- def grid(META):
- return (
- triton.cdiv(q.shape[2], META["BLOCK_M"]),
- q.shape[0] * q.shape[1],
- 1,
- )
-
- M = torch.empty(
- (q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32
+ def grid(META): return (
+ triton.cdiv(q.shape[2], META['BLOCK_M']),
+ q.shape[0] * q.shape[1],
+ 1
  )
+ M = torch.empty((q.shape[0] * q.shape[1], q.shape[2]),
+ device=q.device, dtype=torch.float32)
  _attn_fwd[grid](
- q,
- k,
- v,
- sm_scale,
- M,
- o,
- q.stride(0),
- q.stride(1),
- q.stride(2),
- q.stride(3),
- k.stride(0),
- k.stride(1),
- k.stride(2),
- k.stride(3),
- v.stride(0),
- v.stride(1),
- v.stride(2),
- v.stride(3),
- o.stride(0),
- o.stride(1),
- o.stride(2),
- o.stride(3),
- q.shape[0],
- q.shape[1],
+ q, k, v, sm_scale, M, o,
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+ q.shape[0], q.shape[1],
  N_CTX=q.shape[2],
  BLOCK_DMODEL=Lk,
  STAGE=stage,
@@ -925,39 +629,26 @@ class _attention(torch.autograd.Function):
  pre_grid = (N_CTX // PRE_BLOCK, BATCH * N_HEAD)
  delta = torch.empty_like(M)
  _attn_bwd_preprocess[pre_grid](
- o,
- do,
+ o, do,
  delta,
- BATCH,
- N_HEAD,
- N_CTX,
- BLOCK_M=PRE_BLOCK,
- D_HEAD=ctx.BLOCK_DMODEL,
+ BATCH, N_HEAD, N_CTX,
+ BLOCK_M=PRE_BLOCK, D_HEAD=ctx.BLOCK_DMODEL
  )
 
- def grid(META):
- return (triton.cdiv(N_CTX, META["BLOCK_N1"]), 1, BATCH * N_HEAD)
-
+ def grid(META): return (
+ triton.cdiv(N_CTX, META['BLOCK_N1']),
+ 1,
+ BATCH * N_HEAD
+ )
  _attn_bwd[grid](
- q,
- arg_k,
- v,
- ctx.sm_scale,
- do,
- dq,
- dk,
- dv,
- M,
- delta,
- q.stride(0),
- q.stride(1),
- q.stride(2),
- q.stride(3),
- N_HEAD,
- N_CTX,
- BLOCK_DMODEL=ctx.BLOCK_DMODEL,
+ q, arg_k, v, ctx.sm_scale, do, dq, dk, dv,
+ M, delta,
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+ N_HEAD, N_CTX,
+ BLOCK_DMODEL=ctx.BLOCK_DMODEL
  )
 
  return dq, dk, dv, None, None
 
 
+ attention = _attention.apply
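
For orientation, the new final line exposes the kernels through a regular torch.autograd.Function wrapper, so the op is called like any other PyTorch function. Below is a minimal usage sketch, not part of the commit: the argument order attention(q, k, v, causal, sm_scale) is an assumption inferred from the five gradients returned by backward and from the causal/sm_scale names used in the wrapper, and the import path simply mirrors this repository's file name.

# Hypothetical usage sketch, not part of the commit. Assumes a GPU with Triton
# available and the forward signature attention(q, k, v, causal, sm_scale).
import math
import torch

from triton_flash_atn import attention  # module name taken from this repo's file name

BATCH, N_HEAD, N_CTX, D_HEAD = 2, 16, 1024, 64  # matches the [Z, H, N_CTX, BLOCK_DMODEL] layout implied by the strides
q = torch.randn(BATCH, N_HEAD, N_CTX, D_HEAD, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn_like(q, requires_grad=True)
v = torch.randn_like(q, requires_grad=True)

sm_scale = 1.0 / math.sqrt(D_HEAD)        # softmax scaling forwarded to _attn_fwd
out = attention(q, k, v, True, sm_scale)  # causal=True selects stage = 3 in the wrapper
out.sum().backward()                      # exercises _attn_bwd_preprocess and _attn_bwd

Because backward returns dq, dk, dv, None, None, the causal flag and sm_scale behave as non-differentiable hyperparameters of the op.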