# Live2Diff: live2diff/animatediff/models/stream_motion_module.py
import torch
import torch.nn.functional as F
from einops import rearrange

from .attention import CrossAttention
from .positional_encoding import PositionalEncoding


class StreamTemporalAttention(CrossAttention):
    """Temporal self-attention with a windowed kv-cache for streaming inference.

    * window_size: the maximum length of the attention window.
    * sink_size: the number of sink tokens.
    * positional_rule: absolute or relative.

    The sequence length of the temporal self-attention is therefore:
        sink_size + cache_size (= window_size)
    """

    def __init__(
self,
attention_mode=None,
cross_frame_attention_mode=None,
temporal_position_encoding=False,
temporal_position_encoding_max_len=32,
window_size=8,
sink_size=0,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.attention_mode = self._orig_attention_mode = attention_mode
self.is_cross_attention = kwargs["cross_attention_dim"] is not None
self.pos_encoder = PositionalEncoding(
kwargs["query_dim"],
dropout=0.0,
max_len=temporal_position_encoding_max_len,
)
self.window_size = window_size
self.sink_size = sink_size
self.cache_size = self.window_size - self.sink_size
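        # The attention window is made up of `sink_size` sink tokens and `cache_size`
        # rolling cache entries (window_size = sink_size + cache_size).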
        assert self.cache_size >= 0, (
            "cache_size must be greater than or equal to 0. Please check your configuration. "
            f"window_size: {window_size}, sink_size: {sink_size}, "
            f"cache_size: {self.cache_size}"
        )
        self.motion_module_idx = None

    def set_index(self, idx):
        self.motion_module_idx = idx

    @torch.no_grad()
def set_cache(self, denoising_steps_num: int):
"""
larger buffer index means cleaner latent
"""
device = next(self.parameters()).device
dtype = next(self.parameters()).dtype
# [t, 2, hw, L, c], 2 means k and v
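        # t = denoising_steps_num (one cache slot per denoising step),
        # hw = number of spatial tokens, L = window_size, c = kv channels.
        # Note: self.h, self.w and self.kv_channels are assumed to be set on the
        # module elsewhere before `set_cache` is called.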
kv_cache = torch.zeros(
denoising_steps_num,
2,
self.h * self.w,
self.window_size,
self.kv_channels,
device=device,
dtype=dtype,
)
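        # The cache tensor is returned to the caller rather than stored on the
        # module; only `denoising_steps_num` is kept as state here.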
self.denoising_steps_num = denoising_steps_num
        return kv_cache

    @torch.no_grad()
def prepare_pe_buffer(self):
"""In AnimateDiff, Temporal Self-attention use absolute positional encoding:
q = w_q * (x + pe) + bias
k = w_k * (x + pe) + bias
v = w_v * (x + pe) + bias
If we want to conduct relative positional encoding with kv-cache, we should pre-calcute
`w_q/k/v * pe` and then cache `w_q/k/v * x + bias`
"""
pe_list = self.pos_encoder.pe[:, : self.window_size] # [1, window_size, ch]
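        # F.linear without a bias gives exactly `w_{q,k,v} * pe`; the bias terms
        # stay with the cached `w * x + bias` projections (see docstring above).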
q_pe = F.linear(pe_list, self.to_q.weight)
k_pe = F.linear(pe_list, self.to_k.weight)
v_pe = F.linear(pe_list, self.to_v.weight)
self.register_buffer("q_pe", q_pe)
self.register_buffer("k_pe", k_pe)
self.register_buffer("v_pe", v_pe)
def prepare_qkv_full_and_cache(self, hidden_states, kv_cache, pe_idx, update_idx):
"""
hidden_states: [(N * bhw), F, c],
kv_cache: [2, N, hw, L, c]
* for warmup case: `N` should be 1 and `F` should be warmup_size (`sink_size`)
* for streaming case: `N` should be `denoising_steps_num` and `F` should be `chunk_size`
"""
q_layer = self.to_q(hidden_states)
k_layer = self.to_k(hidden_states)
v_layer = self.to_v(hidden_states)
q_layer = rearrange(q_layer, "(n bhw) f c -> n bhw f c", n=self.denoising_steps_num)
k_layer = rearrange(k_layer, "(n bhw) f c -> n bhw f c", n=self.denoising_steps_num)
v_layer = rearrange(v_layer, "(n bhw) f c -> n bhw f c", n=self.denoising_steps_num)
# onnx & trt friendly indexing
for idx in range(self.denoising_steps_num):
kv_cache[idx, 0, :, update_idx[idx]] = k_layer[idx, :, 0]
kv_cache[idx, 1, :, update_idx[idx]] = v_layer[idx, :, 0]
k_full = kv_cache[:, 0]
v_full = kv_cache[:, 1]
kv_idx = pe_idx
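        # kv_idx maps every cache slot to its positional-encoding index; q_idx picks,
        # per denoising step, the index of the slot that was just written, so the
        # query gets the PE matching its position in the window.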
q_idx = torch.stack([kv_idx[idx, update_idx[idx]] for idx in range(self.denoising_steps_num)]).unsqueeze_(
1
) # [timesteps, 1]
pe_k = torch.cat(
[self.k_pe.index_select(1, kv_idx[idx]) for idx in range(self.denoising_steps_num)], dim=0
) # [n, window_size, c]
pe_v = torch.cat(
[self.v_pe.index_select(1, kv_idx[idx]) for idx in range(self.denoising_steps_num)], dim=0
) # [n, window_size, c]
        pe_q = torch.cat(
            [self.q_pe.index_select(1, q_idx[idx]) for idx in range(self.denoising_steps_num)], dim=0
        )  # [n, 1, c]
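        # Add the positional encodings only after projection and caching, so the
        # cached K/V stay PE-free and can be re-indexed as the window slides.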
q_layer = q_layer + pe_q.unsqueeze(1)
k_full = k_full + pe_k.unsqueeze(1)
v_full = v_full + pe_v.unsqueeze(1)
q_layer = rearrange(q_layer, "n bhw f c -> (n bhw) f c")
k_full = rearrange(k_full, "n bhw f c -> (n bhw) f c")
v_full = rearrange(v_full, "n bhw f c -> (n bhw) f c")
        return q_layer, k_full, v_full

    def forward(
self,
hidden_states,
encoder_hidden_states=None,
attention_mask=None,
video_length=None,
temporal_attention_mask=None,
kv_cache=None,
pe_idx=None,
update_idx=None,
*args,
**kwargs,
):
"""
temporal_attention_mask: attention mask specific for the temporal self-attention.
"""
d = hidden_states.shape[1]
hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
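        # Fold the spatial dimension into the batch so that attention runs along the
        # frame axis independently for every spatial location.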
if self.group_norm is not None:
hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
query_layer, key_full, value_full = self.prepare_qkv_full_and_cache(
hidden_states, kv_cache, pe_idx, update_idx
)
# [(n * hw * b), f, c] -> [(n * hw * b * head), f, c // head]
query_layer = self.reshape_heads_to_batch_dim(query_layer)
key_full = self.reshape_heads_to_batch_dim(key_full)
value_full = self.reshape_heads_to_batch_dim(value_full)
if temporal_attention_mask is not None:
q_size = query_layer.shape[1]
# [n, self.window_size] -> [n, hw, q_size, window_size]
temporal_attention_mask_ = temporal_attention_mask[:, None, None, :].repeat(1, self.h * self.w, q_size, 1)
temporal_attention_mask_ = rearrange(temporal_attention_mask_, "n hw Q KV -> (n hw) Q KV")
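            # Repeat the mask per attention head to match the head-flattened batch
            # layout produced by `reshape_heads_to_batch_dim`.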
temporal_attention_mask_ = temporal_attention_mask_.repeat_interleave(self.heads, dim=0)
else:
temporal_attention_mask_ = None
# attention, what we cannot get enough of
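        # Prefer PyTorch 2.0's scaled_dot_product_attention when available, then fall
        # back to xformers memory-efficient attention, then the vanilla implementation.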
if hasattr(F, "scaled_dot_product_attention"):
hidden_states = self._memory_efficient_attention_pt20(
query_layer, key_full, value_full, attention_mask=temporal_attention_mask_
)
elif self._use_memory_efficient_attention_xformers:
hidden_states = self._memory_efficient_attention_xformers(
query_layer, key_full, value_full, attention_mask=temporal_attention_mask_
)
# Some versions of xformers return output in fp32, cast it back to the dtype of the input
hidden_states = hidden_states.to(query_layer.dtype)
else:
hidden_states = self._attention(query_layer, key_full, value_full, temporal_attention_mask_)
# linear proj
hidden_states = self.to_out[0](hidden_states)
# dropout
hidden_states = self.to_out[1](hidden_states)
hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
return hidden_states