# Spaces: Paused
# Adapted from Diffusers and Open-Sora-Plan
import torch | |
from diffusers.utils import logging | |
logger = logging.get_logger(__name__) | |
class PositionGetter3D(object):
    """Return (and cache) the integer (t, y, x) grid position of every patch.

    Calling an instance returns ``(poses, max_poses)``:

    * ``poses`` -- tuple of three contiguous integer tensors, each of shape
      ``(b, t * h * w)``, holding the t, y and x index of every patch.
      Patches are enumerated with ``t`` varying slowest and ``x`` fastest.
    * ``max_poses`` -- tuple of three Python ints with the largest index on
      each axis, i.e. ``(t - 1, h - 1, w - 1)``.
    """

    def __init__(self):
        # Maps (b, t, h, w, device) -> (poses, max_poses).
        self.cache_positions = {}

    def __call__(self, b, t, h, w, device):
        """Return the cached ``(poses, max_poses)`` for a ``t x h x w`` grid.

        Args:
            b: batch size; the position grid is repeated ``b`` times.
            t, h, w: number of patches along time, height and width.
            device: device on which the position tensors are created.
        """
        # The device is part of the key: otherwise a second caller using a
        # different device would receive tensors living on the first device.
        key = (b, t, h, w, device)
        cached = self.cache_positions.get(key)
        if cached is None:
            x = torch.arange(w, device=device)
            y = torch.arange(h, device=device)
            z = torch.arange(t, device=device)
            # Rows are (t, y, x) triples, t slowest / x fastest.
            grid = torch.cartesian_prod(z, y, x).reshape(t * h * w, 3)
            poses = tuple(
                grid[:, axis].unsqueeze(0).expand(b, -1).contiguous()
                for axis in range(3)
            )
            # arange(n) tops out at n - 1, so the maxima are known without
            # reading tensor data back (no host/device synchronisation).
            max_poses = (int(t) - 1, int(h) - 1, int(w) - 1)
            cached = (poses, max_poses)
            self.cache_positions[key] = cached
        return cached
class RoPE3D(torch.nn.Module):
    """Rotary position embedding applied independently along t, y and x.

    The feature dimension of the tokens is split into three equal chunks;
    each chunk is rotated according to the token's position on one axis.

    Args:
        freq: base of the geometric frequency progression.
        F0: stored on the module; not used by any method in this class.
        interpolation_scale_thw: per-axis divisors ``(t, h, w)`` applied to
            the integer positions before the angles are computed.
    """

    def __init__(self, freq=10000.0, F0=1.0, interpolation_scale_thw=(1, 1, 1)):
        super().__init__()
        self.base = freq
        self.F0 = F0
        self.interpolation_scale_t = interpolation_scale_thw[0]
        self.interpolation_scale_h = interpolation_scale_thw[1]
        self.interpolation_scale_w = interpolation_scale_thw[2]
        # Maps (D, seq_len, device, dtype, interpolation_scale) -> (cos, sin).
        self.cache = {}

    def get_cos_sin(self, D, seq_len, device, dtype, interpolation_scale=1):
        """Return cached ``(cos, sin)`` tables, each of shape ``(seq_len, D)``.

        Row ``p`` holds the angles for position ``p / interpolation_scale``.
        ``D`` is expected to be even: ``D // 2`` frequencies are generated
        and duplicated to fill the ``D`` columns.
        """
        # The scale is part of the key: two axes with the same length but
        # different interpolation scales must not share a table.
        key = (D, seq_len, device, dtype, interpolation_scale)
        if key not in self.cache:
            inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
            t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
            freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
            freqs = torch.cat((freqs, freqs), dim=-1)
            self.cache[key] = (freqs.cos(), freqs.sin())  # each (Seq, Dim)
        return self.cache[key]

    @staticmethod
    def rotate_half(x):
        """Map the last dimension ``[x1, x2]`` (two halves) to ``[-x2, x1]``."""
        half = x.shape[-1] // 2
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat((-x2, x1), dim=-1)

    def apply_rope1d(self, tokens, pos1d, cos, sin):
        """Rotate ``tokens`` by the table rows selected through ``pos1d``.

        Args:
            tokens: batch_size x nheads x ntokens x dim
            pos1d: batch_size x ntokens integer positions along one axis
            cos, sin: lookup tables as returned by ``get_cos_sin``
        """
        assert pos1d.ndim == 2
        # Lookup gives (batch_size x ntokens x dim); the inserted axis 1
        # broadcasts over the heads of (batch_size x nheads x ntokens x dim).
        cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
        sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
        return (tokens * cos) + (self.rotate_half(tokens) * sin)

    def forward(self, tokens, positions):
        """
        input:
            * tokens: batch_size x nheads x ntokens x dim
            * positions: pair (poses, max_poses) as returned by
              PositionGetter3D -- poses is a tuple of three
              batch_size x ntokens integer tensors (t, y and x position of
              each token), max_poses the largest position on each axis
        output:
            * tokens after applying RoPE3D (batch_size x nheads x ntokens x dim)
        """
        assert tokens.size(3) % 3 == 0, "number of dimensions should be a multiple of three"
        D = tokens.size(3) // 3
        poses, max_poses = positions
        assert len(poses) == 3 and poses[0].ndim == 2  # three (Batch, Seq) tensors
        cos_t, sin_t = self.get_cos_sin(D, max_poses[0] + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
        cos_y, sin_y = self.get_cos_sin(D, max_poses[1] + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
        cos_x, sin_x = self.get_cos_sin(D, max_poses[2] + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
        # split features into three along the feature dimension, and apply rope1d on each third
        t, y, x = tokens.chunk(3, dim=-1)
        t = self.apply_rope1d(t, poses[0], cos_t, sin_t)
        y = self.apply_rope1d(y, poses[1], cos_y, sin_y)
        x = self.apply_rope1d(x, poses[2], cos_x, sin_x)
        return torch.cat((t, y, x), dim=-1)