import math from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F from conformer import ConformerBlock from diffusers.models.activations import get_activation from einops import pack, rearrange, repeat from matcha.models.components.transformer import BasicTransformerBlock class SinusoidalPosEmb(torch.nn.Module): def __init__(self, dim): super().__init__() self.dim = dim assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even" def forward(self, x, scale=1000): if x.ndim < 1: x = x.unsqueeze(0) device = x.device half_dim = self.dim // 2 emb = math.log(10000) / (half_dim - 1) emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) emb = scale * x.unsqueeze(1) * emb.unsqueeze(0) emb = torch.cat((emb.sin(), emb.cos()), dim=-1) return emb class Block1D(torch.nn.Module): def __init__(self, dim, dim_out, groups=8): super().__init__() self.block = torch.nn.Sequential( torch.nn.Conv1d(dim, dim_out, 3, padding=1), torch.nn.GroupNorm(groups, dim_out), nn.Mish(), ) def forward(self, x, mask): output = self.block(x * mask) return output * mask class ResnetBlock1D(torch.nn.Module): def __init__(self, dim, dim_out, time_emb_dim, groups=8): super().__init__() self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)) self.block1 = Block1D(dim, dim_out, groups=groups) self.block2 = Block1D(dim_out, dim_out, groups=groups) self.res_conv = torch.nn.Conv1d(dim, dim_out, 1) def forward(self, x, mask, time_emb): h = self.block1(x, mask) h += self.mlp(time_emb).unsqueeze(-1) h = self.block2(h, mask) output = h + self.res_conv(x * mask) return output class Downsample1D(nn.Module): def __init__(self, dim): super().__init__() self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1) def forward(self, x): return self.conv(x) class TimestepEmbedding(nn.Module): def __init__( self, in_channels: int, time_embed_dim: int, act_fn: str = "silu", out_dim: int = None, post_act_fn: Optional[str] = None, cond_proj_dim=None, ): super().__init__() self.linear_1 = nn.Linear(in_channels, time_embed_dim) if cond_proj_dim is not None: self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) else: self.cond_proj = None self.act = get_activation(act_fn) if out_dim is not None: time_embed_dim_out = out_dim else: time_embed_dim_out = time_embed_dim self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) if post_act_fn is None: self.post_act = None else: self.post_act = get_activation(post_act_fn) def forward(self, sample, condition=None): if condition is not None: sample = sample + self.cond_proj(condition) sample = self.linear_1(sample) if self.act is not None: sample = self.act(sample) sample = self.linear_2(sample) if self.post_act is not None: sample = self.post_act(sample) return sample class Upsample1D(nn.Module): """A 1D upsampling layer with an optional convolution. Parameters: channels (`int`): number of channels in the inputs and outputs. use_conv (`bool`, default `False`): option to use a convolution. use_conv_transpose (`bool`, default `False`): option to use a convolution transpose. out_channels (`int`, optional): number of output channels. Defaults to `channels`. """ def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv self.use_conv_transpose = use_conv_transpose self.name = name self.conv = None if use_conv_transpose: self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1) elif use_conv: self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) def forward(self, inputs): assert inputs.shape[1] == self.channels if self.use_conv_transpose: return self.conv(inputs) outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") if self.use_conv: outputs = self.conv(outputs) return outputs class ConformerWrapper(ConformerBlock): def __init__( # pylint: disable=useless-super-delegation self, *, dim, dim_head=64, heads=8, ff_mult=4, conv_expansion_factor=2, conv_kernel_size=31, attn_dropout=0, ff_dropout=0, conv_dropout=0, conv_causal=False, ): super().__init__( dim=dim, dim_head=dim_head, heads=heads, ff_mult=ff_mult, conv_expansion_factor=conv_expansion_factor, conv_kernel_size=conv_kernel_size, attn_dropout=attn_dropout, ff_dropout=ff_dropout, conv_dropout=conv_dropout, conv_causal=conv_causal, ) def forward( self, hidden_states, attention_mask, encoder_hidden_states=None, encoder_attention_mask=None, timestep=None, ): return super().forward(x=hidden_states, mask=attention_mask.bool()) class Decoder(nn.Module): def __init__( self, in_channels, out_channels, channels=(256, 256), dropout=0.05, attention_head_dim=64, n_blocks=1, num_mid_blocks=2, num_heads=4, act_fn="snake", down_block_type="transformer", mid_block_type="transformer", up_block_type="transformer", ): super().__init__() channels = tuple(channels) self.in_channels = in_channels self.out_channels = out_channels self.time_embeddings = SinusoidalPosEmb(in_channels) time_embed_dim = channels[0] * 4 self.time_mlp = TimestepEmbedding( in_channels=in_channels, time_embed_dim=time_embed_dim, act_fn="silu", ) self.down_blocks = nn.ModuleList([]) self.mid_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) output_channel = in_channels for i in range(len(channels)): # pylint: disable=consider-using-enumerate input_channel = output_channel output_channel = channels[i] is_last = i == len(channels) - 1 resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ self.get_block( down_block_type, output_channel, attention_head_dim, num_heads, dropout, act_fn, ) for _ in range(n_blocks) ] ) downsample = ( Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) for i in range(num_mid_blocks): input_channel = channels[-1] out_channels = channels[-1] resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) transformer_blocks = nn.ModuleList( [ self.get_block( mid_block_type, output_channel, attention_head_dim, num_heads, dropout, act_fn, ) for _ in range(n_blocks) ] ) self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) channels = channels[::-1] + (channels[0],) for i in range(len(channels) - 1): input_channel = channels[i] output_channel = channels[i + 1] is_last = i == len(channels) - 2 resnet = ResnetBlock1D( dim=2 * input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim, ) transformer_blocks = nn.ModuleList( [ self.get_block( up_block_type, output_channel, attention_head_dim, num_heads, dropout, act_fn, ) for _ in range(n_blocks) ] ) upsample = ( Upsample1D(output_channel, use_conv_transpose=True) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) ) self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) self.final_block = Block1D(channels[-1], channels[-1]) self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) self.initialize_weights() # nn.init.normal_(self.final_proj.weight) @staticmethod def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn): if block_type == "conformer": block = ConformerWrapper( dim=dim, dim_head=attention_head_dim, heads=num_heads, ff_mult=1, conv_expansion_factor=2, ff_dropout=dropout, attn_dropout=dropout, conv_dropout=dropout, conv_kernel_size=31, ) elif block_type == "transformer": block = BasicTransformerBlock( dim=dim, num_attention_heads=num_heads, attention_head_dim=attention_head_dim, dropout=dropout, activation_fn=act_fn, ) else: raise ValueError(f"Unknown block type {block_type}") return block def initialize_weights(self): for m in self.modules(): if isinstance(m, nn.Conv1d): nn.init.kaiming_normal_(m.weight, nonlinearity="relu") if m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.GroupNorm): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): nn.init.kaiming_normal_(m.weight, nonlinearity="relu") if m.bias is not None: nn.init.constant_(m.bias, 0) def forward(self, x, mask, mu, t, spks=None, cond=None): """Forward pass of the UNet1DConditional model. Args: x (torch.Tensor): shape (batch_size, in_channels, time) mask (_type_): shape (batch_size, 1, time) t (_type_): shape (batch_size) spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. cond (_type_, optional): placeholder for future use. Defaults to None. Raises: ValueError: _description_ ValueError: _description_ Returns: _type_: _description_ """ t = self.time_embeddings(t) t = self.time_mlp(t) x = pack([x, mu], "b * t")[0] if spks is not None: spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) x = pack([x, spks], "b * t")[0] hiddens = [] masks = [mask] for resnet, transformer_blocks, downsample in self.down_blocks: mask_down = masks[-1] x = resnet(x, mask_down, t) x = rearrange(x, "b c t -> b t c") mask_down = rearrange(mask_down, "b 1 t -> b t") for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, attention_mask=mask_down, timestep=t, ) x = rearrange(x, "b t c -> b c t") mask_down = rearrange(mask_down, "b t -> b 1 t") hiddens.append(x) # Save hidden states for skip connections x = downsample(x * mask_down) masks.append(mask_down[:, :, ::2]) masks = masks[:-1] mask_mid = masks[-1] for resnet, transformer_blocks in self.mid_blocks: x = resnet(x, mask_mid, t) x = rearrange(x, "b c t -> b t c") mask_mid = rearrange(mask_mid, "b 1 t -> b t") for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, attention_mask=mask_mid, timestep=t, ) x = rearrange(x, "b t c -> b c t") mask_mid = rearrange(mask_mid, "b t -> b 1 t") for resnet, transformer_blocks, upsample in self.up_blocks: mask_up = masks.pop() x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t) x = rearrange(x, "b c t -> b t c") mask_up = rearrange(mask_up, "b 1 t -> b t") for transformer_block in transformer_blocks: x = transformer_block( hidden_states=x, attention_mask=mask_up, timestep=t, ) x = rearrange(x, "b t c -> b c t") mask_up = rearrange(mask_up, "b t -> b 1 t") x = upsample(x * mask_up) x = self.final_block(x, mask_up) output = self.final_proj(x * mask_up) return output * mask