|
|
|
import math |
|
from typing import Tuple |
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
|
|
from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d |
|
from TTS.tts.layers.delightful_tts.networks import GLUActivation |
|
|
|
|
|
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
    """Return (left, right) padding so a 1D convolution preserves the input length ("same" padding)."""
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)
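# Worked examples (illustrative only, not part of the original module):
#   calc_same_padding(7) -> (3, 3)   # odd kernels pad symmetrically
#   calc_same_padding(4) -> (2, 1)   # even kernels pad one less on the right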
|
|
|
|
|
class Conformer(nn.Module): |
|
def __init__( |
|
self, |
|
dim: int, |
|
n_layers: int, |
|
n_heads: int, |
|
speaker_embedding_dim: int, |
|
p_dropout: float, |
|
kernel_size_conv_mod: int, |
|
lrelu_slope: float, |
|
): |
|
""" |
|
A Transformer variant that integrates both CNNs and Transformers components. |
|
Conformer proposes a novel combination of self-attention and convolution, in which self-attention |
|
learns the global interaction while the convolutions efficiently capture the local correlations. |
|
|
|
Args: |
|
dim (int): Number of the dimensions for the model. |
|
n_layers (int): Number of model layers. |
|
n_heads (int): The number of attention heads. |
|
speaker_embedding_dim (int): Number of speaker embedding dimensions. |
|
p_dropout (float): Probabilty of dropout. |
|
kernel_size_conv_mod (int): Size of kernels for convolution modules. |
|
|
|
Inputs: inputs, mask |
|
- **inputs** (batch, time, dim): Tensor containing input vector |
|
- **encoding** (batch, time, dim): Positional embedding tensor |
|
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked |
|
Returns: |
|
- **outputs** (batch, time, dim): Tensor produced by Conformer Encoder. |
|
""" |
|
super().__init__() |
|
d_k = d_v = dim // n_heads |
|
self.layer_stack = nn.ModuleList( |
|
[ |
|
ConformerBlock( |
|
dim, |
|
n_heads, |
|
d_k, |
|
d_v, |
|
kernel_size_conv_mod=kernel_size_conv_mod, |
|
dropout=p_dropout, |
|
speaker_embedding_dim=speaker_embedding_dim, |
|
lrelu_slope=lrelu_slope, |
|
) |
|
for _ in range(n_layers) |
|
] |
|
) |
|
|
|
def forward( |
|
self, |
|
x: torch.Tensor, |
|
mask: torch.Tensor, |
|
speaker_embedding: torch.Tensor, |
|
encoding: torch.Tensor, |
|
) -> torch.Tensor: |
|
""" |
|
Shapes: |
|
- x: :math:`[B, T_src, C]` |
|
- mask: :math: `[B]` |
|
- speaker_embedding: :math: `[B, C]` |
|
- encoding: :math: `[B, T_max2, C]` |
|
""" |
|
|
|
        attn_mask = mask.view((mask.shape[0], 1, 1, mask.shape[1]))  # [B, T_src] -> [B, 1, 1, T_src], broadcastable over heads and queries
|
for enc_layer in self.layer_stack: |
|
x = enc_layer( |
|
x, |
|
mask=mask, |
|
slf_attn_mask=attn_mask, |
|
speaker_embedding=speaker_embedding, |
|
encoding=encoding, |
|
) |
|
return x |
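# A minimal usage sketch (illustrative only: the dimensions are assumptions and the input
# shapes follow the docstrings above; the positional encoding is passed with a batch
# dimension of 1, since it is sliced and tiled over the batch inside the attention module):
#
#   encoder = Conformer(
#       dim=384, n_layers=6, n_heads=4, speaker_embedding_dim=256,
#       p_dropout=0.1, kernel_size_conv_mod=7, lrelu_slope=0.3,
#   )
#   x = torch.randn(2, 120, 384)                     # [B, T_src, C]
#   mask = torch.zeros(2, 120, dtype=torch.bool)     # True marks padded positions
#   spk = torch.randn(2, 256)                        # [B, C] speaker embedding
#   enc = torch.randn(1, 120, 384)                   # positional encoding table
#   y = encoder(x, mask=mask, speaker_embedding=spk, encoding=enc)   # [B, T_src, C]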
|
|
|
|
|
class ConformerBlock(torch.nn.Module): |
|
def __init__( |
|
self, |
|
d_model: int, |
|
n_head: int, |
|
d_k: int, |
|
d_v: int, |
|
kernel_size_conv_mod: int, |
|
speaker_embedding_dim: int, |
|
dropout: float, |
|
lrelu_slope: float = 0.3, |
|
): |
|
""" |
|
A Conformer block is composed of four modules stacked together, |
|
A feed-forward module, a self-attention module, a convolution module, |
|
and a second feed-forward module in the end. The block starts with two Feed forward |
|
modules sandwiching the Multi-Headed Self-Attention module and the Conv module. |
|
|
|
Args: |
|
d_model (int): The dimension of model |
|
n_head (int): The number of attention heads. |
|
kernel_size_conv_mod (int): Size of kernels for convolution modules. |
|
speaker_embedding_dim (int): Number of speaker embedding dimensions. |
|
emotion_embedding_dim (int): Number of emotion embedding dimensions. |
|
dropout (float): Probabilty of dropout. |
|
|
|
Inputs: inputs, mask |
|
- **inputs** (batch, time, dim): Tensor containing input vector |
|
- **encoding** (batch, time, dim): Positional embedding tensor |
|
- **slf_attn_mask** (batch, 1, 1, time1): Tensor containing indices to be masked in self attention module |
|
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked |
|
Returns: |
|
- **outputs** (batch, time, dim): Tensor produced by the Conformer Block. |
|
""" |
|
super().__init__() |
|
if isinstance(speaker_embedding_dim, int): |
|
self.conditioning = Conv1dGLU( |
|
d_model=d_model, |
|
kernel_size=kernel_size_conv_mod, |
|
padding=kernel_size_conv_mod // 2, |
|
embedding_dim=speaker_embedding_dim, |
|
) |
|
|
|
self.ff = FeedForward(d_model=d_model, dropout=dropout, kernel_size=3, lrelu_slope=lrelu_slope) |
|
self.conformer_conv_1 = ConformerConvModule( |
|
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope |
|
) |
|
self.ln = nn.LayerNorm(d_model) |
|
self.slf_attn = ConformerMultiHeadedSelfAttention(d_model=d_model, num_heads=n_head, dropout_p=dropout) |
|
self.conformer_conv_2 = ConformerConvModule( |
|
d_model, kernel_size=kernel_size_conv_mod, dropout=dropout, lrelu_slope=lrelu_slope |
|
) |
|
|
|
def forward( |
|
self, |
|
x: torch.Tensor, |
|
speaker_embedding: torch.Tensor, |
|
mask: torch.Tensor, |
|
slf_attn_mask: torch.Tensor, |
|
encoding: torch.Tensor, |
|
) -> torch.Tensor: |
|
""" |
|
Shapes: |
|
- x: :math:`[B, T_src, C]` |
|
- mask: :math: `[B]` |
|
- slf_attn_mask: :math: `[B, 1, 1, T_src]` |
|
- speaker_embedding: :math: `[B, C]` |
|
- emotion_embedding: :math: `[B, C]` |
|
- encoding: :math: `[B, T_max2, C]` |
|
""" |
|
        if speaker_embedding is not None:
            x = self.conditioning(x, embeddings=speaker_embedding)
        x = self.ff(x) + x  # feed-forward module, residual connection
        x = self.conformer_conv_1(x) + x  # first convolution module, residual connection
        res = x
        x = self.ln(x)  # pre-norm before self-attention
        x, _ = self.slf_attn(query=x, key=x, value=x, mask=slf_attn_mask, encoding=encoding)
        x = x + res  # self-attention residual connection
        x = x.masked_fill(mask.unsqueeze(-1), 0)  # zero out padded positions

        x = self.conformer_conv_2(x) + x  # second convolution module, residual connection
        return x
|
|
|
|
|
class FeedForward(nn.Module): |
|
def __init__( |
|
self, |
|
d_model: int, |
|
kernel_size: int, |
|
dropout: float, |
|
lrelu_slope: float, |
|
expansion_factor: int = 4, |
|
): |
|
""" |
|
Feed Forward module for conformer block. |
|
|
|
Args: |
|
d_model (int): The dimension of model. |
|
kernel_size (int): Size of the kernels for conv layers. |
|
dropout (float): probability of dropout. |
|
expansion_factor (int): The factor by which to project the number of channels. |
|
            lrelu_slope (float): The negative slope of the LeakyReLU activation.
|
|
|
Inputs: inputs |
|
- **inputs** (batch, time, dim): Tensor containing input vector |
|
Returns: |
|
- **outputs** (batch, time, dim): Tensor produced by the feed forward module. |
|
""" |
|
super().__init__() |
|
self.dropout = nn.Dropout(dropout) |
|
self.ln = nn.LayerNorm(d_model) |
|
self.conv_1 = nn.Conv1d( |
|
d_model, |
|
d_model * expansion_factor, |
|
kernel_size=kernel_size, |
|
padding=kernel_size // 2, |
|
) |
|
self.act = nn.LeakyReLU(lrelu_slope) |
|
self.conv_2 = nn.Conv1d(d_model * expansion_factor, d_model, kernel_size=1) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
""" |
|
Shapes: |
|
            x: :math:`[B, T, C]`
|
""" |
|
x = self.ln(x) |
|
x = x.permute((0, 2, 1)) |
|
x = self.conv_1(x) |
|
x = x.permute((0, 2, 1)) |
|
x = self.act(x) |
|
x = self.dropout(x) |
|
x = x.permute((0, 2, 1)) |
|
x = self.conv_2(x) |
|
x = x.permute((0, 2, 1)) |
|
x = self.dropout(x) |
|
        x = 0.5 * x  # half-step scaling; the residual add happens in the Conformer block
|
return x |
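# A minimal shape-preservation sketch (illustrative only; the dimensions are assumptions):
#
#   ff = FeedForward(d_model=384, kernel_size=3, dropout=0.1, lrelu_slope=0.3)
#   y = ff(torch.randn(2, 50, 384))   # -> [2, 50, 384], pre-scaled by 0.5 for the residual add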
|
|
|
|
|
class ConformerConvModule(nn.Module): |
|
def __init__( |
|
self, |
|
d_model: int, |
|
expansion_factor: int = 2, |
|
kernel_size: int = 7, |
|
dropout: float = 0.1, |
|
lrelu_slope: float = 0.3, |
|
): |
|
""" |
|
Convolution module for conformer. Starts with a gating machanism. |
|
a pointwise convolution and a gated linear unit (GLU). This is followed |
|
by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the convolution |
|
to help with training. it also contains an expansion factor to project the number of channels. |
|
|
|
Args: |
|
d_model (int): The dimension of model. |
|
expansion_factor (int): The factor by which to project the number of channels. |
|
kernel_size (int): Size of kernels for convolution modules. |
|
dropout (float): Probabilty of dropout. |
|
lrelu_slope (float): The slope coefficient for leaky relu activation. |
|
|
|
Inputs: inputs |
|
- **inputs** (batch, time, dim): Tensor containing input vector |
|
Returns: |
|
- **outputs** (batch, time, dim): Tensor produced by the conv module. |
|
|
|
""" |
|
super().__init__() |
|
inner_dim = d_model * expansion_factor |
|
self.ln_1 = nn.LayerNorm(d_model) |
|
self.conv_1 = PointwiseConv1d(d_model, inner_dim * 2) |
|
self.conv_act = GLUActivation(slope=lrelu_slope) |
|
self.depthwise = DepthWiseConv1d( |
|
inner_dim, |
|
inner_dim, |
|
kernel_size=kernel_size, |
|
padding=calc_same_padding(kernel_size)[0], |
|
) |
|
self.ln_2 = nn.GroupNorm(1, inner_dim) |
|
self.activation = nn.LeakyReLU(lrelu_slope) |
|
self.conv_2 = PointwiseConv1d(inner_dim, d_model) |
|
self.dropout = nn.Dropout(dropout) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
""" |
|
Shapes: |
|
            x: :math:`[B, T, C]`
|
""" |
|
x = self.ln_1(x) |
|
x = x.permute(0, 2, 1) |
|
x = self.conv_1(x) |
|
x = self.conv_act(x) |
|
x = self.depthwise(x) |
|
x = self.ln_2(x) |
|
x = self.activation(x) |
|
x = self.conv_2(x) |
|
x = x.permute(0, 2, 1) |
|
x = self.dropout(x) |
|
return x |
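# Channel flow through the module (illustrative, assuming d_model=384 and the default
# expansion_factor=2; the GLU-style gating halves the channel count):
#
#   [B, T, 384] -> LayerNorm -> permute -> PointwiseConv1d -> [B, 1536, T] -> GLU -> [B, 768, T]
#   -> DepthWiseConv1d -> GroupNorm -> LeakyReLU -> PointwiseConv1d -> [B, 384, T]
#   -> permute -> Dropout -> [B, T, 384]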
|
|
|
|
|
class ConformerMultiHeadedSelfAttention(nn.Module): |
|
""" |
|
Conformer employ multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL, |
|
the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention |
|
module to generalize better on different input length and the resulting encoder is more robust to the variance of |
|
the utterance length. Conformer use prenorm residual units with dropout which helps training |
|
and regularizing deeper models. |
|
Args: |
|
d_model (int): The dimension of model |
|
num_heads (int): The number of attention heads. |
|
dropout_p (float): probability of dropout |
|
Inputs: inputs, mask |
|
- **inputs** (batch, time, dim): Tensor containing input vector |
|
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked |
|
Returns: |
|
- **outputs** (batch, time, dim): Tensor produces by relative multi headed self attention module. |
|
""" |
|
|
|
def __init__(self, d_model: int, num_heads: int, dropout_p: float): |
|
super().__init__() |
|
self.attention = RelativeMultiHeadAttention(d_model=d_model, num_heads=num_heads) |
|
self.dropout = nn.Dropout(p=dropout_p) |
|
|
|
def forward( |
|
self, |
|
query: torch.Tensor, |
|
key: torch.Tensor, |
|
value: torch.Tensor, |
|
mask: torch.Tensor, |
|
encoding: torch.Tensor, |
|
) -> Tuple[torch.Tensor, torch.Tensor]: |
|
batch_size, seq_length, _ = key.size() |
|
encoding = encoding[:, : key.shape[1]] |
|
encoding = encoding.repeat(batch_size, 1, 1) |
|
outputs, attn = self.attention(query, key, value, pos_embedding=encoding, mask=mask) |
|
outputs = self.dropout(outputs) |
|
return outputs, attn |
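# A minimal usage sketch (illustrative only; the dimensions are assumptions). The positional
# ``encoding`` is passed with a batch dimension of 1; it is sliced to the key length and tiled
# over the batch inside this module:
#
#   mhsa = ConformerMultiHeadedSelfAttention(d_model=384, num_heads=4, dropout_p=0.1)
#   x = torch.randn(2, 50, 384)
#   enc = torch.randn(1, 100, 384)                           # sinusoidal positional table, [1, T_max, C]
#   attn_mask = torch.zeros(2, 1, 1, 50, dtype=torch.bool)   # True marks positions to ignore
#   y, attn = mhsa(query=x, key=x, value=x, mask=attn_mask, encoding=enc)   # y: [2, 50, 384]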
|
|
|
|
|
class RelativeMultiHeadAttention(nn.Module): |
|
""" |
|
Multi-head attention with relative positional encoding. |
|
    This concept was proposed in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context".
|
Args: |
|
d_model (int): The dimension of model |
|
num_heads (int): The number of attention heads. |
|
Inputs: query, key, value, pos_embedding, mask |
|
- **query** (batch, time, dim): Tensor containing query vector |
|
- **key** (batch, time, dim): Tensor containing key vector |
|
- **value** (batch, time, dim): Tensor containing value vector |
|
- **pos_embedding** (batch, time, dim): Positional embedding tensor |
|
- **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked |
|
Returns: |
|
        - **outputs**: Tensor produced by the relative multi-head attention module.
|
""" |
|
|
|
def __init__( |
|
self, |
|
d_model: int = 512, |
|
num_heads: int = 16, |
|
): |
|
super().__init__() |
|
assert d_model % num_heads == 0, "d_model % num_heads should be zero." |
|
self.d_model = d_model |
|
self.d_head = int(d_model / num_heads) |
|
self.num_heads = num_heads |
|
self.sqrt_dim = math.sqrt(d_model) |
|
|
|
self.query_proj = nn.Linear(d_model, d_model) |
|
self.key_proj = nn.Linear(d_model, d_model, bias=False) |
|
self.value_proj = nn.Linear(d_model, d_model, bias=False) |
|
self.pos_proj = nn.Linear(d_model, d_model, bias=False) |
|
|
|
self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head)) |
|
self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head)) |
|
torch.nn.init.xavier_uniform_(self.u_bias) |
|
torch.nn.init.xavier_uniform_(self.v_bias) |
|
self.out_proj = nn.Linear(d_model, d_model) |
|
|
|
def forward( |
|
self, |
|
query: torch.Tensor, |
|
key: torch.Tensor, |
|
value: torch.Tensor, |
|
pos_embedding: torch.Tensor, |
|
mask: torch.Tensor, |
|
) -> Tuple[torch.Tensor, torch.Tensor]: |
|
batch_size = query.shape[0] |
|
query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head) |
|
key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) |
|
value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) |
|
pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head) |
|
u_bias = self.u_bias.expand_as(query) |
|
v_bias = self.v_bias.expand_as(query) |
|
        # content-based scores: (query + u) @ key^T, terms (a) + (c) in Transformer-XL
        a = (query + u_bias).transpose(1, 2)
        content_score = a @ key.transpose(2, 3)
        # position-based scores: (query + v) @ pos_embedding^T, terms (b) + (d) in Transformer-XL
        b = (query + v_bias).transpose(1, 2)
        pos_score = b @ pos_embedding.permute(0, 2, 3, 1)
        pos_score = self._relative_shift(pos_score)
|
|
|
score = content_score + pos_score |
|
score = score * (1.0 / self.sqrt_dim) |
|
|
|
score.masked_fill_(mask, -1e9) |
|
|
|
attn = F.softmax(score, -1) |
|
|
|
context = (attn @ value).transpose(1, 2) |
|
context = context.contiguous().view(batch_size, -1, self.d_model) |
|
|
|
return self.out_proj(context), attn |
|
|
|
    def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor:
        """Re-index position scores by relative distance using the Transformer-XL
        pad-reshape-slice trick: prepend a zero column, reshape so the extra column
        shifts the rows, drop the first (invalid) row, and restore the original shape."""
        batch_size, num_heads, seq_length1, seq_length2 = pos_score.size()
        zeros = torch.zeros((batch_size, num_heads, seq_length1, 1), device=pos_score.device)
        padded_pos_score = torch.cat([zeros, pos_score], dim=-1)
        padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
        pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
        return pos_score
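# Worked example of the relative shift for one head and a 3x3 score matrix:
#
#   [[s00, s01, s02],        [[s02,   0, s10],
#    [s10, s11, s12],   -->   [s11, s12,   0],
#    [s20, s21, s22]]         [s20, s21, s22]]
#
# Row i is shifted left by (T - 1 - i), so entry (i, j) now holds the score originally
# computed against position j + (T - 1 - i); the zeros and wrapped values in the upper-right
# corner fall outside the valid relative range and carry no useful positional information.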
|
|
|
|
|
class MultiHeadAttention(nn.Module): |
|
""" |
|
input: |
|
query --- [N, T_q, query_dim] |
|
key --- [N, T_k, key_dim] |
|
output: |
|
out --- [N, T_q, num_units] |
|
""" |
|
|
|
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): |
|
super().__init__() |
|
self.num_units = num_units |
|
self.num_heads = num_heads |
|
self.key_dim = key_dim |
|
|
|
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False) |
|
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) |
|
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) |
|
|
|
def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: |
|
querys = self.W_query(query) |
|
keys = self.W_key(key) |
|
values = self.W_value(key) |
|
        split_size = self.num_units // self.num_heads
        # split the channel dimension into heads and stack them on a new leading dimension:
        # [N, T, num_units] -> [h, N, T, num_units // h]
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)
|
|
|
scores = torch.matmul(querys, keys.transpose(2, 3)) |
|
scores = scores / (self.key_dim**0.5) |
|
scores = F.softmax(scores, dim=3) |
|
|
|
        out = torch.matmul(scores, values)  # [h, N, T_q, num_units // h]
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # merge heads -> [N, T_q, num_units]
|
return out |
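# A minimal usage sketch (illustrative only; the dimensions are assumptions):
#
#   attn = MultiHeadAttention(query_dim=256, key_dim=384, num_units=512, num_heads=8)
#   q = torch.randn(4, 10, 256)   # [N, T_q, query_dim]
#   k = torch.randn(4, 30, 384)   # [N, T_k, key_dim]
#   out = attn(q, k)              # -> [4, 10, 512]  ([N, T_q, num_units])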
|
|