import math
from typing import Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from TTS.tts.layers.delightful_tts.conv_layers import ConvNorm


def initialize_embeddings(shape: Tuple[int, int]) -> torch.Tensor:
    """Initialize a 2-D embedding matrix with values drawn from N(0, 2 / embedding_dim)."""
    assert len(shape) == 2, "Can only initialize 2-D embedding matrices ..."
    return torch.randn(shape) * np.sqrt(2 / shape[1])


def positional_encoding(d_model: int, length: int, device: torch.device) -> torch.Tensor:
    """Compute sinusoidal positional encodings of shape `[1, length, d_model]`."""
    pe = torch.zeros(length, d_model, device=device)
    position = torch.arange(0, length, dtype=torch.float, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * -(math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)
    return pe


class BottleneckLayer(nn.Module):
""" |
|
Bottleneck layer for reducing the dimensionality of a tensor. |
|
|
|
Args: |
|
in_dim: The number of input dimensions. |
|
reduction_factor: The factor by which to reduce the number of dimensions. |
|
norm: The normalization method to use. Can be "weightnorm" or "instancenorm". |
|
non_linearity: The non-linearity to use. Can be "relu" or "leakyrelu". |
|
kernel_size: The size of the convolutional kernel. |
|
use_partial_padding: Whether to use partial padding with the convolutional kernel. |
|
|
|
Shape: |
|
- Input: :math:`[N, in_dim]` where `N` is the batch size and `in_dim` is the number of input dimensions. |
|
|
|
- Output: :math:`[N, out_dim]` where `out_dim` is the number of output dimensions. |
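
    Example (illustrative sketch; the output length assumes the underlying ``ConvNorm``
    keeps the time dimension unchanged with its default padding):

        >>> layer = BottleneckLayer(in_dim=512, reduction_factor=4)
        >>> x = torch.randn(2, 512, 100)  # [batch, in_dim, time]
        >>> layer(x).shape
        torch.Size([2, 128, 100])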
    """

    def __init__(
        self,
        in_dim,
        reduction_factor,
        norm="weightnorm",
        non_linearity="relu",
        kernel_size=3,
        use_partial_padding=False,
    ):
        super(BottleneckLayer, self).__init__()

        self.reduction_factor = reduction_factor
        reduced_dim = int(in_dim / reduction_factor)
        self.out_dim = reduced_dim
        if self.reduction_factor > 1:
            fn = ConvNorm(in_dim, reduced_dim, kernel_size=kernel_size, use_weight_norm=(norm == "weightnorm"))
            if norm == "instancenorm":
                fn = nn.Sequential(fn, nn.InstanceNorm1d(reduced_dim, affine=True))
            # Assign inside the branch: `fn` only exists when a reduction is requested,
            # and `forward()` only uses the projection in that case.
            self.projection_fn = fn

        self.non_linearity = nn.ReLU()
        if non_linearity == "leakyrelu":
            self.non_linearity = nn.LeakyReLU()

    def forward(self, x):
        if self.reduction_factor > 1:
            x = self.projection_fn(x)
            x = self.non_linearity(x)
        return x


class GLUActivation(nn.Module):
"""Class that implements the Gated Linear Unit (GLU) activation function. |
|
|
|
The GLU activation function is a variant of the Leaky ReLU activation function, |
|
where the output of the activation function is gated by an input tensor. |
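
    Example (illustrative; shown with a ``[batch, channels, time]`` input whose channel
    dimension is even):

        >>> glu = GLUActivation(slope=0.3)
        >>> x = torch.randn(2, 8, 50)
        >>> glu(x).shape
        torch.Size([2, 4, 50])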
    """

    def __init__(self, slope: float):
        super().__init__()
        self.lrelu = nn.LeakyReLU(slope)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, gate = x.chunk(2, dim=1)
        x = out * self.lrelu(gate)
        return x


class StyleEmbedAttention(nn.Module):
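    """Multi-head attention over a set of style tokens.

    The query (e.g. a prosody or reference embedding) attends over the soft style
    tokens, and the attention-weighted combination of the token values is returned.

    Example (illustrative; dimensions are arbitrary, but ``num_units`` must be divisible
    by ``num_heads``):

        >>> attn = StyleEmbedAttention(query_dim=192, key_dim=384, num_units=384, num_heads=4)
        >>> query = torch.randn(2, 1, 192)    # [batch, 1, query_dim]
        >>> tokens = torch.randn(2, 32, 384)  # [batch, token_num, key_dim]
        >>> attn(query, tokens).shape
        torch.Size([2, 1, 384])
    """
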
    def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)

    def forward(self, query: torch.Tensor, key_soft: torch.Tensor) -> torch.Tensor:
        values = self.W_value(key_soft)
        split_size = self.num_units // self.num_heads
        # split off the head dimension: [h, N, T_k, num_units // h]
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)

        out_soft = scores_soft = None
        querys = self.W_query(query)  # [N, T_q, num_units]
        keys = self.W_key(key_soft)  # [N, T_k, num_units]

        # [h, N, T_q, num_units // h]
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)
        # [h, N, T_k, num_units // h]
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)

        # scaled dot-product attention scores: [h, N, T_q, T_k]
        scores_soft = torch.matmul(querys, keys.transpose(2, 3))
        scores_soft = scores_soft / (self.key_dim**0.5)
        scores_soft = F.softmax(scores_soft, dim=3)

        # weighted sum over the values, then merge the heads back: [N, T_q, num_units]
        out_soft = torch.matmul(scores_soft, values)
        out_soft = torch.cat(torch.split(out_soft, 1, dim=0), dim=3).squeeze(0)

        return out_soft


class EmbeddingPadded(nn.Module):
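    """An ``nn.Embedding``-style lookup whose ``padding_idx`` row is forced to zero.

    Example (illustrative):

        >>> emb = EmbeddingPadded(num_embeddings=10, embedding_dim=4, padding_idx=0)
        >>> out = emb(torch.tensor([[0, 3, 5]]))
        >>> out.shape
        torch.Size([1, 3, 4])
        >>> bool((out[0, 0] == 0).all())  # the padding entry stays zero
        True
    """
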
    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        super().__init__()
        padding_mult = torch.ones((num_embeddings, 1), dtype=torch.int64)
        padding_mult[padding_idx] = 0
        self.register_buffer("padding_mult", padding_mult)
        self.embeddings = nn.parameter.Parameter(initialize_embeddings((num_embeddings, embedding_dim)))

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        # zero out the padding row before the lookup so the padding index always maps to a zero vector
        embeddings_zeroed = self.embeddings * self.padding_mult
        x = F.embedding(idx, embeddings_zeroed)
        return x


class EmbeddingProjBlock(nn.Module):
    """Residual block of two Linear + LeakyReLU layers applied to an embedding."""

    def __init__(self, embedding_dim: int):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                nn.Linear(embedding_dim, embedding_dim),
                nn.LeakyReLU(0.3),
                nn.Linear(embedding_dim, embedding_dim),
                nn.LeakyReLU(0.3),
            ]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        res = x
        for layer in self.layers:
            x = layer(x)
        x = x + res
        return x


class LinearNorm(nn.Module):
    """Linear layer with Xavier-uniform weight initialization."""

    def __init__(self, in_features: int, out_features: int, bias: bool = False):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)

        nn.init.xavier_uniform_(self.linear.weight)
        if bias:
            nn.init.constant_(self.linear.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear(x)
        return x


class STL(nn.Module):
""" |
|
A PyTorch module for the Style Token Layer (STL) as described in |
|
"A Style-Based Generator Architecture for Generative Adversarial Networks" |
|
(https://arxiv.org/abs/1812.04948) |
|
|
|
The STL applies a multi-headed attention mechanism over the learned style tokens, |
|
using the text input as the query and the style tokens as the keys and values. |
|
The output of the attention mechanism is used as the text's style embedding. |
|
|
|
Args: |
|
token_num (int): The number of style tokens. |
|
n_hidden (int): Number of hidden dimensions. |
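
    Example (illustrative; the layer expects an input of size ``n_hidden // 2`` and
    returns a ``[batch, 1, n_hidden]`` style embedding):

        >>> stl = STL(n_hidden=384, token_num=32)
        >>> x = torch.randn(2, 192)  # [batch, n_hidden // 2]
        >>> stl(x).shape
        torch.Size([2, 1, 384])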
    """

    def __init__(self, n_hidden: int, token_num: int):
        super(STL, self).__init__()

        num_heads = 1
        E = n_hidden
        self.token_num = token_num
        self.embed = nn.Parameter(torch.FloatTensor(self.token_num, E // num_heads))
        d_q = E // 2
        d_k = E // num_heads
        self.attention = StyleEmbedAttention(query_dim=d_q, key_dim=d_k, num_units=E, num_heads=num_heads)

        torch.nn.init.normal_(self.embed, mean=0, std=0.5)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        N = x.size(0)
        query = x.unsqueeze(1)  # [N, 1, n_hidden // 2]

        # soft style tokens shared across the batch: [N, token_num, n_hidden]
        keys_soft = torch.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)

        emotion_embed_soft = self.attention(query, keys_soft)

        return emotion_embed_soft