Spaces:
Running
on
T4
Running
on
T4
File size: 6,643 Bytes
9e275b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# Copyright 2019 Tomoki Hayashi
# MIT License (https://opensource.org/licenses/MIT)
# Adapted by Florian Lux 2021
import torch
from Architectures.GeneralLayers.ConditionalLayerNorm import AdaIN1d
from Architectures.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
from Architectures.GeneralLayers.LayerNorm import LayerNorm
from Utility.utils import integrate_with_utt_embed
class DurationPredictor(torch.nn.Module):
    """
    Duration predictor module.

    This is a module of duration predictor described
    in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
    The duration predictor predicts a duration of each frame in log domain
    from the hidden embeddings of encoder.

    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
        https://arxiv.org/pdf/1905.09263.pdf

    Note:
        The calculation domain of outputs is different
        between in `forward` and in `inference`. In `forward`,
        the outputs are calculated in log domain but in `inference`,
        those are calculated in linear domain.
    """

    def __init__(self, idim,
                 n_layers=2,
                 n_chans=384,
                 kernel_size=3,
                 dropout_rate=0.1,
                 offset=1.0,
                 utt_embed_dim=None,
                 embedding_integration="AdaIN"):
        """
        Initialize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.
            utt_embed_dim (int, optional): Dimension of the utterance embedding.
                If None, no utterance embedding is integrated.
            embedding_integration (str, optional): Which layer integrates the
                utterance embedding: "AdaIN" builds an AdaIN1d,
                "ConditionalLayerNorm" builds a ConditionalLayerNorm, and any
                other value builds a Linear layer over the concatenated
                embedding and hidden dimensions.
        """
        super(DurationPredictor, self).__init__()
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        self.dropouts = torch.nn.ModuleList()
        self.norms = torch.nn.ModuleList()
        self.embedding_projections = torch.nn.ModuleList()
        self.utt_embed_dim = utt_embed_dim
        self.use_conditional_layernorm_embedding_integration = embedding_integration in ["AdaIN", "ConditionalLayerNorm"]

        for idx in range(n_layers):
            # The projections are applied to the output of the convolution,
            # which always has n_chans channels, so they are sized with n_chans
            # (identical to idim in the common case where both are equal).
            if utt_embed_dim is not None:
                if embedding_integration == "AdaIN":
                    self.embedding_projections += [AdaIN1d(style_dim=utt_embed_dim, num_features=n_chans)]
                elif embedding_integration == "ConditionalLayerNorm":
                    self.embedding_projections += [ConditionalLayerNorm(speaker_embedding_dim=utt_embed_dim, hidden_dim=n_chans)]
                else:
                    self.embedding_projections += [torch.nn.Linear(utt_embed_dim + n_chans, n_chans)]
            else:
                # A ModuleList only accepts Module instances, so a parameter-free
                # Identity is used as placeholder. It keeps the list aligned with
                # the other per-layer lists that are zipped together in _forward.
                self.embedding_projections += [torch.nn.Identity()]
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ),
                                              torch.nn.ReLU())]
            self.norms += [LayerNorm(n_chans, dim=1)]
            self.dropouts += [torch.nn.Dropout(dropout_rate)]

        self.linear = torch.nn.Linear(n_chans, 1)

    def _forward(self, xs, x_masks=None, is_inference=False, utt_embed=None):
        """
        Shared implementation of `forward` and `inference`.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks (Tensor, optional): Mask passed to `masked_fill`; positions
                where it is True are set to 0 in training mode (B, Tmax).
            is_inference (bool, optional): If True, return rounded, non-negative
                durations in linear domain instead of log-domain predictions.
            utt_embed (Tensor, optional): Utterance embeddings handed to
                `integrate_with_utt_embed`; only used if `utt_embed_dim` was set.

        Returns:
            Tensor: Log-domain predictions (B, Tmax), or a LongTensor of
                linear-domain durations (B, Tmax) if `is_inference` is True.
        """
        xs = xs.transpose(1, -1)  # (B, idim, Tmax)

        for conv, norm, dropout, projection in zip(self.conv, self.norms, self.dropouts, self.embedding_projections):
            xs = conv(xs)  # (B, C, Tmax)
            if self.utt_embed_dim is not None:
                # the integration helper is fed (B, Tmax, C), so transpose there and back
                xs = integrate_with_utt_embed(hs=xs.transpose(1, 2),
                                              utt_embeddings=utt_embed,
                                              projection=projection,
                                              embedding_training=self.use_conditional_layernorm_embedding_integration).transpose(1, 2)
            xs = norm(xs)
            xs = dropout(xs)

        # NOTE: targets are transformed to log domain in the loss calculation, so this will learn to predict in the log space, which makes the value range easier to handle.
        xs = self.linear(xs.transpose(1, -1)).squeeze(-1)  # (B, Tmax)

        if is_inference:
            # NOTE: since we learned to predict in the log domain, we have to invert the log during inference.
            xs = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long()  # avoid negative value
        elif x_masks is not None:
            # the mask is optional, so only zero out padded positions if one was given
            xs = xs.masked_fill(x_masks, 0.0)

        return xs

    def forward(self, xs, padding_mask=None, utt_embed=None):
        """
        Calculate forward propagation.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            padding_mask (ByteTensor, optional):
                Batch of masks indicating padded part (B, Tmax).
            utt_embed (Tensor, optional): Utterance embeddings, required if
                the module was built with `utt_embed_dim`.

        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, padding_mask, False, utt_embed=utt_embed)

    def inference(self, xs, padding_mask=None, utt_embed=None):
        """
        Inference duration.

        Args:
            xs (Tensor): Batch of input sequences (B, Tmax, idim).
            padding_mask (ByteTensor, optional):
                Batch of masks indicating padded part (B, Tmax).
                It is accepted for interface symmetry, but not applied
                to the output in inference mode.
            utt_embed (Tensor, optional): Utterance embeddings, required if
                the module was built with `utt_embed_dim`.

        Returns:
            LongTensor: Batch of predicted durations in linear domain (B, Tmax).
        """
        return self._forward(xs, padding_mask, True, utt_embed=utt_embed)
class DurationPredictorLoss(torch.nn.Module):
    """
    Mean squared error loss for the duration predictor.

    The ground-truth durations are moved into the log domain before the
    comparison, so that the error is measured in the same domain the
    predictor outputs in.
    """

    def __init__(self, offset=1.0, reduction="mean"):
        """
        Args:
            offset (float, optional): Value added to the targets before the log is taken, to avoid nan in log domain.
            reduction (str): Reduction type handed to the MSE criterion.
        """
        super().__init__()
        self.offset = offset
        self.criterion = torch.nn.MSELoss(reduction=reduction)

    def forward(self, outputs, targets):
        """
        Compute the loss between log-domain predictions and linear-domain targets.

        Args:
            outputs (Tensor): Batch of prediction durations in log domain (B, T)
            targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)

        Returns:
            Tensor: Mean squared error loss value.
        """
        # the predictions already live in log space, so only the targets need converting
        shifted_targets = targets.float() + self.offset
        log_targets = shifted_targets.log()
        return self.criterion(outputs, log_targets)
|