|
import torch |
|
import torch.nn.functional as F |
|
from torch import nn |
|
from torch.nn.utils import spectral_norm |
|
from torch.nn.utils.parametrizations import weight_norm |
|
|
|
from TTS.utils.audio.torch_transforms import TorchSTFT |
|
from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator |
|
|
|
# Negative slope handed to ``F.leaky_relu`` after every hidden convolution in
# ``SpecDiscriminator.forward``.
LRELU_SLOPE = 0.1
|
|
|
|
|
class SpecDiscriminator(nn.Module):
    """Spectrogram-domain discriminator built from a stack of 2D convolutions.

    The input is transformed by ``TorchSTFT`` (with gradient tracking disabled),
    given a channel axis, and passed through five normalized ``Conv2d`` layers,
    each followed by a leaky ReLU, and then a final single-channel ``Conv2d``.

    Args:
        fft_size (int): First positional argument given to ``TorchSTFT``; also
            stored as ``self.fft_size``.
        hop_length (int): Second positional argument given to ``TorchSTFT``; also
            stored as ``self.hop_length``.
        win_length (int): Third positional argument given to ``TorchSTFT``; also
            stored as ``self.win_length``.
        use_spectral_norm: When truthy, every convolution is wrapped with
            ``spectral_norm``; otherwise with ``weight_norm``.
    """

    def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False):
        super().__init__()
        # Truthiness instead of ``is False``: an identity test sends falsy
        # non-bool values (0, None, numpy.False_) down the spectral-norm branch.
        # ``True``, ``False`` and non-empty strings select the same branch as before.
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.fft_size = fft_size
        self.hop_length = hop_length
        self.win_length = win_length
        self.stft = TorchSTFT(fft_size, hop_length, win_length)
        # The three middle layers use stride 2 along the last axis only; the
        # first and last hidden layers keep the spatial size (stride 1).
        self.discriminators = nn.ModuleList(
            [
                norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
                norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
                norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
                norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
                norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
            ]
        )

        # Final projection to a single output channel.
        self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))

    def forward(self, y):
        """Score an input and collect the intermediate feature maps.

        Args:
            y (Tensor): Input signal. Dimension 1 is squeezed before the STFT, so
                this presumably expects a waveform shaped ``[B, 1, T]`` — TODO
                confirm against callers.

        Returns:
            Tuple[Tensor, List[Tensor]]: the final convolution output flattened
            from dimension 1 onward, and a list holding the activation after each
            hidden layer followed by the unflattened final output.
        """
        fmap = []
        # NOTE(review): the STFT runs with gradient tracking disabled, so nothing
        # computed from this module's output back-propagates into ``y``. Confirm
        # that this is intended before relying on it for generator updates.
        with torch.no_grad():
            y = y.squeeze(1)
            y = self.stft(y)
        # Add the single input channel expected by the first Conv2d.
        y = y.unsqueeze(1)
        for layer in self.discriminators:
            y = F.leaky_relu(layer(y), LRELU_SLOPE)
            fmap.append(y)

        y = self.out(y)
        fmap.append(y)

        return torch.flatten(y, 1, -1), fmap
|
|
|
|
|
class MultiResSpecDiscriminator(torch.nn.Module):
    """Bank of ``SpecDiscriminator`` modules, one per STFT resolution.

    One sub-discriminator is built for every ``(fft_size, hop_size, win_length)``
    triple obtained by zipping the three sequences, so any number of resolutions
    may be supplied (the defaults give three).

    Args:
        fft_sizes (Sequence[int]): FFT size of each resolution.
        hop_sizes (Sequence[int]): Hop size of each resolution.
        win_lengths (Sequence[int]): Window length of each resolution.
        window (str): Forwarded positionally as the fourth argument of
            ``SpecDiscriminator`` (see the review note in ``__init__``).
    """

    def __init__(
        self,
        fft_sizes=(1024, 2048, 512),
        hop_sizes=(120, 240, 50),
        win_lengths=(600, 1200, 240),
        window="hann_window",
    ):
        super().__init__()
        # Defaults are tuples rather than lists so no mutable object is shared
        # between calls; they are only ever read.
        #
        # NOTE(review): ``window`` lands in the fourth positional parameter of
        # ``SpecDiscriminator``, which that class declares as
        # ``use_spectral_norm``. The default "hann_window" therefore selects
        # spectral norm for every sub-discriminator. The call is kept unchanged
        # because rerouting it would swap the normalization wrapped around each
        # convolution; confirm the intent before changing it.
        self.discriminators = nn.ModuleList(
            [
                SpecDiscriminator(fft_size, hop_size, win_length, window)
                for fft_size, hop_size, win_length in zip(fft_sizes, hop_sizes, win_lengths)
            ]
        )

    def forward(self, x):
        """Run every sub-discriminator on the same input.

        Args:
            x: Input passed unchanged to each sub-discriminator.

        Returns:
            Tuple[list, list]: the per-discriminator scores and the
            per-discriminator feature-map collections, both in the order of
            ``self.discriminators``.
        """
        outputs = [disc(x) for disc in self.discriminators]
        scores = [score for score, _ in outputs]
        feats = [feat for _, feat in outputs]
        return scores, feats
|
|
|
|
|
class UnivnetDiscriminator(nn.Module):
    """Combined Univnet discriminator.

    Holds a ``MultiPeriodDiscriminator`` (``self.mpd``) and a
    ``MultiResSpecDiscriminator`` (``self.msd``) and evaluates both on the same
    input.
    """

    def __init__(self):
        super().__init__()
        self.mpd = MultiPeriodDiscriminator()
        self.msd = MultiResSpecDiscriminator()

    def forward(self, x):
        """Evaluate both discriminator families and join their outputs.

        Args:
            x (Tensor): input waveform.

        Returns:
            List[Tensor]: discriminator scores, MPD results first, then MSD.
            List[List[Tensor]]: per-discriminator lists of layer features, in
                the same order as the scores.
        """
        period_scores, period_feats = self.mpd(x)
        spec_scores, spec_feats = self.msd(x)
        all_scores = period_scores + spec_scores
        all_feats = period_feats + spec_feats
        return all_scores, all_feats
|
|