|
import re |
|
import torch |
|
import importlib |
|
import numpy as np |
|
from matplotlib import pyplot as plt |
|
|
|
from TTS.tts.utils.visual import plot_spectrogram |
|
|
|
|
|
def interpolate_vocoder_input(scale_factor, spec):
    """Resize a spectrogram with bilinear interpolation.

    Used to reconcile the sampling rates of the tts and the vocoder
    models. The shape is printed before and after the resize.

    Args:
        scale_factor (float): scale factor handed to
            ``torch.nn.functional.interpolate``.
        spec (np.array): spectrogram to be interpolated.

    Returns:
        torch.tensor: interpolated spectrogram. Two leading axes are added
            before interpolation and only the first one is removed again,
            so a 2D input comes back with one extra leading axis.
    """
    print(" > before interpolation :", spec.shape)

    # ``interpolate`` in bilinear mode needs a 4D input, so prepend two
    # singleton axes in front of the spectrogram.
    batched = torch.tensor(spec)[None, None]
    resized = torch.nn.functional.interpolate(
        batched,
        scale_factor=scale_factor,
        recompute_scale_factor=True,
        mode='bilinear',
        align_corners=False,
    )
    spec = resized.squeeze(0)

    print(" > after interpolation :", spec.shape)
    return spec
|
|
|
|
|
def plot_results(y_hat, y, ap, global_step, name_prefix):
    """Plot vocoder model results.

    Only the first item of ``y_hat`` and ``y`` is visualised.

    Args:
        y_hat: generated waveforms; presumably a (batch, 1, time) torch
            tensor -- confirm against the caller.
        y: ground-truth waveforms, same layout as ``y_hat``.
        ap: object exposing ``melspectrogram(waveform)``.
        global_step: step number shown in the generated-speech title.
        name_prefix (str): prefix prepended to every key of the result.

    Returns:
        dict: the values returned by ``plot_spectrogram`` for the fake,
            real and absolute-difference spectrograms, plus the waveform
            comparison figure, keyed by ``name_prefix`` + figure name.
    """
    # first sample of each batch as a numpy array
    wav_fake = y_hat[0].squeeze(0).detach().cpu().numpy()
    wav_real = y[0].squeeze(0).detach().cpu().numpy()

    # transposed mel spectrograms and their element-wise absolute difference
    mel_fake = ap.melspectrogram(wav_fake).T
    mel_real = ap.melspectrogram(wav_real).T
    mel_diff = np.abs(mel_fake - mel_real)

    # ground truth on the top panel, generated speech on the bottom one
    fig_wave = plt.figure()
    panels = (
        (wav_real, "groundtruth speech"),
        (wav_fake, f"generated speech @ {global_step} steps"),
    )
    for row, (signal, title) in enumerate(panels, start=1):
        plt.subplot(2, 1, row)
        plt.plot(signal)
        plt.title(title)
    plt.tight_layout()
    # closes the current pyplot figure; the Figure object is still returned
    plt.close()

    figures = {}
    figures[name_prefix + "spectrogram/fake"] = plot_spectrogram(mel_fake)
    figures[name_prefix + "spectrogram/real"] = plot_spectrogram(mel_real)
    figures[name_prefix + "spectrogram/diff"] = plot_spectrogram(mel_diff)
    figures[name_prefix + "speech_comparison"] = fig_wave
    return figures
|
|
|
|
|
def to_camel(text):
    """Convert a snake_case name to CamelCase.

    The text is first passed through ``str.capitalize`` (first character
    upper-cased, the rest lower-cased). Then every underscore that is not
    at the very start and is directly followed by an ASCII letter is
    removed and that letter is upper-cased.

    Args:
        text (str): name to convert, e.g. ``"melgan_generator"``.

    Returns:
        str: converted name, e.g. ``"MelganGenerator"``.
    """
    def _upper_letter(match):
        # keep only the letter that followed the underscore, upper-cased
        return match.group(1).upper()

    capitalized = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', _upper_letter, capitalized)
|
|
|
|
|
def setup_wavernn(c):
    """Instantiate the WaveRNN vocoder described by the configuration ``c``.

    The class ``WaveRNN`` is loaded at call time from
    ``TTS.vocoder.models.wavernn``.

    Args:
        c: configuration object providing the attributes ``mode``,
            ``mulaw``, ``padding``, ``audio`` and ``wavernn_model_params``
            (the last two are indexed by key).

    Returns:
        The constructed ``WaveRNN`` instance.
    """
    print(" > Model: WaveRNN")
    wavernn_cls = getattr(
        importlib.import_module("TTS.vocoder.models.wavernn"), "WaveRNN")

    net_params = c.wavernn_model_params
    audio_params = c.audio
    return wavernn_cls(
        rnn_dims=net_params['rnn_dims'],
        fc_dims=net_params['fc_dims'],
        mode=c.mode,
        mulaw=c.mulaw,
        pad=c.padding,
        use_aux_net=net_params['use_aux_net'],
        use_upsample_net=net_params['use_upsample_net'],
        upsample_factors=net_params['upsample_factors'],
        feat_dims=audio_params['num_mels'],
        compute_dims=net_params['compute_dims'],
        res_out_dims=net_params['res_out_dims'],
        num_res_blocks=net_params['num_res_blocks'],
        hop_length=audio_params["hop_length"],
        sample_rate=audio_params["sample_rate"],
    )
|
|
|
|
|
def setup_generator(c):
    """Build the vocoder generator selected by ``c.generator_model``.

    The module ``TTS.vocoder.models.<lower-cased model name>`` is imported,
    the class named ``to_camel(c.generator_model)`` is looked up in it and
    instantiated with arguments read from ``c``.

    Args:
        c: configuration object. It is read by attribute
            (``generator_model``, ``audio``, ``generator_model_params``)
            and, for wavegrad, by key (``c['audio']``,
            ``c['model_params']``).

    Returns:
        The instantiated generator model.

    Raises:
        ValueError: if the model name is not one of the generators this
            function knows how to construct.
    """
    print(" > Generator Model: {}".format(c.generator_model))
    model_name = c.generator_model.lower()
    MyModel = importlib.import_module('TTS.vocoder.models.' + model_name)
    MyModel = getattr(MyModel, to_camel(c.generator_model))

    # The name is compared exactly. A ``name in 'some_literal'`` test on
    # strings is a substring test, so 'melgan_generator' would also match
    # the multiband and fullband branches and construct the model three
    # times.
    if model_name in ('melgan_generator', 'fullband_melgan_generator'):
        # both variants are built with the same arguments
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=1,
            proj_kernel=7,
            base_channels=512,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    elif model_name == 'multiband_melgan_generator':
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=4,
            proj_kernel=7,
            base_channels=384,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    elif model_name == 'parallel_wavegan_generator':
        model = MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'],
            stacks=c.generator_model_params['stacks'],
            res_channels=64,
            gate_channels=128,
            skip_channels=64,
            aux_channels=c.audio['num_mels'],
            dropout=0.0,
            bias=True,
            use_weight_norm=True,
            upsample_factors=c.generator_model_params['upsample_factors'])
    elif model_name == 'wavegrad':
        model = MyModel(
            in_channels=c['audio']['num_mels'],
            out_channels=1,
            use_weight_norm=c['model_params']['use_weight_norm'],
            x_conv_channels=c['model_params']['x_conv_channels'],
            y_conv_channels=c['model_params']['y_conv_channels'],
            dblock_out_channels=c['model_params']['dblock_out_channels'],
            ublock_out_channels=c['model_params']['ublock_out_channels'],
            upsample_factors=c['model_params']['upsample_factors'],
            upsample_dilations=c['model_params']['upsample_dilations'])
    else:
        # Covers 'melgan_fb_generator' too: no constructor arguments are
        # defined for it here. Previously this path failed with an
        # UnboundLocalError on ``return model``.
        raise ValueError(
            "Unsupported generator model: {}".format(c.generator_model))
    return model
|
|
|
|
|
def setup_discriminator(c):
    """Build the vocoder discriminator selected by ``c.discriminator_model``.

    Every name containing ``parallel_wavegan`` is loaded from
    ``TTS.vocoder.models.parallel_wavegan_discriminator``; any other name
    is loaded from the module with the same (lower-cased) name. The class
    is looked up as ``to_camel(<lower-cased model name>)``.

    Args:
        c: configuration object read by attribute (``discriminator_model``,
            ``audio``, ``discriminator_model_params``).

    Returns:
        The instantiated discriminator model.

    Raises:
        ValueError: if the model name is not one of the discriminators
            this function knows how to construct.
    """
    print(" > Discriminator Model: {}".format(c.discriminator_model))
    # Normalise once so that the import, the class lookup and the branch
    # selection all agree on the same spelling of the name.
    model_name = c.discriminator_model.lower()
    if 'parallel_wavegan' in model_name:
        MyModel = importlib.import_module(
            'TTS.vocoder.models.parallel_wavegan_discriminator')
    else:
        MyModel = importlib.import_module('TTS.vocoder.models.' + model_name)
    MyModel = getattr(MyModel, to_camel(model_name))

    # The name is compared exactly: ``name in 'some_literal'`` on strings
    # is a substring test and would accept any fragment of the literal.
    if model_name == 'random_window_discriminator':
        # NOTE(review): 'donwsample' is spelled as in the original keyword
        # and config key -- presumably it matches the model's signature.
        model = MyModel(
            cond_channels=c.audio['num_mels'],
            hop_length=c.audio['hop_length'],
            uncond_disc_donwsample_factors=c.
            discriminator_model_params['uncond_disc_donwsample_factors'],
            cond_disc_downsample_factors=c.
            discriminator_model_params['cond_disc_downsample_factors'],
            cond_disc_out_channels=c.
            discriminator_model_params['cond_disc_out_channels'],
            window_sizes=c.discriminator_model_params['window_sizes'])
    elif model_name == 'melgan_multiscale_discriminator':
        model = MyModel(
            in_channels=1,
            out_channels=1,
            kernel_sizes=(5, 3),
            base_channels=c.discriminator_model_params['base_channels'],
            max_channels=c.discriminator_model_params['max_channels'],
            downsample_factors=c.
            discriminator_model_params['downsample_factors'])
    elif model_name == 'residual_parallel_wavegan_discriminator':
        model = MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_layers=c.discriminator_model_params['num_layers'],
            stacks=c.discriminator_model_params['stacks'],
            res_channels=64,
            gate_channels=128,
            skip_channels=64,
            dropout=0.0,
            bias=True,
            nonlinear_activation="LeakyReLU",
            nonlinear_activation_params={"negative_slope": 0.2},
        )
    elif model_name == 'parallel_wavegan_discriminator':
        model = MyModel(
            in_channels=1,
            out_channels=1,
            kernel_size=3,
            num_layers=c.discriminator_model_params['num_layers'],
            conv_channels=64,
            dilation_factor=1,
            nonlinear_activation="LeakyReLU",
            nonlinear_activation_params={"negative_slope": 0.2},
            bias=True
        )
    else:
        # Previously an unmatched name failed with an UnboundLocalError on
        # ``return model``.
        raise ValueError(
            "Unsupported discriminator model: {}".format(
                c.discriminator_model))
    return model
|
|
|
|
|
|
|
|
|
|
|
|