---
# This is the configuration file for the LibriTTS dataset.
# This configuration is based on the StyleMelGAN paper but
# uses MSE loss instead of hinge loss. I found that
# batch_size = 8 also works well, so if you want to
# accelerate training, you can reduce the batch size
# (e.g. 8 or 16). The upsampling scales are modified
# to fit the shift size (hop_size) of 300 points.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# Sampling rate.
sampling_rate: 24000
# FFT size.
fft_size: 2048
# Hop size.
hop_size: 300
# Window length. If set to null, it will be the same as fft_size.
win_length: 1200
# Window function.
window: 'hann'
# Number of mel basis.
num_mels: 80
# Minimum frequency in mel basis calculation.
fmin: 80
# Maximum frequency in mel basis calculation.
fmax: 7600
# Gain multiplied into every waveform.
global_gain_scale: 1.0
# Whether to trim silence at the start and end.
trim_silence: false
# Trimming threshold; needs careful tuning if the recording is not good.
trim_threshold_in_db: 60
# Frame size in trimming.
trim_frame_size: 1024
# Hop size in trimming.
trim_hop_size: 256
# Feature file format. "npy" or "hdf5" is supported.
format: 'hdf5'
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_type: "StyleMelGANGenerator"  # Generator type.
# Settings for the generator named by generator_type. The keys below must stay
# nested under generator_params: written at column 0 they become unrelated
# top-level keys, generator_params itself loads as null, and names such as
# `bias` and `use_weight_norm` collide with the discriminator's.
generator_params:
  in_channels: 128
  aux_channels: 80  # Same value as num_mels; presumably must match - confirm.
  channels: 64
  out_channels: 1
  kernel_size: 9
  dilation: 2
  bias: true
  # Product is 10 * 2 * 2 * 2 = 80, which equals batch_max_steps / hop_size
  # (24000 / 300). NOTE(review): presumably required to match - confirm.
  noise_upsample_scales: [10, 2, 2, 2]
  noise_upsample_activation: "LeakyReLU"
  noise_upsample_activation_params:
    negative_slope: 0.2
  # Product is 5 * 5 * 3 * 2 * 2 = 300, chosen to fit hop_size (300).
  upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
  upsample_mode: "nearest"
  gated_function: "softmax"
  use_weight_norm: true
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_type: "StyleMelGANDiscriminator"  # Discriminator type.
# Settings for the discriminator named by discriminator_type. The inner
# `discriminator_params` mapping must stay nested: written at column 0 it is a
# duplicate top-level key, and most YAML loaders silently keep only the last
# occurrence, dropping repeats / window_sizes / pqmf_params.
discriminator_params:
  repeats: 4
  window_sizes: [512, 1024, 2048, 4096]
  # One row per entry of window_sizes.
  # NOTE(review): columns are presumably the PQMF constructor arguments
  # (first value = number of subbands) - confirm against the implementation.
  # `null` is used for the unused slots because YAML has no `None` literal;
  # a bare `None` would load as the string "None".
  pqmf_params:
    - [1, null, null, null]
    - [2, 62, 0.26700, 9.0]
    - [4, 62, 0.14200, 9.0]
    - [8, 62, 0.07949, 9.0]
  # Settings shared by the underlying per-window discriminators.
  discriminator_params:
    out_channels: 1
    kernel_sizes: [5, 3]
    channels: 16
    max_downsample_channels: 512
    bias: true
    downsample_scales: [4, 4, 4, 1]
    nonlinear_activation: "LeakyReLU"
    nonlinear_activation_params:
      negative_slope: 0.2
  # NOTE(review): placed at the outer level, mirroring generator_params;
  # confirm it is not meant for the inner discriminator_params mapping.
  use_weight_norm: true
###########################################################
# STFT LOSS SETTING #
###########################################################
# The four keys below must stay nested under stft_loss_params. Written at
# column 0, stft_loss_params loads as null and `window: "hann_window"` becomes
# a duplicate of the feature-extraction `window` key, overriding "hann".
stft_loss_params:
  fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
  hop_sizes: [120, 240, 50]  # List of hop size for STFT-based loss.
  win_lengths: [600, 1200, 240]  # List of window length for STFT-based loss.
  window: "hann_window"  # Window function for STFT-based loss.
lambda_aux: 1.0  # Loss balancing coefficient for aux loss.
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 1.0  # Loss balancing coefficient for adv loss.
# Each *_adv_loss_params mapping owns its own average_by_discriminators flag.
# Written at column 0 both mappings load as null and the flag becomes a
# duplicated top-level key.
generator_adv_loss_params:
  average_by_discriminators: false  # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
  average_by_discriminators: false  # Whether to average loss by #discriminators.
###########################################################
# DATA LOADER SETTING #
###########################################################
# Batch size.
batch_size: 32
# Length of each audio in batch. Make sure it is divisible by hop_size.
batch_max_steps: 24000
# Whether to pin memory in PyTorch DataLoader.
pin_memory: true
# Number of workers in PyTorch DataLoader.
num_workers: 2
# Whether to remove samples shorter than batch_max_steps.
remove_short_samples: false
# Whether to allow cache in dataset. If true, it requires CPU memory.
allow_cache: false
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
# Generator and discriminator each get their own optimizer and scheduler
# mappings. The nesting matters: written at column 0, `lr`, `betas`,
# `weight_decay`, `gamma` and `milestones` become duplicated top-level keys
# and every *_params mapping loads as null.
generator_optimizer_type: Adam
generator_optimizer_params:
  lr: 1.0e-4
  betas: [0.5, 0.9]
  weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
  gamma: 0.5  # Learning rate is multiplied by this at each milestone.
  milestones:
    - 100000
    - 300000
    - 500000
    - 700000
    - 900000
# NOTE(review): -1 presumably disables gradient-norm clipping - confirm.
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
  lr: 2.0e-4
  betas: [0.5, 0.9]
  weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
  gamma: 0.5  # Learning rate is multiplied by this at each milestone.
  milestones:
    - 200000
    - 400000
    - 600000
    - 800000
# NOTE(review): -1 presumably disables gradient-norm clipping - confirm.
discriminator_grad_norm: -1
###########################################################
# INTERVAL SETTING #
###########################################################
# Number of steps to start to train discriminator.
discriminator_train_start_steps: 100000
# Number of training steps.
train_max_steps: 1500000
# Interval steps to save checkpoint.
save_interval_steps: 50000
# Interval steps to evaluate the network.
eval_interval_steps: 1000
# Interval steps to record the training log.
log_interval_steps: 100
###########################################################
# OTHER SETTING #
###########################################################
# Number of results to be saved as intermediate results.
num_save_intermediate_results: 4