|
from dataclasses import dataclass, field |
|
from typing import List |
|
|
|
from TTS.tts.configs.shared_configs import BaseTTSConfig |
|
|
|
|
|
@dataclass
class NeuralhmmTTSConfig(BaseTTSConfig):
    """Define parameters for the Neural HMM TTS model.

    Example:

        >>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
        >>> config = NeuralhmmTTSConfig()

    Args:
        model (str):
            Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
        run_eval_steps (int):
            Run evaluation epoch after N steps. If None, waits until training epoch is completed. Defaults to 100.
        save_step (int):
            Save local checkpoint every save_step steps. Defaults to 500.
        plot_step (int):
            Plot training stats on the logger every plot_step steps. Defaults to 1.
        model_param_stats (bool):
            Log model parameters stats on the logger dashboard. Defaults to False.
        force_generate_statistics (bool):
            Force generate mel normalization statistics. Defaults to False.
        mel_statistics_parameter_path (str):
            Path to the mel normalization statistics. If the model doesn't find a file there it will generate
            statistics. Defaults to None.
        num_chars (int):
            Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
        state_per_phone (int):
            Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but here it is upsampled
            by the model's encoder. Defaults to 2.
        encoder_in_out_features (int):
            Channels of encoder input and character embedding tensors. Defaults to 512.
        encoder_n_convolutions (int):
            Number of convolution layers in the encoder. Defaults to 3.
        out_channels (int):
            Channels of the final model output. It must match the spectrogram size. Defaults to 80.
        ar_order (int):
            Autoregressive order of the model. In ablations of Neural HMM it was found that more autoregression,
            while giving more variation, hurts naturalness of the synthesised audio. Defaults to 1.
        sampling_temp (float):
            Variation added to the sample from the latent space of the neural HMM. Defaults to 0.
        deterministic_transition (bool):
            Deterministic duration generation based on duration quantiles as defined in "S. Ronanki, O. Watts,
            S. King, and G. E. Henter, "Median-based generation of synthetic speech durations using a
            non-parametric approach," in Proc. SLT, 2016.". Defaults to True.
        duration_threshold (float):
            Threshold for duration quantiles. Tune this to change the speaking rate of the synthesis, where lower
            values define a slower speaking rate and higher values define a faster speaking rate. Defaults to 0.43.
        use_grad_checkpointing (bool):
            Use gradient checkpointing to save memory. In a multi-GPU setting currently pytorch does not support
            gradient checkpointing inside a loop, so it has to be turned off then. Adjust depending on whichever
            gets more batch size, either a single GPU or multi-GPU. Defaults to True.
        max_sampling_time (int):
            Maximum sampling time while synthesising latents from the neural HMM. Defaults to 1000.
        prenet_type (str):
            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
            Prenet. Defaults to `original`.
        prenet_dim (int):
            Dimension of the Prenet. Defaults to 256.
        prenet_n_layers (int):
            Number of layers in the Prenet. Defaults to 2.
        prenet_dropout (float):
            Dropout rate of the Prenet. Defaults to 0.5.
        prenet_dropout_at_inference (bool):
            Use dropout at inference time. Defaults to True.
        memory_rnn_dim (int):
            Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
        outputnet_size (list[int]):
            Size of the output network inside the neural HMM. Defaults to [1024].
        flat_start_params (dict):
            Parameters for the flat start initialization of the neural HMM. Defaults to
            `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`. It will be recomputed when you pass the dataset.
        std_floor (float):
            Floor value for the standard deviation of the neural HMM. Prevents the model from cheating by putting
            point mass and getting infinite likelihood at any datapoint. It is called `variance flooring` in
            standard HMM literature. Defaults to 0.001.
        optimizer (str):
            Optimizer to use for training. Defaults to `Adam`.
        optimizer_params (dict):
            Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
        grad_clip (float):
            Gradient clipping threshold. Defaults to 40_000.
        lr (float):
            Learning rate. Defaults to 1e-3.
        lr_scheduler (str):
            Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
            `TTS.utils.training`. Defaults to `None`.
        min_text_len (int):
            Minimum input text length to be used at training. Defaults to 10.
        max_text_len (int):
            Maximum input text length to be used at training. Larger values result in more VRAM usage.
            Defaults to 500.
        min_audio_len (int):
            Minimum input audio length to be used at training. Defaults to 512.
        test_sentences (list[str]):
            Sentences synthesised for evaluation/logging.
        r (int):
            Reduction factor; number of frames produced per decoder step. Defaults to 1.
        use_d_vector_file (bool):
            Whether speaker embeddings are loaded from an external d-vector file. Defaults to False.
        use_speaker_embedding (bool):
            Whether a speaker embedding layer is used. Defaults to False.
    """

    model: str = "NeuralHMM_TTS"

    # training params
    run_eval_steps: int = 100
    save_step: int = 500
    plot_step: int = 1
    model_param_stats: bool = False

    # data parameters
    force_generate_statistics: bool = False
    mel_statistics_parameter_path: str = None

    # Encoder parameters
    num_chars: int = None
    state_per_phone: int = 2
    encoder_in_out_features: int = 512
    encoder_n_convolutions: int = 3

    # HMM parameters
    out_channels: int = 80
    ar_order: int = 1
    sampling_temp: float = 0
    deterministic_transition: bool = True
    duration_threshold: float = 0.43
    use_grad_checkpointing: bool = True
    max_sampling_time: int = 1000

    # Prenet parameters
    prenet_type: str = "original"
    prenet_dim: int = 256
    prenet_n_layers: int = 2
    prenet_dropout: float = 0.5
    prenet_dropout_at_inference: bool = True
    memory_rnn_dim: int = 1024

    # Extra needed config
    outputnet_size: List[int] = field(default_factory=lambda: [1024])
    flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
    std_floor: float = 0.001

    # optimizer parameters
    optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
    grad_clip: float = 40000.0
    lr: float = 1e-3
    lr_scheduler: str = None

    # dataset filtering
    min_text_len: int = 10
    max_text_len: int = 500
    min_audio_len: int = 512

    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
            "Be a voice, not an echo.",
        ]
    )

    # multi-speaker settings
    r: int = 1
    use_d_vector_file: bool = False
    use_speaker_embedding: bool = False

    def check_values(self):
        """Validate the hyperparameters.

        Raises:
            AssertionError: when the output (parameter) network has no layers
            AssertionError: when the AR order is not positive
            AssertionError: when the flat-start transition probability is not between 0 and 1
        """
        assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
        assert (
            len(self.outputnet_size) >= 1
        ), f"Parameter Network must have at least one layer check the config file for parameter network. Provided: {self.outputnet_size}"
        assert (
            0 < self.flat_start_params["transition_p"] < 1
        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
|
|