# Top-level run settings: output directory, intervals, device, and batch limits.
# NOTE(review): the units of save_freq / log_interval / save_interval are not
# stated in this file — confirm against the consuming training script.
log_dir: "runs/run_mel_seed_uvit_xlsr_tiny"
save_freq: 1
log_interval: 10
save_interval: 500
device: "cuda"
epochs: 1000  # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100  # maximum duration of audio in a batch (in seconds)
max_len: 80  # maximum number of frames
pretrained_model: "DiT_uvit_tat_xlsr_ema.pth"
pretrained_encoder: ""  # explicitly an empty string, not null
# Set to true if you do not want to load epoch numbers and optimizer parameters.
load_only_params: false
# Audio preprocessing settings.
# NOTE(review): nesting restored by hand (spect_params placed under
# preprocess_params) — confirm against the code that reads this config.
preprocess_params:
  sr: 22050  # presumably the sample rate in Hz — TODO confirm
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    fmax: 8000
# Model architecture settings.
# NOTE(review): nesting restored by hand. Keys such as `type`, `in_channels`,
# `content_codebook_size`, `f0_condition` and `n_f0_bins` occur more than once,
# so each sub-section below must be its own mapping; that every sub-section
# belongs under model_params is an assumption — confirm against the consumer.
model_params:
  dit_type: "DiT"  # uDiT or DiT
  reg_loss_type: "l1"  # l1 or l2
  diffusion_type: "flow"

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  vocoder:
    type: "hifigan"

  speech_tokenizer:
    type: 'xlsr'
    output_layer: 12
    name: 'facebook/wav2vec2-xls-r-300m'

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  length_regulator:
    channels: 384
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 1024
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 2
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 384
    num_heads: 6
    depth: 9
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'mlp'
    target: 'mel'  # mel or betavae
    content_dim: 384
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: false
    # for prompt component, do not input corresponding speech token
    zero_prompt_speech_token: false
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: true
    add_resblock_in_transformer: false
# Loss/optimization settings.
loss_params:
  base_lr: 0.0001  # presumably the base learning rate — TODO confirm