{
  "model_params": {
    "decoder": {
      "resblock_dilation_sizes": [
        [1, 3, 5],
        [1, 3, 5],
        [1, 3, 5]
      ],
      "resblock_kernel_sizes": [3, 7, 11],
      "type": "hifigan",
      "upsample_initial_channel": 512,
      "upsample_kernel_sizes": [20, 10, 6, 4],
      "upsample_rates": [10, 5, 3, 2]
    },
    "diffusion": {
      "dist": {
        "estimate_sigma_data": true,
        "mean": -3.0,
        "sigma_data": 0.2,
        "std": 1.0
      },
      "embedding_mask_proba": 0.1,
      "transformer": {
        "head_features": 64,
        "multiplier": 2,
        "num_heads": 8,
        "num_layers": 3
      }
    },
    "dim_in": 64,
    "dropout": 0.2,
    "hidden_dim": 512,
    "max_conv_dim": 512,
    "max_dur": 50,
    "multispeaker": false,
    "n_layer": 3,
    "n_mels": 80,
    "n_token": 178,
    "slm": {
      "hidden": 768,
      "initial_channel": 64,
      "model": "microsoft/wavlm-base-plus",
      "nlayers": 13,
      "sr": 16000
    },
    "style_dim": 128
  },
  "training_config": {
    "epochs": 4,
    "batch_size": 2,
    "max_len": 120,
    "optimizer": {
      "bert_lr": 1e-05,
      "ft_lr": 0.0001,
      "lr": 0.0001
    },
    "loss_params": {
      "diff_epoch": 1,
      "joint_epoch": 110,
      "lambda_F0": 1.0,
      "lambda_ce": 20.0,
      "lambda_diff": 1.0,
      "lambda_dur": 1.0,
      "lambda_gen": 1.0,
      "lambda_mel": 5.0,
      "lambda_mono": 1.0,
      "lambda_norm": 1.0,
      "lambda_s2s": 1.0,
      "lambda_slm": 1.0,
      "lambda_sty": 1.0
    }
  },
  "preprocess_params": {
    "spect_params": {
      "hop_length": 300,
      "n_fft": 2048,
      "win_length": 1200
    },
    "sr": 24000
  },
  "data_params": {
    "OOD_data": "Data/OOD_texts.txt",
    "min_length": 50,
    "root_path": "Data/wavs",
    "train_data": "Data/train_list.txt",
    "val_data": "Data/val_list.txt"
  },
  "model_state": {
    "epoch": 3,
    "iterations": 310,
    "val_loss": 0.41642701625823975
  },
  "training_metrics": {
    "train_loss": [],
    "val_loss": [18.0, 38.0, 58.0, 15.0],
    "dur_loss": [0.458, 0.444, 0.428, 0.416],
    "F0_loss": [1.186, 1.157, 1.089, 1.198],
    "epochs": [1, 2, 3, 4]
  }
}