pheme_small / config_s2a.json
pfb30's picture
fix naming
2be2c9c
{
  "saving_path": "/home/ubuntu/experiments/a2s_giga2",
  "resume_checkpoint": null,
  "vocoder_type": "SPEECHTOKENIZER",
  "vocoder_config_path": null,
  "vocoder_ckpt_path": null,
  "metapath": [
    "/home/ubuntu/data/poly/giga-training-data/train.json"
  ],
  "val_metapath": [
    "/home/ubuntu/data/poly/giga-training-data/dev.json"
  ],
  "pretrained_path": null,
  "speaker_embedding_dir": null,
  "sampledir": "/home/ubuntu/experiments/a2s_giga2",
  "lr": 0.0005,
  "batch_size": 400.0,
  "train_bucket_size": 8192,
  "training_step": 800000,
  "optim_flat_percent": 0.0,
  "warmup_step": 10000,
  "adam_beta1": 0.9,
  "adam_beta2": 0.98,
  "ffd_size": 1024,
  "hidden_size": 768,
  "enc_nlayers": 3,
  "dec_nlayers": 6,
  "nheads": 8,
  "dropout": 0.1,
  "depthwise_conv_kernel_size": 5,
  "aligner_softmax_temp": 1.0,
  "layer_norm_eps": 1e-05,
  "use_sem_tokens": true,
  "use_spkr_emb": true,
  "use_text_emb": false,
  "fairseq": false,
  "only_inference": false,
  "speaker_embed_dropout": 0.05,
  "label_smoothing": 0.0,
  "val_check_interval": 1,
  "max_dataset_samples": -1,
  "check_val_every_n_epoch": 1,
  "precision": "bf16",
  "nworkers": 12,
  "distributed": true,
  "accelerator": "gpu",
  "version": null,
  "accumulate_grad_batches": 1,
  "sagemaker": false,
  "use_repetition_token": false,
  "use_repetition_gating": false,
  "repetition_penalty": 1.0,
  "sampling_temperature": 1.0,
  "top_k": -1,
  "min_top_k": 3,
  "top_p": 0.8,
  "sample_num": 4,
  "length_penalty_max_length": 150,
  "length_penalty_max_prob": 0.95,
  "max_input_length": 2048,
  "max_output_length": 2000,
  "phone_context_window": 3,
  "sample_rate": 16000,
  "n_codes": 1024,
  "n_cluster_groups": 7,
  "first_n_lvls": 7,
  "use_pretrained_ckpt_cfg": false,
  "n_semantic_codes": 1024
}