{
  "resblock": "1",
  "num_gpus": 3,
  "batch_size": 60,
  "learning_rate": 0.0001,
  "adam_b1": 0.5,
  "adam_b2": 0.9,
  "lr_decay": 0.98,
  "seed": 1234,
  "lambda_distill": 0.15,
  "n_filters": 64,
  "strides": [8, 5, 4, 2],
  "dimension": 1024,
  "semantic_dimension": 768,
  "bidirectional": true,
  "dilation_base": 2,
  "residual_kernel_size": 3,
  "n_residual_layers": 1,
  "lstm_layers": 2,
  "activation": "ELU",
  "segment_size": 48000,
  "num_mels": 80,
  "num_freq": 1025,
  "n_fft": 1024,
  "hop_size": 240,
  "win_size": 1024,
  "sampling_rate": 16000,
  "sample_rate": 16000,
  "codebook_size": 1024,
  "n_q": 8,
  "fmin": 0,
  "fmax": 8000,
  "fmax_for_loss": null,
  "num_workers": 12,
  "dist_config": {
    "dist_backend": "nccl",
    "dist_url": "tcp://localhost:54322",
    "world_size": 1
  }
}