{
  "encoder_name": "WavLM",
  "encoder_config": {
    "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
    "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
    "strides": [5, 2, 2, 2, 2, 2, 2],
    "num_layers": 6,
    "dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_buckets": 320,
    "max_distance": 800,
    "dropout": 0.0,
    "conv_pos": 128,
    "conv_pos_groups": 16
  },
  "compressor_name": "FocalEncoder",
  "compressor_config": {
    "input_dim": 1024,
    "output_dim": 13,
    "hidden_dims": [1024, 512, 256],
    "downscale_factors": [2, 2, 1],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {
    "codebook_size": 8192
  },
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {
    "input_dim": 13,
    "output_dim": 1024,
    "hidden_dims": [256, 512, 1024],
    "upscale_factors": [1, 2, 2],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "decoder_name": "Vocos",
  "decoder_config": {
    "input_channels": 1024,
    "num_layers": 8,
    "dim": 512,
    "ffn_dim": 1536,
    "kernel_size": 7,
    "padding": 3,
    "layerscale_init": null,
    "n_fft": 1024,
    "hop_length": 320
  }
}