---
license: apache-2.0
base_model:
  - microsoft/wavlm-large
pipeline_tag: audio-to-audio
---

# FocalCodec

A low-bitrate single-codebook 16 kHz speech codec based on focal modulation.


## ▶️ Quickstart

See the README at: https://github.com/lucadellalib/focalcodec
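For convenience, a minimal usage sketch is given below. The package name, `FocalCodec.from_config`, and the `sig_to_toks` / `toks_to_sig` calls are assumptions about the upstream API; the linked GitHub README is the authoritative reference.

```python
# Minimal sketch (API names are assumptions; see the GitHub README for the real usage).
import torch
import torchaudio

import focalcodec  # assumed package name from the linked repository

# Load a pretrained checkpoint (config identifier assumed to mirror this repo's name).
codec = focalcodec.FocalCodec.from_config("lucadellalib/focalcodec_12_5hz", pretrained=True)
codec.eval().requires_grad_(False)

# Read a waveform and resample it to the codec's 16 kHz operating rate.
sig, sample_rate = torchaudio.load("input.wav")
sig = torchaudio.functional.resample(sig, sample_rate, 16000)

with torch.no_grad():
    toks = codec.sig_to_toks(sig)  # discrete tokens, one codebook index per frame
    rec = codec.toks_to_sig(toks)  # reconstructed 16 kHz waveform

torchaudio.save("reconstructed.wav", rec, 16000)
```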


## 📌 Available Checkpoints

| Checkpoint | Token Rate (Hz) | Bitrate (kbps) | Dataset |
|---|---|---|---|
| `LibriTTS960_50Hz` | 50.0 | 0.65 | LibriTTS960 |
| `LibriTTS960_25Hz` | 25.0 | 0.33 | LibriTTS960 |
| `LibriTTS960_12_5Hz` | 12.5 | 0.16 | LibriTTS960 |
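The bitrate column follows directly from the token rate: the single codebook has 8192 entries (see the configs below), i.e. 13 bits per token, so bitrate ≈ token rate × 13 bits. A quick sanity check in Python:

```python
import math

codebook_size = 8192                       # BinarySphericalQuantizer codebook (2**13)
bits_per_token = math.log2(codebook_size)  # 13 bits

for name, token_rate in [
    ("LibriTTS960_50Hz", 50.0),
    ("LibriTTS960_25Hz", 25.0),
    ("LibriTTS960_12_5Hz", 12.5),
]:
    kbps = token_rate * bits_per_token / 1000
    print(f"{name}: {kbps:.3f} kbps")  # 0.650, 0.325, 0.163 (rounded to 0.65/0.33/0.16 above)
```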

## @ Citing

```bibtex
@article{dellalibera2025focalcodec,
    title   = {{FocalCodec}: Low-Bitrate Speech Coding via Focal Modulation Networks},
    author  = {Luca {Della Libera} and Francesco Paissan and Cem Subakan and Mirco Ravanelli},
    journal = {arXiv preprint arXiv:2502.04465},
    year    = {2025},
}
```

## 📧 Contact

[email protected]


## File information

The repository contains the following file information:

Filename: `LibriTTS960_25Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [2, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 1, 2], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```

Filename: `focalcodec.png` (image; contents larger than 50 kB, not shown)

Filename: `LibriTTS960_50Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [1, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```

Filename: `LibriTTS960_12_5Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [2, 2, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 2, 2], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```
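The three configs differ only in the compressor/decompressor scale factors. The WavLM feature-extractor strides multiply to 320, which yields a 50 Hz frame rate at 16 kHz; the compressor's `downscale_factors` then divide that rate, which is how the three token rates in the table above arise. A small sketch of the arithmetic:

```python
from math import prod

SAMPLE_RATE = 16_000
WAVLM_STRIDES = [5, 2, 2, 2, 2, 2, 2]  # encoder_config["strides"]; product = 320

DOWNSCALE_FACTORS = {                  # compressor_config["downscale_factors"] per checkpoint
    "LibriTTS960_50Hz": [1, 1, 1],
    "LibriTTS960_25Hz": [2, 1, 1],
    "LibriTTS960_12_5Hz": [2, 2, 1],
}

base_rate = SAMPLE_RATE / prod(WAVLM_STRIDES)  # 16000 / 320 = 50 Hz
for name, factors in DOWNSCALE_FACTORS.items():
    print(f"{name}: {base_rate / prod(factors):g} Hz")  # 50, 25, 12.5
```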