--- license: apache-2.0 base_model: - microsoft/wavlm-large pipeline_tag: audio-to-audio --- # FocalCodec A low-bitrate single-codebook 16 kHz speech codec based on [focal modulation](https://arxiv.org/abs/2203.11926). - **Preprint**: https://arxiv.org/abs/2502.04465 - **Project Page**: https://lucadellalib.github.io/focalcodec-web/ - **GitHub**: https://github.com/lucadellalib/focalcodec --------------------------------------------------------------------------------------------------------- ## ▶️ Quickstart See the README at: https://github.com/lucadellalib/focalcodec --------------------------------------------------------------------------------------------------------- ## 📌 Available Checkpoints | Checkpoint | Token Rate (Hz) | Bitrate (kbps) | Dataset | |:-----------------------:|:---------------:|:--------------:|:-----------:| | **LibriTTS960_50Hz** | 50.0 | 0.65 | LibriTTS960 | | **LibriTTS960_25Hz** | 25.0 | 0.33 | LibriTTS960 | | **LibriTTS960_12_5Hz** | 12.5 | 0.16 | LibriTTS960 | --------------------------------------------------------------------------------------------------------- ## @ Citing ``` @article{dellalibera2025focalcodec, title = {{FocalCodec}: Low-Bitrate Speech Coding via Focal Modulation Networks}, author = {Luca {Della Libera} and Francesco Paissan and Cem Subakan and Mirco Ravanelli}, journal = {arXiv preprint arXiv:2502.04465}, year = {2025}, } ``` --------------------------------------------------------------------------------------------------------- ## 📧 Contact [luca.dellalib@gmail.com](mailto:luca.dellalib@gmail.com) --------------------------------------------------------------------------------------------------------- # File information The repository contains the following files: Filename: LibriTTS960_25Hz.json Content: { "encoder_name": "WavLM", "encoder_config": { "hidden_dims": [ 512, 512, 512, 512, 512, 512, 512 ], "kernel_sizes": [ 10, 3, 3, 3, 3, 2, 2 ], "strides": [ 5, 2, 2, 2, 2, 2, 2 ], "num_layers": 6, 
"dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16 }, "compressor_name": "FocalEncoder", "compressor_config": { "input_dim": 1024, "output_dim": 13, "hidden_dims": [ 1024, 512, 256 ], "downscale_factors": [ 2, 1, 1 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "quantizer_name": "BinarySphericalQuantizer", "quantizer_config": { "codebook_size": 8192 }, "decompressor_name": "FocalDecoder", "decompressor_config": { "input_dim": 13, "output_dim": 1024, "hidden_dims": [ 256, 512, 1024 ], "upscale_factors": [ 1, 1, 2 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "decoder_name": "Vocos", "decoder_config": { "input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320 } } Filename: focalcodec.png Content: "Content of the file is larger than 50 KB, too long to display." 
Filename: LibriTTS960_50Hz.json Content: { "encoder_name": "WavLM", "encoder_config": { "hidden_dims": [ 512, 512, 512, 512, 512, 512, 512 ], "kernel_sizes": [ 10, 3, 3, 3, 3, 2, 2 ], "strides": [ 5, 2, 2, 2, 2, 2, 2 ], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16 }, "compressor_name": "FocalEncoder", "compressor_config": { "input_dim": 1024, "output_dim": 13, "hidden_dims": [ 1024, 512, 256 ], "downscale_factors": [ 1, 1, 1 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "quantizer_name": "BinarySphericalQuantizer", "quantizer_config": { "codebook_size": 8192 }, "decompressor_name": "FocalDecoder", "decompressor_config": { "input_dim": 13, "output_dim": 1024, "hidden_dims": [ 256, 512, 1024 ], "upscale_factors": [ 1, 1, 1 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "decoder_name": "Vocos", "decoder_config": { "input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320 } } Filename: LibriTTS960_12_5Hz.json Content: { "encoder_name": "WavLM", "encoder_config": { "hidden_dims": [ 512, 512, 512, 512, 512, 512, 512 ], "kernel_sizes": [ 10, 3, 3, 3, 3, 2, 2 ], "strides": [ 5, 2, 2, 2, 2, 2, 2 ], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16 }, "compressor_name": "FocalEncoder", "compressor_config": { "input_dim": 1024, "output_dim": 13, "hidden_dims": [ 1024, 512, 256 ], "downscale_factors": [ 2, 2, 1 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, 
"use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "quantizer_name": "BinarySphericalQuantizer", "quantizer_config": { "codebook_size": 8192 }, "decompressor_name": "FocalDecoder", "decompressor_config": { "input_dim": 13, "output_dim": 1024, "hidden_dims": [ 256, 512, 1024 ], "upscale_factors": [ 1, 2, 2 ], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false }, "decoder_name": "Vocos", "decoder_config": { "input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320 } }