---
license: apache-2.0
base_model:
  - microsoft/wavlm-large
pipeline_tag: audio-to-audio
---

# FocalCodec

A low-bitrate single-codebook 16 kHz speech codec based on focal modulation.


## ▶️ Quickstart

See the README at: https://github.com/lucadellalib/focalcodec
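For convenience, a minimal usage sketch is given below. The package name, `FocalCodec.from_config`, and the `sig_to_toks` / `toks_to_sig` calls are assumptions about the upstream API; the linked GitHub README is the authoritative reference.

```python
# Minimal sketch (API names are assumptions; see the GitHub README for the real usage).
import torch
import torchaudio

import focalcodec  # assumed package name from the linked repository

# Load a pretrained checkpoint (config identifier assumed to mirror this repo's name).
codec = focalcodec.FocalCodec.from_config("lucadellalib/focalcodec_12_5hz", pretrained=True)
codec.eval().requires_grad_(False)

# Read a waveform and resample it to the codec's 16 kHz operating rate.
sig, sample_rate = torchaudio.load("input.wav")
sig = torchaudio.functional.resample(sig, sample_rate, 16000)

with torch.no_grad():
    toks = codec.sig_to_toks(sig)  # discrete tokens, one codebook index per frame
    rec = codec.toks_to_sig(toks)  # reconstructed 16 kHz waveform

torchaudio.save("reconstructed.wav", rec, 16000)
```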


## 📌 Available Checkpoints

| Checkpoint | Token Rate (Hz) | Bitrate (kbps) | Dataset |
|---|---|---|---|
| `LibriTTS960_50Hz` | 50.0 | 0.65 | LibriTTS960 |
| `LibriTTS960_25Hz` | 25.0 | 0.33 | LibriTTS960 |
| `LibriTTS960_12_5Hz` | 12.5 | 0.16 | LibriTTS960 |
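The bitrate column follows directly from the token rate: the single codebook has 8192 entries (see the configs below), i.e. 13 bits per token, so bitrate ≈ token rate × 13 bits. A quick sanity check in Python:

```python
import math

codebook_size = 8192                       # BinarySphericalQuantizer codebook (2**13)
bits_per_token = math.log2(codebook_size)  # 13 bits

for name, token_rate in [
    ("LibriTTS960_50Hz", 50.0),
    ("LibriTTS960_25Hz", 25.0),
    ("LibriTTS960_12_5Hz", 12.5),
]:
    kbps = token_rate * bits_per_token / 1000
    print(f"{name}: {kbps:.3f} kbps")  # 0.650, 0.325, 0.163 (rounded to 0.65/0.33/0.16 above)
```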

## @ Citing

```bibtex
@article{dellalibera2025focalcodec,
    title   = {{FocalCodec}: Low-Bitrate Speech Coding via Focal Modulation Networks},
    author  = {Luca {Della Libera} and Francesco Paissan and Cem Subakan and Mirco Ravanelli},
    journal = {arXiv preprint arXiv:2502.04465},
    year    = {2025},
}
```

## 📧 Contact

[email protected]


## File information

The repository contains the following file information:

Filename: `LibriTTS960_25Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [2, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 1, 2], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```

Filename: `focalcodec.png` (image; contents larger than 50 kB, not shown)

Filename: `LibriTTS960_50Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [1, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 1, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```

Filename: `LibriTTS960_12_5Hz.json`

```json
{
  "encoder_name": "WavLM",
  "encoder_config": {"hidden_dims": [512, 512, 512, 512, 512, 512, 512], "kernel_sizes": [10, 3, 3, 3, 3, 2, 2], "strides": [5, 2, 2, 2, 2, 2, 2], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16},
  "compressor_name": "FocalEncoder",
  "compressor_config": {"input_dim": 1024, "output_dim": 13, "hidden_dims": [1024, 512, 256], "downscale_factors": [2, 2, 1], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {"codebook_size": 8192},
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {"input_dim": 13, "output_dim": 1024, "hidden_dims": [256, 512, 1024], "upscale_factors": [1, 2, 2], "focal_window": 7, "focal_level": 2, "focal_factor": 2, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "normalize_modulator": false},
  "decoder_name": "Vocos",
  "decoder_config": {"input_channels": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "padding": 3, "layerscale_init": null, "n_fft": 1024, "hop_length": 320}
}
```
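The three configs differ only in the compressor/decompressor scale factors. The WavLM feature-extractor strides multiply to 320, which yields a 50 Hz frame rate at 16 kHz; the compressor's `downscale_factors` then divide that rate, which is how the three token rates in the table above arise. A small sketch of the arithmetic:

```python
from math import prod

SAMPLE_RATE = 16_000
WAVLM_STRIDES = [5, 2, 2, 2, 2, 2, 2]  # encoder_config["strides"]; product = 320

DOWNSCALE_FACTORS = {                  # compressor_config["downscale_factors"] per checkpoint
    "LibriTTS960_50Hz": [1, 1, 1],
    "LibriTTS960_25Hz": [2, 1, 1],
    "LibriTTS960_12_5Hz": [2, 2, 1],
}

base_rate = SAMPLE_RATE / prod(WAVLM_STRIDES)  # 16000 / 320 = 50 Hz
for name, factors in DOWNSCALE_FACTORS.items():
    print(f"{name}: {base_rate / prod(factors):g} Hz")  # 50, 25, 12.5
```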