진승환 committed
Commit 44a8c76 · 1 Parent(s): ee29f36
README.md CHANGED
@@ -1,3 +1,9 @@
+ ---
+ title: Spark-TTS
+ app_file: webui.py
+ sdk: gradio
+ sdk_version: 5.18.0
+ ---
  <div align="center">
  <h1>
  Spark-TTS
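Note: the added front matter registers the repository as a Gradio Space whose entry point is webui.py (Gradio SDK 5.18.0). The entry point itself is not part of this diff, so the sketch below is only a hypothetical illustration of the minimal shape such a file takes; the synthesize function is a placeholder, not the actual Spark-TTS pipeline.

    # Hypothetical sketch of a Gradio Space entry point; the real webui.py is not shown in this commit.
    import gradio as gr

    def synthesize(text: str) -> str:
        # Placeholder: a real app would run the Spark-TTS pipeline here and
        # return the path of the generated audio file.
        return "example/prompt_audio.wav"

    demo = gr.Interface(
        fn=synthesize,
        inputs=gr.Textbox(label="Text to synthesize"),
        outputs=gr.Audio(label="Generated speech"),
        title="Spark-TTS",
    )

    if __name__ == "__main__":
        demo.launch()  # Spaces run the module declared as app_file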
sparkTTS/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
sparkTTS/README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: SparkTTS
+ emoji: 🐨
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.21.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
sparktts/models/audio_tokenizer.py DELETED
@@ -1,163 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import torch
18
- import numpy as np
19
-
20
- from pathlib import Path
21
- from typing import Any, Dict, Tuple
22
- from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
23
-
24
- from sparktts.utils.file import load_config
25
- from sparktts.utils.audio import load_audio
26
- from sparktts.models.bicodec import BiCodec
27
-
28
-
29
- class BiCodecTokenizer:
30
- """BiCodec tokenizer for handling audio input and tokenization."""
31
-
32
- def __init__(self, model_dir: Path, device: torch.device = None, **kwargs):
33
- super().__init__()
34
- """
35
- Args:
36
- model_dir: Path to the model directory.
37
- device: Device to run the model on (default is GPU if available).
38
- """
39
- self.device = device
40
- self.model_dir = model_dir
41
- self.config = load_config(f"{model_dir}/config.yaml")
42
- self._initialize_model()
43
-
44
- def _initialize_model(self):
45
- """Load and initialize the BiCodec model and Wav2Vec2 feature extractor."""
46
- self.model = BiCodec.load_from_checkpoint(f"{self.model_dir}/BiCodec").to(
47
- self.device
48
- )
49
- self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
50
- f"{self.model_dir}/wav2vec2-large-xlsr-53"
51
- )
52
- self.feature_extractor = Wav2Vec2Model.from_pretrained(
53
- f"{self.model_dir}/wav2vec2-large-xlsr-53"
54
- ).to(self.device)
55
- self.feature_extractor.config.output_hidden_states = True
56
-
57
- def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
58
- """Get reference audio clip for speaker embedding."""
59
- ref_segment_length = (
60
- int(self.config["sample_rate"] * self.config["ref_segment_duration"])
61
- // self.config["latent_hop_length"]
62
- * self.config["latent_hop_length"]
63
- )
64
- wav_length = len(wav)
65
-
66
- if ref_segment_length > wav_length:
67
- # Repeat and truncate to handle insufficient length
68
- wav = np.tile(wav, ref_segment_length // wav_length + 1)
69
-
70
- return wav[:ref_segment_length]
71
-
72
- def process_audio(self, wav_path: Path) -> Tuple[np.ndarray, torch.Tensor]:
73
- """load auido and get reference audio from wav path"""
74
- wav = load_audio(
75
- wav_path,
76
- sampling_rate=self.config["sample_rate"],
77
- volume_normalize=self.config["volume_normalize"],
78
- )
79
-
80
- wav_ref = self.get_ref_clip(wav)
81
-
82
- wav_ref = torch.from_numpy(wav_ref).unsqueeze(0).float()
83
- return wav, wav_ref
84
-
85
- def extract_wav2vec2_features(self, wavs: torch.Tensor) -> torch.Tensor:
86
- """extract wav2vec2 features"""
87
- inputs = self.processor(
88
- wavs,
89
- sampling_rate=16000,
90
- return_tensors="pt",
91
- padding=True,
92
- output_hidden_states=True,
93
- ).input_values
94
- feat = self.feature_extractor(inputs.to(self.feature_extractor.device))
95
- feats_mix = (
96
- feat.hidden_states[11] + feat.hidden_states[14] + feat.hidden_states[16]
97
- ) / 3
98
-
99
- return feats_mix
100
-
101
- def tokenize_batch(self, batch: Dict[str, Any]) -> torch.Tensor:
102
- """tokenize the batch of audio
103
-
104
- Args:
105
- batch:
106
- wavs (List[np.ndarray]): batch of audio
107
- ref_wavs (torch.Tensor): reference audio. shape: (batch_size, seq_len)
108
-
109
- Returns:
110
- semantic_tokens: semantic tokens. shape: (batch_size, seq_len, latent_dim)
111
- global_tokens: global tokens. shape: (batch_size, seq_len, global_dim)
112
- """
113
- feats = self.extract_wav2vec2_features(batch["wav"])
114
- batch["feat"] = feats
115
- semantic_tokens, global_tokens = self.model.tokenize(batch)
116
-
117
- return global_tokens, semantic_tokens
118
-
119
- def tokenize(self, audio_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
120
- """tokenize the audio"""
121
- wav, ref_wav = self.process_audio(audio_path)
122
- feat = self.extract_wav2vec2_features(wav)
123
- batch = {
124
- "wav": torch.from_numpy(wav).unsqueeze(0).float().to(self.device),
125
- "ref_wav": ref_wav.to(self.device),
126
- "feat": feat.to(self.device),
127
- }
128
- semantic_tokens, global_tokens = self.model.tokenize(batch)
129
-
130
- return global_tokens, semantic_tokens
131
-
132
- def detokenize(
133
- self, global_tokens: torch.Tensor, semantic_tokens: torch.Tensor
134
- ) -> np.ndarray:
135
- """detokenize the tokens to waveform
136
-
137
- Args:
138
- global_tokens: global tokens. shape: (batch_size, global_dim)
139
- semantic_tokens: semantic tokens. shape: (batch_size, latent_dim)
140
-
141
- Returns:
142
- wav_rec: waveform. shape: (batch_size, seq_len) for batch or (seq_len,) for single
143
- """
144
- global_tokens = global_tokens.unsqueeze(1)
145
- wav_rec = self.model.detokenize(semantic_tokens, global_tokens)
146
- return wav_rec.detach().squeeze().cpu().numpy()
147
-
148
-
149
- # test
150
- if __name__ == "__main__":
151
- import soundfile as sf
152
-
153
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
154
- tokenizer = BiCodecTokenizer(
155
- model_dir="pretrained_models/Spark-TTS-0.5B",
156
- device=device,
157
- )
158
- wav_path = "example/prompt_audio.wav"
159
-
160
- global_tokens, semantic_tokens = tokenizer.tokenize(wav_path)
161
-
162
- wav_rec = tokenizer.detokenize(global_tokens.squeeze(0), semantic_tokens)
163
- sf.write("example/prompt_recon.wav", wav_rec, 16000)
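Note: the deleted get_ref_clip snaps the reference-clip length to a whole number of latent frames,

    L_{\mathrm{ref}} = \left\lfloor \frac{f_s \cdot d_{\mathrm{ref}}}{h} \right\rfloor \cdot h

where f_s is sample_rate, d_ref is ref_segment_duration and h is latent_hop_length from the model's config.yaml; audio shorter than L_ref is tiled with np.tile before truncation, so the speaker-embedding reference always covers an integer number of hops.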
sparktts/models/bicodec.py DELETED
@@ -1,247 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import torch
17
- import torch.nn as nn
18
- from pathlib import Path
19
- from typing import Dict, Any
20
- from omegaconf import DictConfig
21
- from safetensors.torch import load_file
22
-
23
- from sparktts.utils.file import load_config
24
- from sparktts.modules.speaker.speaker_encoder import SpeakerEncoder
25
- from sparktts.modules.encoder_decoder.feat_encoder import Encoder
26
- from sparktts.modules.encoder_decoder.feat_decoder import Decoder
27
- from sparktts.modules.encoder_decoder.wave_generator import WaveGenerator
28
- from sparktts.modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
29
-
30
-
31
- class BiCodec(nn.Module):
32
- """
33
- BiCodec model for speech synthesis, incorporating a speaker encoder, feature encoder/decoder,
34
- quantizer, and wave generator.
35
- """
36
-
37
- def __init__(
38
- self,
39
- mel_params: Dict[str, Any],
40
- encoder: nn.Module,
41
- decoder: nn.Module,
42
- quantizer: nn.Module,
43
- speaker_encoder: nn.Module,
44
- prenet: nn.Module,
45
- postnet: nn.Module,
46
- **kwargs
47
- ) -> None:
48
- """
49
- Initializes the BiCodec model with the required components.
50
-
51
- Args:
52
- mel_params (dict): Parameters for the mel-spectrogram transformer.
53
- encoder (nn.Module): Encoder module.
54
- decoder (nn.Module): Decoder module.
55
- quantizer (nn.Module): Quantizer module.
56
- speaker_encoder (nn.Module): Speaker encoder module.
57
- prenet (nn.Module): Prenet network.
58
- postnet (nn.Module): Postnet network.
59
- """
60
- super().__init__()
61
- self.encoder = encoder
62
- self.decoder = decoder
63
- self.quantizer = quantizer
64
- self.speaker_encoder = speaker_encoder
65
- self.prenet = prenet
66
- self.postnet = postnet
67
- self.init_mel_transformer(mel_params)
68
-
69
- @classmethod
70
- def load_from_checkpoint(cls, model_dir: Path, **kwargs) -> "BiCodec":
71
- """
72
- Loads the model from a checkpoint.
73
-
74
- Args:
75
- model_dir (Path): Path to the model directory containing checkpoint and config.
76
-
77
- Returns:
78
- BiCodec: The initialized BiCodec model.
79
- """
80
- ckpt_path = f'{model_dir}/model.safetensors'
81
- config = load_config(f'{model_dir}/config.yaml')['audio_tokenizer']
82
- mel_params = config["mel_params"]
83
- encoder = Encoder(**config["encoder"])
84
- quantizer = FactorizedVectorQuantize(**config["quantizer"])
85
- prenet = Decoder(**config["prenet"])
86
- postnet = Decoder(**config["postnet"])
87
- decoder = WaveGenerator(**config["decoder"])
88
- speaker_encoder = SpeakerEncoder(**config["speaker_encoder"])
89
-
90
- model = cls(
91
- mel_params=mel_params,
92
- encoder=encoder,
93
- decoder=decoder,
94
- quantizer=quantizer,
95
- speaker_encoder=speaker_encoder,
96
- prenet=prenet,
97
- postnet=postnet,
98
- )
99
-
100
- state_dict = load_file(ckpt_path)
101
- missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
102
-
103
- for key in missing_keys:
104
- print(f"Missing tensor: {key}")
105
- for key in unexpected_keys:
106
- print(f"Unexpected tensor: {key}")
107
-
108
- model.eval()
109
- model.remove_weight_norm()
110
-
111
- return model
112
-
113
- def forward(self, batch: Dict[str, Any]) -> Dict[str, Any]:
114
- """
115
- Performs a forward pass through the model.
116
-
117
- Args:
118
- batch (dict): A dictionary containing features, reference waveform, and target waveform.
119
-
120
- Returns:
121
- dict: A dictionary containing the reconstruction, features, and other metrics.
122
- """
123
- feat = batch["feat"]
124
- mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
125
-
126
- z = self.encoder(feat.transpose(1, 2))
127
- vq_outputs = self.quantizer(z)
128
-
129
- x_vector, d_vector = self.speaker_encoder(mel.transpose(1, 2))
130
-
131
- conditions = d_vector
132
- with_speaker_loss = False
133
-
134
- x = self.prenet(vq_outputs["z_q"], conditions)
135
- pred_feat = self.postnet(x)
136
- x = x + conditions.unsqueeze(-1)
137
- wav_recon = self.decoder(x)
138
-
139
- return {
140
- "vq_loss": vq_outputs["vq_loss"],
141
- "perplexity": vq_outputs["perplexity"],
142
- "cluster_size": vq_outputs["active_num"],
143
- "recons": wav_recon,
144
- "pred_feat": pred_feat,
145
- "x_vector": x_vector,
146
- "d_vector": d_vector,
147
- "audios": batch["wav"].unsqueeze(1),
148
- "with_speaker_loss": with_speaker_loss,
149
- }
150
-
151
- @torch.no_grad()
152
- def tokenize(self, batch: Dict[str, Any]):
153
- """
154
- Tokenizes the input audio into semantic and global tokens.
155
-
156
- Args:
157
- batch (dict): The input audio features and reference waveform.
158
-
159
- Returns:
160
- tuple: Semantic tokens and global tokens.
161
- """
162
- feat = batch["feat"]
163
- mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
164
-
165
- z = self.encoder(feat.transpose(1, 2))
166
- semantic_tokens = self.quantizer.tokenize(z)
167
- global_tokens = self.speaker_encoder.tokenize(mel.transpose(1, 2))
168
-
169
- return semantic_tokens, global_tokens
170
-
171
- @torch.no_grad()
172
- def detokenize(self, semantic_tokens, global_tokens):
173
- """
174
- Detokenizes the semantic and global tokens into a waveform.
175
-
176
- Args:
177
- semantic_tokens (tensor): Semantic tokens.
178
- global_tokens (tensor): Global tokens.
179
-
180
- Returns:
181
- tensor: Reconstructed waveform.
182
- """
183
- z_q = self.quantizer.detokenize(semantic_tokens)
184
- d_vector = self.speaker_encoder.detokenize(global_tokens)
185
- x = self.prenet(z_q, d_vector)
186
- x = x + d_vector.unsqueeze(-1)
187
- wav_recon = self.decoder(x)
188
-
189
- return wav_recon
190
-
191
- def init_mel_transformer(self, config: Dict[str, Any]):
192
- """
193
- Initializes the MelSpectrogram transformer based on the provided configuration.
194
-
195
- Args:
196
- config (dict): Configuration parameters for MelSpectrogram.
197
- """
198
- import torchaudio.transforms as TT
199
-
200
- self.mel_transformer = TT.MelSpectrogram(
201
- config["sample_rate"],
202
- config["n_fft"],
203
- config["win_length"],
204
- config["hop_length"],
205
- config["mel_fmin"],
206
- config["mel_fmax"],
207
- n_mels=config["num_mels"],
208
- power=1,
209
- norm="slaney",
210
- mel_scale="slaney",
211
- )
212
-
213
- def remove_weight_norm(self):
214
- """Removes weight normalization from all layers."""
215
- def _remove_weight_norm(m):
216
- try:
217
- torch.nn.utils.remove_weight_norm(m)
218
- except ValueError:
219
- pass # The module didn't have weight norm
220
-
221
- self.apply(_remove_weight_norm)
222
-
223
-
224
- # Test the model
225
- if __name__ == "__main__":
226
-
227
- config = load_config("pretrained_models/SparkTTS-0.5B/BiCodec/config.yaml")
228
- model = BiCodec.load_from_checkpoint(
229
- model_dir="pretrained_models/SparkTTS-0.5B/BiCodec",
230
- )
231
-
232
- # Generate random inputs for testing
233
- duration = 0.96
234
- x = torch.randn(20, 1, int(duration * 16000))
235
- feat = torch.randn(20, int(duration * 50), 1024)
236
- inputs = {"feat": feat, "wav": x, "ref_wav": x}
237
-
238
- # Forward pass
239
- outputs = model(inputs)
240
- semantic_tokens, global_tokens = model.tokenize(inputs)
241
- wav_recon = model.detokenize(semantic_tokens, global_tokens)
242
-
243
- # Verify if the reconstruction matches
244
- if torch.allclose(outputs["recons"].detach(), wav_recon):
245
- print("Test successful")
246
- else:
247
- print("Test failed")
sparktts/modules/blocks/layers.py DELETED
@@ -1,73 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
17
-
18
-
19
- import torch
20
- import torch.nn as nn
21
- from torch.nn.utils import weight_norm
22
-
23
-
24
- def WNConv1d(*args, **kwargs):
25
- return weight_norm(nn.Conv1d(*args, **kwargs))
26
-
27
-
28
- def WNConvTranspose1d(*args, **kwargs):
29
- return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
30
-
31
-
32
- # Scripting this brings model speed up 1.4x
33
- @torch.jit.script
34
- def snake(x, alpha):
35
- shape = x.shape
36
- x = x.reshape(shape[0], shape[1], -1)
37
- x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
38
- x = x.reshape(shape)
39
- return x
40
-
41
-
42
- class Snake1d(nn.Module):
43
- def __init__(self, channels):
44
- super().__init__()
45
- self.alpha = nn.Parameter(torch.ones(1, channels, 1))
46
-
47
- def forward(self, x):
48
- return snake(x, self.alpha)
49
-
50
-
51
- class ResidualUnit(nn.Module):
52
- def __init__(self, dim: int = 16, dilation: int = 1):
53
- super().__init__()
54
- pad = ((7 - 1) * dilation) // 2
55
- self.block = nn.Sequential(
56
- Snake1d(dim),
57
- WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
58
- Snake1d(dim),
59
- WNConv1d(dim, dim, kernel_size=1),
60
- )
61
-
62
- def forward(self, x):
63
- y = self.block(x)
64
- pad = (x.shape[-1] - y.shape[-1]) // 2
65
- if pad > 0:
66
- x = x[..., pad:-pad]
67
- return x + y
68
-
69
-
70
- def init_weights(m):
71
- if isinstance(m, nn.Conv1d):
72
- nn.init.trunc_normal_(m.weight, std=0.02)
73
- nn.init.constant_(m.bias, 0)
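Note: the scripted snake activation applied by Snake1d evaluates, per channel with a learnable alpha initialized to 1,

    \operatorname{snake}(x) = x + \frac{1}{\alpha + 10^{-9}} \sin^{2}(\alpha x)

where the 1e-9 term only guards against division by zero if alpha collapses to 0; the @torch.jit.script decoration is what the "1.4x speed up" comment refers to.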
sparktts/modules/blocks/samper.py DELETED
@@ -1,115 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import torch
18
- import torch.nn as nn
19
- import torch.nn.functional as F
20
-
21
-
22
- class SamplingBlock(nn.Module):
23
- """Sampling block for upsampling or downsampling"""
24
-
25
- def __init__(
26
- self,
27
- dim: int,
28
- groups: int = 1,
29
- upsample_scale: int = 1,
30
- downsample_scale: int = 1,
31
- ) -> None:
32
- """
33
- Args:
34
- dim: input dimension
35
- groups: number of groups
36
- upsample_scale: upsampling scale
37
- downsample_scale: downsampling scale
38
- """
39
- super(SamplingBlock, self).__init__()
40
-
41
- self.upsample_scale = upsample_scale
42
- self.downsample_scale = downsample_scale
43
-
44
- if self.upsample_scale > 1:
45
- self.de_conv_upsampler = nn.Sequential(
46
- nn.LeakyReLU(0.2),
47
- nn.ConvTranspose1d(
48
- dim,
49
- dim,
50
- kernel_size=upsample_scale * 2,
51
- stride=upsample_scale,
52
- padding=upsample_scale // 2 + upsample_scale % 2,
53
- output_padding=upsample_scale % 2,
54
- groups=groups,
55
- ),
56
- )
57
-
58
- if self.downsample_scale > 1:
59
- self.conv_downsampler = nn.Sequential(
60
- nn.LeakyReLU(0.2),
61
- nn.Conv1d(
62
- dim,
63
- dim,
64
- kernel_size=2 * downsample_scale,
65
- stride=downsample_scale,
66
- padding=downsample_scale // 2 + downsample_scale % 2,
67
- groups=groups,
68
- ),
69
- )
70
-
71
- @staticmethod
72
- def repeat_upsampler(x, upsample_scale):
73
- return x.repeat_interleave(upsample_scale, dim=2)
74
-
75
- @staticmethod
76
- def skip_downsampler(x, downsample_scale):
77
- return F.avg_pool1d(x, kernel_size=downsample_scale, stride=downsample_scale)
78
-
79
- def forward(self, x):
80
- x = x.transpose(1, 2)
81
- if self.upsample_scale > 1:
82
- repeat_res = self.repeat_upsampler(x, self.upsample_scale)
83
- deconv_res = self.de_conv_upsampler(x)
84
- upmerge_res = repeat_res + deconv_res
85
- else:
86
- upmerge_res = x
87
- repeat_res = x
88
-
89
- if self.downsample_scale > 1:
90
- conv_res = self.conv_downsampler(upmerge_res)
91
- skip2_res = self.skip_downsampler(upmerge_res, self.downsample_scale)
92
- skip1_res = self.skip_downsampler(repeat_res, self.downsample_scale)
93
- else:
94
- conv_res = upmerge_res
95
- skip2_res = upmerge_res
96
- skip1_res = repeat_res
97
-
98
- final_res = conv_res + skip1_res + skip2_res
99
-
100
- return final_res
101
-
102
-
103
- # test
104
- if __name__ == "__main__":
105
- test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
106
- model = SamplingBlock(1024, 1024, upsample_scale=2)
107
- model_down = SamplingBlock(1024, 1024, downsample_scale=2)
108
- output = model(test_input)
109
- output_down = model_down(test_input)
110
- print("shape after upsample * 2", output.shape) # torch.Size([8, 1024, 100])
111
- print("shape after downsample * 2", output_down.shape) # torch.Size([8, 1024, 25])
112
- if output.shape == torch.Size([8, 1024, 100]) and output_down.shape == torch.Size(
113
- [8, 1024, 25]
114
- ):
115
- print("test successful")
sparktts/modules/blocks/vocos.py DELETED
@@ -1,373 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
- from typing import Tuple
21
- from torch.nn.utils import weight_norm, remove_weight_norm
22
-
23
- from typing import Optional
24
-
25
-
26
- class ConvNeXtBlock(nn.Module):
27
- """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
28
-
29
- Args:
30
- dim (int): Number of input channels.
31
- intermediate_dim (int): Dimensionality of the intermediate layer.
32
- layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
33
- Defaults to None.
34
- adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
35
- None means non-conditional LayerNorm. Defaults to None.
36
- """
37
-
38
- def __init__(
39
- self,
40
- dim: int,
41
- intermediate_dim: int,
42
- layer_scale_init_value: float,
43
- condition_dim: Optional[int] = None,
44
- ):
45
- super().__init__()
46
- self.dwconv = nn.Conv1d(
47
- dim, dim, kernel_size=7, padding=3, groups=dim
48
- ) # depthwise conv
49
- self.adanorm = condition_dim is not None
50
- if condition_dim:
51
- self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
52
- else:
53
- self.norm = nn.LayerNorm(dim, eps=1e-6)
54
- self.pwconv1 = nn.Linear(
55
- dim, intermediate_dim
56
- ) # pointwise/1x1 convs, implemented with linear layers
57
- self.act = nn.GELU()
58
- self.pwconv2 = nn.Linear(intermediate_dim, dim)
59
- self.gamma = (
60
- nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
61
- if layer_scale_init_value > 0
62
- else None
63
- )
64
-
65
- def forward(
66
- self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
67
- ) -> torch.Tensor:
68
- residual = x
69
- x = self.dwconv(x)
70
- x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
71
- if self.adanorm:
72
- assert cond_embedding_id is not None
73
- x = self.norm(x, cond_embedding_id)
74
- else:
75
- x = self.norm(x)
76
- x = self.pwconv1(x)
77
- x = self.act(x)
78
- x = self.pwconv2(x)
79
- if self.gamma is not None:
80
- x = self.gamma * x
81
- x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
82
-
83
- x = residual + x
84
- return x
85
-
86
-
87
- class AdaLayerNorm(nn.Module):
88
- """
89
- Adaptive Layer Normalization module with learnable embeddings per `num_embeddings` classes
90
-
91
- Args:
92
- condition_dim (int): Dimension of the condition.
93
- embedding_dim (int): Dimension of the embeddings.
94
- """
95
-
96
- def __init__(self, condition_dim: int, embedding_dim: int, eps: float = 1e-6):
97
- super().__init__()
98
- self.eps = eps
99
- self.dim = embedding_dim
100
- self.scale = nn.Linear(condition_dim, embedding_dim)
101
- self.shift = nn.Linear(condition_dim, embedding_dim)
102
- torch.nn.init.ones_(self.scale.weight)
103
- torch.nn.init.zeros_(self.shift.weight)
104
-
105
- def forward(self, x: torch.Tensor, cond_embedding: torch.Tensor) -> torch.Tensor:
106
- scale = self.scale(cond_embedding)
107
- shift = self.shift(cond_embedding)
108
- x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
109
- x = x * scale.unsqueeze(1) + shift.unsqueeze(1)
110
- return x
111
-
112
-
113
- class ResBlock1(nn.Module):
114
- """
115
- ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
116
- but without upsampling layers.
117
-
118
- Args:
119
- dim (int): Number of input channels.
120
- kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
121
- dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
122
- Defaults to (1, 3, 5).
123
- lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
124
- Defaults to 0.1.
125
- layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
126
- Defaults to None.
127
- """
128
-
129
- def __init__(
130
- self,
131
- dim: int,
132
- kernel_size: int = 3,
133
- dilation: Tuple[int, int, int] = (1, 3, 5),
134
- lrelu_slope: float = 0.1,
135
- layer_scale_init_value: Optional[float] = None,
136
- ):
137
- super().__init__()
138
- self.lrelu_slope = lrelu_slope
139
- self.convs1 = nn.ModuleList(
140
- [
141
- weight_norm(
142
- nn.Conv1d(
143
- dim,
144
- dim,
145
- kernel_size,
146
- 1,
147
- dilation=dilation[0],
148
- padding=self.get_padding(kernel_size, dilation[0]),
149
- )
150
- ),
151
- weight_norm(
152
- nn.Conv1d(
153
- dim,
154
- dim,
155
- kernel_size,
156
- 1,
157
- dilation=dilation[1],
158
- padding=self.get_padding(kernel_size, dilation[1]),
159
- )
160
- ),
161
- weight_norm(
162
- nn.Conv1d(
163
- dim,
164
- dim,
165
- kernel_size,
166
- 1,
167
- dilation=dilation[2],
168
- padding=self.get_padding(kernel_size, dilation[2]),
169
- )
170
- ),
171
- ]
172
- )
173
-
174
- self.convs2 = nn.ModuleList(
175
- [
176
- weight_norm(
177
- nn.Conv1d(
178
- dim,
179
- dim,
180
- kernel_size,
181
- 1,
182
- dilation=1,
183
- padding=self.get_padding(kernel_size, 1),
184
- )
185
- ),
186
- weight_norm(
187
- nn.Conv1d(
188
- dim,
189
- dim,
190
- kernel_size,
191
- 1,
192
- dilation=1,
193
- padding=self.get_padding(kernel_size, 1),
194
- )
195
- ),
196
- weight_norm(
197
- nn.Conv1d(
198
- dim,
199
- dim,
200
- kernel_size,
201
- 1,
202
- dilation=1,
203
- padding=self.get_padding(kernel_size, 1),
204
- )
205
- ),
206
- ]
207
- )
208
-
209
- self.gamma = nn.ParameterList(
210
- [
211
- (
212
- nn.Parameter(
213
- layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
214
- )
215
- if layer_scale_init_value is not None
216
- else None
217
- ),
218
- (
219
- nn.Parameter(
220
- layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
221
- )
222
- if layer_scale_init_value is not None
223
- else None
224
- ),
225
- (
226
- nn.Parameter(
227
- layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
228
- )
229
- if layer_scale_init_value is not None
230
- else None
231
- ),
232
- ]
233
- )
234
-
235
- def forward(self, x: torch.Tensor) -> torch.Tensor:
236
- for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma):
237
- xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope)
238
- xt = c1(xt)
239
- xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope)
240
- xt = c2(xt)
241
- if gamma is not None:
242
- xt = gamma * xt
243
- x = xt + x
244
- return x
245
-
246
- def remove_weight_norm(self):
247
- for l in self.convs1:
248
- remove_weight_norm(l)
249
- for l in self.convs2:
250
- remove_weight_norm(l)
251
-
252
- @staticmethod
253
- def get_padding(kernel_size: int, dilation: int = 1) -> int:
254
- return int((kernel_size * dilation - dilation) / 2)
255
-
256
-
257
- class Backbone(nn.Module):
258
- """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
259
-
260
- def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
261
- """
262
- Args:
263
- x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
264
- C denotes output features, and L is the sequence length.
265
-
266
- Returns:
267
- Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
268
- and H denotes the model dimension.
269
- """
270
- raise NotImplementedError("Subclasses must implement the forward method.")
271
-
272
-
273
- class VocosBackbone(Backbone):
274
- """
275
- Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
276
-
277
- Args:
278
- input_channels (int): Number of input features channels.
279
- dim (int): Hidden dimension of the model.
280
- intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
281
- num_layers (int): Number of ConvNeXtBlock layers.
282
- layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
283
- adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
284
- None means non-conditional model. Defaults to None.
285
- """
286
-
287
- def __init__(
288
- self,
289
- input_channels: int,
290
- dim: int,
291
- intermediate_dim: int,
292
- num_layers: int,
293
- layer_scale_init_value: Optional[float] = None,
294
- condition_dim: Optional[int] = None,
295
- ):
296
- super().__init__()
297
- self.input_channels = input_channels
298
- self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
299
- self.adanorm = condition_dim is not None
300
- if condition_dim:
301
- self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
302
- else:
303
- self.norm = nn.LayerNorm(dim, eps=1e-6)
304
- layer_scale_init_value = layer_scale_init_value or 1 / num_layers
305
- self.convnext = nn.ModuleList(
306
- [
307
- ConvNeXtBlock(
308
- dim=dim,
309
- intermediate_dim=intermediate_dim,
310
- layer_scale_init_value=layer_scale_init_value,
311
- condition_dim=condition_dim,
312
- )
313
- for _ in range(num_layers)
314
- ]
315
- )
316
- self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
317
- self.apply(self._init_weights)
318
-
319
- def _init_weights(self, m):
320
- if isinstance(m, (nn.Conv1d, nn.Linear)):
321
- nn.init.trunc_normal_(m.weight, std=0.02)
322
- nn.init.constant_(m.bias, 0)
323
-
324
- def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
325
- x = self.embed(x)
326
- if self.adanorm:
327
- assert condition is not None
328
- x = self.norm(x.transpose(1, 2), condition)
329
- else:
330
- x = self.norm(x.transpose(1, 2))
331
- x = x.transpose(1, 2)
332
- for conv_block in self.convnext:
333
- x = conv_block(x, condition)
334
- x = self.final_layer_norm(x.transpose(1, 2))
335
- return x
336
-
337
-
338
- class VocosResNetBackbone(Backbone):
339
- """
340
- Vocos backbone module built with ResBlocks.
341
-
342
- Args:
343
- input_channels (int): Number of input features channels.
344
- dim (int): Hidden dimension of the model.
345
- num_blocks (int): Number of ResBlock1 blocks.
346
- layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None.
347
- """
348
-
349
- def __init__(
350
- self,
351
- input_channels,
352
- dim,
353
- num_blocks,
354
- layer_scale_init_value=None,
355
- ):
356
- super().__init__()
357
- self.input_channels = input_channels
358
- self.embed = weight_norm(
359
- nn.Conv1d(input_channels, dim, kernel_size=3, padding=1)
360
- )
361
- layer_scale_init_value = layer_scale_init_value or 1 / num_blocks / 3
362
- self.resnet = nn.Sequential(
363
- *[
364
- ResBlock1(dim=dim, layer_scale_init_value=layer_scale_init_value)
365
- for _ in range(num_blocks)
366
- ]
367
- )
368
-
369
- def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
370
- x = self.embed(x)
371
- x = self.resnet(x)
372
- x = x.transpose(1, 2)
373
- return x
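Note: the conditioning used throughout the deleted vocos.py boils down to the AdaLayerNorm transform

    \operatorname{AdaLN}(x, c) = \operatorname{LayerNorm}(x) \odot (W_{\gamma} c) + W_{\beta} c

applied per time step, with the weight matrix of the scale projection W_gamma initialized to ones and that of the shift projection W_beta to zeros (the default Linear biases are left untouched); the LayerNorm here is the affine-free functional form.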
sparktts/modules/encoder_decoder/feat_decoder.py DELETED
@@ -1,115 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
- from typing import List
21
-
22
- from sparktts.modules.blocks.vocos import VocosBackbone
23
- from sparktts.modules.blocks.samper import SamplingBlock
24
-
25
-
26
- class Decoder(nn.Module):
27
- """Decoder module with convnext and upsampling blocks
28
-
29
- Args:
30
- sample_ratios (List[int]): sample ratios
31
- example: [2, 2] means upsample by 2x and then by 2x again (4x total)
32
- """
33
-
34
- def __init__(
35
- self,
36
- input_channels: int,
37
- vocos_dim: int,
38
- vocos_intermediate_dim: int,
39
- vocos_num_layers: int,
40
- out_channels: int,
41
- condition_dim: int = None,
42
- sample_ratios: List[int] = [1, 1],
43
- use_tanh_at_final: bool = False,
44
- ):
45
- super().__init__()
46
-
47
- self.linear_pre = nn.Linear(input_channels, vocos_dim)
48
- modules = [
49
- nn.Sequential(
50
- SamplingBlock(
51
- dim=vocos_dim,
52
- groups=vocos_dim,
53
- upsample_scale=ratio,
54
- ),
55
- VocosBackbone(
56
- input_channels=vocos_dim,
57
- dim=vocos_dim,
58
- intermediate_dim=vocos_intermediate_dim,
59
- num_layers=2,
60
- condition_dim=None,
61
- ),
62
- )
63
- for ratio in sample_ratios
64
- ]
65
-
66
- self.downsample = nn.Sequential(*modules)
67
-
68
- self.vocos_backbone = VocosBackbone(
69
- input_channels=vocos_dim,
70
- dim=vocos_dim,
71
- intermediate_dim=vocos_intermediate_dim,
72
- num_layers=vocos_num_layers,
73
- condition_dim=condition_dim,
74
- )
75
- self.linear = nn.Linear(vocos_dim, out_channels)
76
- self.use_tanh_at_final = use_tanh_at_final
77
-
78
- def forward(self, x: torch.Tensor, c: torch.Tensor = None):
79
- """encoder forward.
80
-
81
- Args:
82
- x (torch.Tensor): (batch_size, input_channels, length)
83
-
84
- Returns:
85
- x (torch.Tensor): (batch_size, encode_channels, length)
86
- """
87
- x = self.linear_pre(x.transpose(1, 2))
88
- x = self.downsample(x).transpose(1, 2)
89
- x = self.vocos_backbone(x, condition=c)
90
- x = self.linear(x).transpose(1, 2)
91
- if self.use_tanh_at_final:
92
- x = torch.tanh(x)
93
-
94
- return x
95
-
96
-
97
- # test
98
- if __name__ == "__main__":
99
- test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
100
- condition = torch.randn(8, 256)
101
- decoder = Decoder(
102
- input_channels=1024,
103
- vocos_dim=384,
104
- vocos_intermediate_dim=2048,
105
- vocos_num_layers=12,
106
- out_channels=256,
107
- condition_dim=256,
108
- sample_ratios=[2, 2],
109
- )
110
- output = decoder(test_input, condition)
111
- print(output.shape) # torch.Size([8, 256, 200])
112
- if output.shape == torch.Size([8, 256, 200]):
113
- print("Decoder test passed")
114
- else:
115
- print("Decoder test failed")
sparktts/modules/encoder_decoder/feat_encoder.py DELETED
@@ -1,105 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
- from typing import List
21
-
22
- from sparktts.modules.blocks.vocos import VocosBackbone
23
- from sparktts.modules.blocks.samper import SamplingBlock
24
-
25
-
26
- class Encoder(nn.Module):
27
- """Encoder module with convnext and downsampling blocks"""
28
-
29
- def __init__(
30
- self,
31
- input_channels: int,
32
- vocos_dim: int,
33
- vocos_intermediate_dim: int,
34
- vocos_num_layers: int,
35
- out_channels: int,
36
- sample_ratios: List[int] = [1, 1],
37
- ):
38
- super().__init__()
39
- """
40
- Encoder module with VocosBackbone and sampling blocks.
41
-
42
- Args:
43
- sample_ratios (List[int]): sample ratios
44
- example: [2, 2] means downsample by 2x and then by 2x again (4x total)
45
- """
46
- self.encoder = VocosBackbone(
47
- input_channels=input_channels,
48
- dim=vocos_dim,
49
- intermediate_dim=vocos_intermediate_dim,
50
- num_layers=vocos_num_layers,
51
- condition_dim=None,
52
- )
53
-
54
- modules = [
55
- nn.Sequential(
56
- SamplingBlock(
57
- dim=vocos_dim,
58
- groups=vocos_dim,
59
- downsample_scale=ratio,
60
- ),
61
- VocosBackbone(
62
- input_channels=vocos_dim,
63
- dim=vocos_dim,
64
- intermediate_dim=vocos_intermediate_dim,
65
- num_layers=2,
66
- condition_dim=None,
67
- ),
68
- )
69
- for ratio in sample_ratios
70
- ]
71
-
72
- self.downsample = nn.Sequential(*modules)
73
-
74
- self.project = nn.Linear(vocos_dim, out_channels)
75
-
76
- def forward(self, x: torch.Tensor, *args):
77
- """
78
- Args:
79
- x (torch.Tensor): (batch_size, input_channels, length)
80
-
81
- Returns:
82
- x (torch.Tensor): (batch_size, encode_channels, length)
83
- """
84
- x = self.encoder(x)
85
- x = self.downsample(x)
86
- x = self.project(x)
87
- return x.transpose(1, 2)
88
-
89
-
90
- # test
91
- if __name__ == "__main__":
92
- test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
93
- encoder = Encoder(
94
- input_channels=1024,
95
- vocos_dim=384,
96
- vocos_intermediate_dim=2048,
97
- vocos_num_layers=12,
98
- out_channels=256,
99
- sample_ratios=[2, 2],
100
- )
101
-
102
- output = encoder(test_input)
103
- print(output.shape) # torch.Size([8, 256, 12])
104
- if output.shape == torch.Size([8, 256, 12]):
105
- print("test successful")
sparktts/modules/encoder_decoder/wave_generator.py DELETED
@@ -1,88 +0,0 @@
1
- # Copyright (c) 2024 Xinsheng Wang ([email protected])
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
16
-
17
-
18
- import torch.nn as nn
19
-
20
- from sparktts.modules.blocks.layers import (
21
- Snake1d,
22
- WNConv1d,
23
- ResidualUnit,
24
- WNConvTranspose1d,
25
- init_weights,
26
- )
27
-
28
-
29
- class DecoderBlock(nn.Module):
30
- def __init__(
31
- self,
32
- input_dim: int = 16,
33
- output_dim: int = 8,
34
- kernel_size: int = 2,
35
- stride: int = 1,
36
- ):
37
- super().__init__()
38
- self.block = nn.Sequential(
39
- Snake1d(input_dim),
40
- WNConvTranspose1d(
41
- input_dim,
42
- output_dim,
43
- kernel_size=kernel_size,
44
- stride=stride,
45
- padding=(kernel_size - stride) // 2,
46
- ),
47
- ResidualUnit(output_dim, dilation=1),
48
- ResidualUnit(output_dim, dilation=3),
49
- ResidualUnit(output_dim, dilation=9),
50
- )
51
-
52
- def forward(self, x):
53
- return self.block(x)
54
-
55
-
56
- class WaveGenerator(nn.Module):
57
- def __init__(
58
- self,
59
- input_channel,
60
- channels,
61
- rates,
62
- kernel_sizes,
63
- d_out: int = 1,
64
- ):
65
- super().__init__()
66
-
67
- # Add first conv layer
68
- layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
69
-
70
- # Add upsampling + MRF blocks
71
- for i, (kernel_size, stride) in enumerate(zip(kernel_sizes, rates)):
72
- input_dim = channels // 2**i
73
- output_dim = channels // 2 ** (i + 1)
74
- layers += [DecoderBlock(input_dim, output_dim, kernel_size, stride)]
75
-
76
- # Add final conv layer
77
- layers += [
78
- Snake1d(output_dim),
79
- WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
80
- nn.Tanh(),
81
- ]
82
-
83
- self.model = nn.Sequential(*layers)
84
-
85
- self.apply(init_weights)
86
-
87
- def forward(self, x):
88
- return self.model(x)
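Note: the deleted WaveGenerator halves the channel width at every upsampling stage,

    C^{\mathrm{in}}_{i} = C / 2^{i}, \qquad C^{\mathrm{out}}_{i} = C / 2^{\,i+1}, \qquad i = 0, \dots, N-1

with C = channels and N = len(rates), and the overall upsampling factor of the generated waveform is the product of the entries of rates; the final Snake1d + WNConv1d + Tanh stage maps the last width down to d_out channels. The concrete values of channels, rates and kernel_sizes live in the BiCodec config.yaml, which is not shown in this diff.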
sparktts/modules/fsq/finite_scalar_quantization.py DELETED
@@ -1,251 +0,0 @@
1
- """
2
- Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
3
- Code adapted from Jax version in Appendix A.1
4
- """
5
-
6
- from __future__ import annotations
7
- from functools import wraps, partial
8
- from contextlib import nullcontext
9
- from typing import List, Tuple
10
-
11
- import torch
12
- import torch.nn as nn
13
- from torch.nn import Module
14
- from torch import Tensor, int32
15
- from torch.amp import autocast
16
-
17
- from einops import rearrange, pack, unpack
18
-
19
- # helper functions
20
-
21
-
22
- def exists(v):
23
- return v is not None
24
-
25
-
26
- def default(*args):
27
- for arg in args:
28
- if exists(arg):
29
- return arg
30
- return None
31
-
32
-
33
- def maybe(fn):
34
- @wraps(fn)
35
- def inner(x, *args, **kwargs):
36
- if not exists(x):
37
- return x
38
- return fn(x, *args, **kwargs)
39
-
40
- return inner
41
-
42
-
43
- def pack_one(t, pattern):
44
- return pack([t], pattern)
45
-
46
-
47
- def unpack_one(t, ps, pattern):
48
- return unpack(t, ps, pattern)[0]
49
-
50
-
51
- # tensor helpers
52
-
53
-
54
- def round_ste(z: Tensor) -> Tensor:
55
- """Round with straight through gradients."""
56
- zhat = z.round()
57
- return z + (zhat - z).detach()
58
-
59
-
60
- # main class
61
-
62
-
63
- class FSQ(Module):
64
- def __init__(
65
- self,
66
- levels: List[int],
67
- dim: int | None = None,
68
- num_codebooks=1,
69
- keep_num_codebooks_dim: bool | None = None,
70
- scale: float | None = None,
71
- allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
72
- channel_first: bool = False,
73
- projection_has_bias: bool = True,
74
- return_indices=True,
75
- force_quantization_f32=True,
76
- ):
77
- super().__init__()
78
- _levels = torch.tensor(levels, dtype=int32)
79
- self.register_buffer("_levels", _levels, persistent=False)
80
-
81
- _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
82
- self.register_buffer("_basis", _basis, persistent=False)
83
-
84
- self.scale = scale
85
-
86
- codebook_dim = len(levels)
87
- self.codebook_dim = codebook_dim
88
-
89
- effective_codebook_dim = codebook_dim * num_codebooks
90
- self.num_codebooks = num_codebooks
91
- self.effective_codebook_dim = effective_codebook_dim
92
-
93
- keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
94
- assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
95
- self.keep_num_codebooks_dim = keep_num_codebooks_dim
96
-
97
- self.dim = default(dim, len(_levels) * num_codebooks)
98
-
99
- self.channel_first = channel_first
100
-
101
- has_projections = self.dim != effective_codebook_dim
102
- self.project_in = (
103
- nn.Linear(self.dim, effective_codebook_dim, bias=projection_has_bias)
104
- if has_projections
105
- else nn.Identity()
106
- )
107
- self.project_out = (
108
- nn.Linear(effective_codebook_dim, self.dim, bias=projection_has_bias)
109
- if has_projections
110
- else nn.Identity()
111
- )
112
-
113
- self.has_projections = has_projections
114
-
115
- self.return_indices = return_indices
116
- if return_indices:
117
- self.codebook_size = self._levels.prod().item()
118
- implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
119
- self.register_buffer(
120
- "implicit_codebook", implicit_codebook, persistent=False
121
- )
122
-
123
- self.allowed_dtypes = allowed_dtypes
124
- self.force_quantization_f32 = force_quantization_f32
125
-
126
- def bound(self, z, eps: float = 1e-3):
127
- """Bound `z`, an array of shape (..., d)."""
128
- half_l = (self._levels - 1) * (1 + eps) / 2
129
- offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
130
- shift = (offset / half_l).atanh()
131
- return (z + shift).tanh() * half_l - offset
132
-
133
- def quantize(self, z):
134
- """Quantizes z, returns quantized zhat, same shape as z."""
135
- quantized = round_ste(self.bound(z))
136
- half_width = self._levels // 2 # Renormalize to [-1, 1].
137
- return quantized / half_width
138
-
139
- def _scale_and_shift(self, zhat_normalized):
140
- half_width = self._levels // 2
141
- return (zhat_normalized * half_width) + half_width
142
-
143
- def _scale_and_shift_inverse(self, zhat):
144
- half_width = self._levels // 2
145
- return (zhat - half_width) / half_width
146
-
147
- def _indices_to_codes(self, indices):
148
- level_indices = self.indices_to_level_indices(indices)
149
- codes = self._scale_and_shift_inverse(level_indices)
150
- return codes
151
-
152
- def codes_to_indices(self, zhat):
153
- """Converts a `code` to an index in the codebook."""
154
- assert zhat.shape[-1] == self.codebook_dim
155
- zhat = self._scale_and_shift(zhat)
156
- return (zhat * self._basis).sum(dim=-1).to(int32)
157
-
158
- def indices_to_level_indices(self, indices):
159
- """Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings"""
160
- indices = rearrange(indices, "... -> ... 1")
161
- codes_non_centered = (indices // self._basis) % self._levels
162
- return codes_non_centered
163
-
164
- def indices_to_codes(self, indices):
165
- """Inverse of `codes_to_indices`."""
166
- assert exists(indices)
167
-
168
- is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
169
-
170
- codes = self._indices_to_codes(indices)
171
-
172
- if self.keep_num_codebooks_dim:
173
- codes = rearrange(codes, "... c d -> ... (c d)")
174
-
175
- codes = self.project_out(codes)
176
-
177
- if is_img_or_video or self.channel_first:
178
- codes = rearrange(codes, "b ... d -> b d ...")
179
-
180
- return codes
181
-
182
- def forward(self, z):
183
- """
184
- einstein notation
185
- b - batch
186
- n - sequence (or flattened spatial dimensions)
187
- d - feature dimension
188
- c - number of codebook dim
189
- """
190
-
191
- is_img_or_video = z.ndim >= 4
192
- need_move_channel_last = is_img_or_video or self.channel_first
193
-
194
- # standardize image or video into (batch, seq, dimension)
195
-
196
- if need_move_channel_last:
197
- z = rearrange(z, "b d ... -> b ... d")
198
- z, ps = pack_one(z, "b * d")
199
-
200
- assert (
201
- z.shape[-1] == self.dim
202
- ), f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
203
-
204
- z = self.project_in(z)
205
-
206
- z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
207
-
208
- # whether to force quantization step to be full precision or not
209
-
210
- force_f32 = self.force_quantization_f32
211
- quantization_context = (
212
- partial(autocast, "cuda", enabled=False) if force_f32 else nullcontext
213
- )
214
-
215
- with quantization_context():
216
- orig_dtype = z.dtype
217
-
218
- if force_f32 and orig_dtype not in self.allowed_dtypes:
219
- z = z.float()
220
-
221
- codes = self.quantize(z)
222
-
223
- # returning indices could be optional
224
-
225
- indices = None
226
-
227
- if self.return_indices:
228
- indices = self.codes_to_indices(codes)
229
-
230
- codes = rearrange(codes, "b n c d -> b n (c d)")
231
-
232
- codes = codes.type(orig_dtype)
233
-
234
- # project out
235
-
236
- out = self.project_out(codes)
237
-
238
- # reconstitute image or video dimensions
239
-
240
- if need_move_channel_last:
241
- out = unpack_one(out, ps, "b * d")
242
- out = rearrange(out, "b ... d -> b d ...")
243
-
244
- indices = maybe(unpack_one)(indices, ps, "b * c")
245
-
246
- if not self.keep_num_codebooks_dim and self.return_indices:
247
- indices = maybe(rearrange)(indices, "... 1 -> ...")
248
-
249
- # return quantized output and indices
250
-
251
- return out, indices
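Note: the deleted FSQ rounds with a straight-through estimator and flattens each quantized code vector into a single integer index via a mixed-radix basis,

    \operatorname{round\_ste}(z) = z + \operatorname{sg}\!\left(\operatorname{round}(z) - z\right), \qquad \mathrm{index} = \sum_{j} \hat z_{j}\, b_{j}, \qquad b_{j} = \prod_{k < j} \ell_{k}

where sg is stop-gradient (.detach()), \ell_k are the entries of levels, and \hat z_j is the j-th code component shifted into \{0, \dots, \ell_j - 1\} by _scale_and_shift; the implicit codebook therefore has \prod_k \ell_k entries.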
sparktts/modules/fsq/residual_fsq.py DELETED
@@ -1,355 +0,0 @@
1
- import random
- from math import ceil
2
- import torch
3
- import torch.nn.functional as F
4
- import torch.distributed as dist
5
-
6
- from typing import List
7
- from torch import nn
8
- from torch.nn import Module
9
- from torch.amp import autocast
10
- from einx import get_at
11
- from einops import rearrange, reduce, pack, unpack
12
-
13
- from sparktts.modules.fsq.finite_scalar_quantization import FSQ
14
-
15
-
16
- def exists(val):
17
- return val is not None
18
-
19
-
20
- def first(l):
21
- return l[0]
22
-
23
-
24
- def default(val, d):
25
- return val if exists(val) else d
26
-
27
-
28
- def round_up_multiple(num, mult):
29
- return ceil(num / mult) * mult
30
-
31
-
32
- # distributed helpers
33
-
34
-
35
- def is_distributed():
36
- return dist.is_initialized() and dist.get_world_size() > 1
37
-
38
-
39
- def get_maybe_sync_seed(device, max_size=10_000):
40
- rand_int = torch.randint(0, max_size, (), device=device)
41
-
42
- if is_distributed():
43
- dist.all_reduce(rand_int)
44
-
45
- return rand_int.item()
46
-
47
-
48
- class ResidualFSQ(Module):
49
- """Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf"""
50
-
51
- def __init__(
52
- self,
53
- *,
54
- levels: List[int],
55
- num_quantizers,
56
- dim=None,
57
- is_channel_first=False,
58
- quantize_dropout=False,
59
- quantize_dropout_cutoff_index=0,
60
- quantize_dropout_multiple_of=1,
61
- **kwargs,
62
- ):
63
- super().__init__()
64
- codebook_dim = len(levels)
65
- dim = default(dim, codebook_dim)
66
-
67
- requires_projection = codebook_dim != dim
68
- self.project_in = (
69
- nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
70
- )
71
- self.project_out = (
72
- nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
73
- )
74
- self.has_projections = requires_projection
75
-
76
- self.is_channel_first = is_channel_first
77
- self.num_quantizers = num_quantizers
78
-
79
- self.levels = levels
80
- self.layers = nn.ModuleList([])
81
-
82
- levels_tensor = torch.Tensor(levels)
83
-
84
- scales = []
85
-
86
- for ind in range(num_quantizers):
87
- scales.append((levels_tensor - 1) ** -ind)
88
-
89
- fsq = FSQ(levels=levels, dim=codebook_dim, **kwargs)
90
-
91
- self.layers.append(fsq)
92
-
93
- assert all([not fsq.has_projections for fsq in self.layers])
94
-
95
- self.codebook_size = self.layers[0].codebook_size
96
-
97
- self.register_buffer("scales", torch.stack(scales), persistent=False)
98
-
99
- self.quantize_dropout = quantize_dropout and num_quantizers > 1
100
-
101
- assert quantize_dropout_cutoff_index >= 0
102
-
103
- self.quantize_dropout_cutoff_index = quantize_dropout_cutoff_index
104
- self.quantize_dropout_multiple_of = quantize_dropout_multiple_of # encodec paper proposes structured dropout, believe this was set to 4
105
-
106
- @property
107
- def codebooks(self):
108
- codebooks = [layer.implicit_codebook for layer in self.layers]
109
- codebooks = torch.stack(codebooks, dim=0)
110
- return codebooks
111
-
112
- def get_codes_from_indices(self, indices):
113
-
114
- batch, quantize_dim = indices.shape[0], indices.shape[-1]
115
-
116
- # may also receive indices in the shape of 'b h w q' (accept_image_fmap)
117
-
118
- indices, ps = pack([indices], "b * q")
119
-
120
- # because of quantize dropout, one can pass in indices that are coarse
121
- # and the network should be able to reconstruct
122
-
123
- if quantize_dim < self.num_quantizers:
124
- assert (
125
- self.quantize_dropout > 0.0
126
- ), "quantize dropout must be greater than 0 if you wish to reconstruct from a signal with less fine quantizations"
127
- indices = F.pad(indices, (0, self.num_quantizers - quantize_dim), value=-1)
128
-
129
- # take care of quantizer dropout
130
-
131
- mask = indices == -1
132
- indices = indices.masked_fill(
133
- mask, 0
134
- ) # have it fetch a dummy code to be masked out later
135
-
136
- all_codes = get_at("q [c] d, b n q -> q b n d", self.codebooks, indices)
137
-
138
- # mask out any codes that were dropout-ed
139
-
140
- all_codes = all_codes.masked_fill(rearrange(mask, "b n q -> q b n 1"), 0.0)
141
-
142
- # scale the codes
143
-
144
- scales = rearrange(self.scales, "q d -> q 1 1 d")
145
- all_codes = all_codes * scales
146
-
147
- # if (accept_image_fmap = True) then return shape (quantize, batch, height, width, dimension)
148
-
149
- (all_codes,) = unpack(all_codes, ps, "q b * d")
150
-
151
- return all_codes
152
-
153
- def get_output_from_indices(self, indices):
154
- codes = self.get_codes_from_indices(indices)
155
- codes_summed = reduce(codes, "q ... -> ...", "sum")
156
- return self.project_out(codes_summed)
157
-
158
- def forward(self, x, return_all_codes=False, rand_quantize_dropout_fixed_seed=None):
159
- num_quant, quant_dropout_multiple_of, device = (
160
- self.num_quantizers,
161
- self.quantize_dropout_multiple_of,
162
- x.device,
163
- )
164
-
165
- # handle channel first
166
-
167
- if self.is_channel_first:
168
- x = rearrange(x, "b d ... -> b ... d")
169
- x, ps = pack([x], "b * d")
170
-
171
- # maybe project in
172
-
173
- x = self.project_in(x)
174
-
175
- quantized_out = 0.0
176
- residual = x
177
-
178
- all_indices = []
179
-
180
- should_quantize_dropout = self.training and self.quantize_dropout
181
-
182
- # sample a layer index at which to dropout further residual quantization
183
- # also prepare null indices
184
-
185
- if should_quantize_dropout:
186
-
187
- # check if seed is manually passed in
188
-
189
- if not exists(rand_quantize_dropout_fixed_seed):
190
- rand_quantize_dropout_fixed_seed = get_maybe_sync_seed(device)
191
-
192
- rand = random.Random(rand_quantize_dropout_fixed_seed)
193
-
194
- rand_quantize_dropout_index = rand.randrange(
195
- self.quantize_dropout_cutoff_index, num_quant
196
- )
197
-
198
- if quant_dropout_multiple_of != 1:
199
- rand_quantize_dropout_index = (
200
- round_up_multiple(
201
- rand_quantize_dropout_index + 1, quant_dropout_multiple_of
202
- )
203
- - 1
204
- )
205
-
206
- null_indices = torch.full(
207
- x.shape[:2], -1.0, device=device, dtype=torch.long
208
- )
209
-
210
- # go through the layers
211
-
212
- with autocast("cuda", enabled=False):
213
- for quantizer_index, (layer, scale) in enumerate(
214
- zip(self.layers, self.scales)
215
- ):
216
-
217
- if (
218
- should_quantize_dropout
219
- and quantizer_index > rand_quantize_dropout_index
220
- ):
221
- all_indices.append(null_indices)
222
- continue
223
-
224
- quantized, indices = layer(residual / scale)
225
-
226
- quantized = quantized * scale
227
-
228
- residual = residual - quantized.detach()
229
- quantized_out = quantized_out + quantized
230
-
231
- all_indices.append(indices)
232
-
233
- # project out, if needed
234
-
235
- quantized_out = self.project_out(quantized_out)
236
-
237
- # stack all indices
238
-
239
- all_indices = torch.stack(all_indices, dim=-1)
240
-
241
- # channel first out
242
-
243
- if self.is_channel_first:
244
- (quantized_out,) = unpack(quantized_out, ps, "b * d")
245
- (all_indices,) = unpack(all_indices, ps, "b * d")
246
-
247
- quantized_out = rearrange(quantized_out, "b ... d -> b d ...")
248
- all_indices = rearrange(all_indices, "b ... d -> b d ...")
249
-
250
- # return
251
-
252
- ret = (quantized_out, all_indices)
253
-
254
- if not return_all_codes:
255
- return ret
256
-
257
- # whether to return all codes from all codebooks across layers
258
-
259
- all_codes = self.get_codes_from_indices(all_indices)
260
-
261
- # will return all codes in shape (quantizer, batch, sequence length, codebook dimension)
262
-
263
- return (*ret, all_codes)
264
-
265
-
266
- # grouped residual fsq
267
-
268
-
269
- class GroupedResidualFSQ(Module):
270
- def __init__(self, *, dim, groups=1, accept_image_fmap=False, **kwargs):
271
- super().__init__()
272
- self.dim = dim
273
- self.groups = groups
274
- assert (dim % groups) == 0
275
- dim_per_group = dim // groups
276
-
277
- self.accept_image_fmap = accept_image_fmap
278
-
279
- self.rvqs = nn.ModuleList([])
280
-
281
- for _ in range(groups):
282
- self.rvqs.append(ResidualFSQ(dim=dim_per_group, **kwargs))
283
-
284
- self.codebook_size = self.rvqs[0].codebook_size
285
-
286
- @property
287
- def codebooks(self):
288
- return torch.stack(tuple(rvq.codebooks for rvq in self.rvqs))
289
-
290
- @property
291
- def split_dim(self):
292
- return 1 if self.accept_image_fmap else -1
293
-
294
- def get_codes_from_indices(self, indices):
295
- codes = tuple(
296
- rvq.get_codes_from_indices(chunk_indices)
297
- for rvq, chunk_indices in zip(self.rvqs, indices)
298
- )
299
- return torch.stack(codes)
300
-
301
- def get_output_from_indices(self, indices):
302
- outputs = tuple(
303
- rvq.get_output_from_indices(chunk_indices)
304
- for rvq, chunk_indices in zip(self.rvqs, indices)
305
- )
306
- return torch.cat(outputs, dim=self.split_dim)
307
-
308
- def forward(self, x, return_all_codes=False):
309
- shape, split_dim, device = x.shape, self.split_dim, x.device
310
- assert shape[split_dim] == self.dim
311
-
312
- # split the feature dimension into groups
313
-
314
- x = x.chunk(self.groups, dim=split_dim)
315
-
316
- forward_kwargs = dict(
317
- return_all_codes=return_all_codes,
318
- rand_quantize_dropout_fixed_seed=(
319
- get_maybe_sync_seed(device) if self.training else None
320
- ),
321
- )
322
-
323
- # invoke residual vq on each group
324
-
325
- out = tuple(rvq(chunk, **forward_kwargs) for rvq, chunk in zip(self.rvqs, x))
326
- out = tuple(zip(*out))
327
-
328
- # otherwise, get all the zipped outputs and combine them
329
-
330
- quantized, all_indices, *maybe_all_codes = out
331
-
332
- quantized = torch.cat(quantized, dim=split_dim)
333
- all_indices = torch.stack(all_indices)
334
-
335
- ret = (quantized, all_indices, *maybe_all_codes)
336
- return ret
337
-
338
-
339
- if __name__ == "__main__":
340
- model = ResidualFSQ(
341
- levels=[4, 4, 4, 4, 4, 4],
342
- num_quantizers=1,
343
- dim=30,
344
- is_channel_first=True,
345
- quantize_dropout=False,
346
- )
347
- x = torch.randn(2, 30, 10)
348
- quantize, embed_ind = model(x)
349
-
350
- emb_from_ind = model.get_output_from_indices(embed_ind.transpose(1, 2))
351
-
352
- print(quantize == emb_from_ind.transpose(1, 2))
353
-
354
- print("quantize shape", quantize.shape)
355
- print("embed_ind", embed_ind)
sparktts/modules/speaker/ecapa_tdnn.py DELETED
@@ -1,267 +0,0 @@
1
- # Copyright (c) 2021 Zhengyang Chen ([email protected])
2
- # 2022 Hongji Wang ([email protected])
3
- # 2023 Bing Han ([email protected])
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- """ This implementation is adapted from github repo:
18
- https://github.com/lawlict/ECAPA-TDNN.
19
- """
20
-
21
- import torch
22
- import torch.nn as nn
23
- import torch.nn.functional as F
24
-
25
- import sparktts.modules.speaker.pooling_layers as pooling_layers
26
-
27
-
28
- class Res2Conv1dReluBn(nn.Module):
29
- """
30
- in_channels == out_channels == channels
31
- """
32
-
33
- def __init__(
34
- self,
35
- channels,
36
- kernel_size=1,
37
- stride=1,
38
- padding=0,
39
- dilation=1,
40
- bias=True,
41
- scale=4,
42
- ):
43
- super().__init__()
44
- assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
45
- self.scale = scale
46
- self.width = channels // scale
47
- self.nums = scale if scale == 1 else scale - 1
48
-
49
- self.convs = []
50
- self.bns = []
51
- for i in range(self.nums):
52
- self.convs.append(
53
- nn.Conv1d(
54
- self.width,
55
- self.width,
56
- kernel_size,
57
- stride,
58
- padding,
59
- dilation,
60
- bias=bias,
61
- )
62
- )
63
- self.bns.append(nn.BatchNorm1d(self.width))
64
- self.convs = nn.ModuleList(self.convs)
65
- self.bns = nn.ModuleList(self.bns)
66
-
67
- def forward(self, x):
68
- out = []
69
- spx = torch.split(x, self.width, 1)
70
- sp = spx[0]
71
- for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
72
- # Order: conv -> relu -> bn
73
- if i >= 1:
74
- sp = sp + spx[i]
75
- sp = conv(sp)
76
- sp = bn(F.relu(sp))
77
- out.append(sp)
78
- if self.scale != 1:
79
- out.append(spx[self.nums])
80
- out = torch.cat(out, dim=1)
81
-
82
- return out
83
-
84
-
85
- """ Conv1d + BatchNorm1d + ReLU
86
- """
87
-
88
-
89
- class Conv1dReluBn(nn.Module):
90
-
91
- def __init__(
92
- self,
93
- in_channels,
94
- out_channels,
95
- kernel_size=1,
96
- stride=1,
97
- padding=0,
98
- dilation=1,
99
- bias=True,
100
- ):
101
- super().__init__()
102
- self.conv = nn.Conv1d(
103
- in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
104
- )
105
- self.bn = nn.BatchNorm1d(out_channels)
106
-
107
- def forward(self, x):
108
- return self.bn(F.relu(self.conv(x)))
109
-
110
-
111
- """ The SE connection of 1D case.
112
- """
113
-
114
-
115
- class SE_Connect(nn.Module):
116
-
117
- def __init__(self, channels, se_bottleneck_dim=128):
118
- super().__init__()
119
- self.linear1 = nn.Linear(channels, se_bottleneck_dim)
120
- self.linear2 = nn.Linear(se_bottleneck_dim, channels)
121
-
122
- def forward(self, x):
123
- out = x.mean(dim=2)
124
- out = F.relu(self.linear1(out))
125
- out = torch.sigmoid(self.linear2(out))
126
- out = x * out.unsqueeze(2)
127
-
128
- return out
129
-
130
-
131
- """ SE-Res2Block of the ECAPA-TDNN architecture.
132
- """
133
-
134
-
135
- class SE_Res2Block(nn.Module):
136
-
137
- def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
138
- super().__init__()
139
- self.se_res2block = nn.Sequential(
140
- Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
141
- Res2Conv1dReluBn(
142
- channels, kernel_size, stride, padding, dilation, scale=scale
143
- ),
144
- Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
145
- SE_Connect(channels),
146
- )
147
-
148
- def forward(self, x):
149
- return x + self.se_res2block(x)
150
-
151
-
152
- class ECAPA_TDNN(nn.Module):
153
-
154
- def __init__(
155
- self,
156
- channels=512,
157
- feat_dim=80,
158
- embed_dim=192,
159
- pooling_func="ASTP",
160
- global_context_att=False,
161
- emb_bn=False,
162
- ):
163
- super().__init__()
164
-
165
- self.layer1 = Conv1dReluBn(feat_dim, channels, kernel_size=5, padding=2)
166
- self.layer2 = SE_Res2Block(
167
- channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8
168
- )
169
- self.layer3 = SE_Res2Block(
170
- channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8
171
- )
172
- self.layer4 = SE_Res2Block(
173
- channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8
174
- )
175
-
176
- cat_channels = channels * 3
177
- out_channels = 512 * 3
178
- self.conv = nn.Conv1d(cat_channels, out_channels, kernel_size=1)
179
- self.pool = getattr(pooling_layers, pooling_func)(
180
- in_dim=out_channels, global_context_att=global_context_att
181
- )
182
- self.pool_out_dim = self.pool.get_out_dim()
183
- self.bn = nn.BatchNorm1d(self.pool_out_dim)
184
- self.linear = nn.Linear(self.pool_out_dim, embed_dim)
185
- self.emb_bn = emb_bn
186
- if emb_bn: # better in SSL for SV
187
- self.bn2 = nn.BatchNorm1d(embed_dim)
188
- else:
189
- self.bn2 = nn.Identity()
190
-
191
- def forward(self, x, return_latent=False):
192
- x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T)
193
-
194
- out1 = self.layer1(x)
195
- out2 = self.layer2(out1)
196
- out3 = self.layer3(out2)
197
- out4 = self.layer4(out3)
198
-
199
- out = torch.cat([out2, out3, out4], dim=1)
200
- latent = F.relu(self.conv(out))
201
- out = self.bn(self.pool(latent))
202
- out = self.linear(out)
203
- if self.emb_bn:
204
- out = self.bn2(out)
205
-
206
- if return_latent:
207
- return out, latent
208
- return out
209
-
210
-
211
- def ECAPA_TDNN_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
212
- return ECAPA_TDNN(
213
- channels=1024,
214
- feat_dim=feat_dim,
215
- embed_dim=embed_dim,
216
- pooling_func=pooling_func,
217
- emb_bn=emb_bn,
218
- )
219
-
220
-
221
- def ECAPA_TDNN_GLOB_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
222
- return ECAPA_TDNN(
223
- channels=1024,
224
- feat_dim=feat_dim,
225
- embed_dim=embed_dim,
226
- pooling_func=pooling_func,
227
- global_context_att=True,
228
- emb_bn=emb_bn,
229
- )
230
-
231
-
232
- def ECAPA_TDNN_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
233
- return ECAPA_TDNN(
234
- channels=512,
235
- feat_dim=feat_dim,
236
- embed_dim=embed_dim,
237
- pooling_func=pooling_func,
238
- emb_bn=emb_bn,
239
- )
240
-
241
-
242
- def ECAPA_TDNN_GLOB_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
243
- return ECAPA_TDNN(
244
- channels=512,
245
- feat_dim=feat_dim,
246
- embed_dim=embed_dim,
247
- pooling_func=pooling_func,
248
- global_context_att=True,
249
- emb_bn=emb_bn,
250
- )
251
-
252
-
253
- if __name__ == "__main__":
254
- x = torch.zeros(1, 200, 100)
255
- model = ECAPA_TDNN_GLOB_c512(feat_dim=100, embed_dim=256, pooling_func="ASTP")
256
- model.eval()
257
- out, latent = model(x, True)
258
- print(out.shape)
259
- print(latent.shape)
260
-
261
- num_params = sum(param.numel() for param in model.parameters())
262
- print("{} M".format(num_params / 1e6))
263
-
264
- # from thop import profile
265
- # x_np = torch.randn(1, 200, 80)
266
- # flops, params = profile(model, inputs=(x_np, ))
267
- # print("FLOPs: {} G, Params: {} M".format(flops / 1e9, params / 1e6))
sparktts/modules/speaker/perceiver_encoder.py DELETED
@@ -1,360 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
17
-
18
- from collections import namedtuple
19
- from functools import wraps
20
-
21
- import torch
22
- import torch.nn.functional as F
23
- from einops import rearrange, repeat
24
- from einops.layers.torch import Rearrange
25
- from packaging import version
26
- from torch import einsum, nn
27
-
28
-
29
- def exists(val):
30
- return val is not None
31
-
32
-
33
- def once(fn):
34
- called = False
35
-
36
- @wraps(fn)
37
- def inner(x):
38
- nonlocal called
39
- if called:
40
- return
41
- called = True
42
- return fn(x)
43
-
44
- return inner
45
-
46
-
47
- print_once = once(print)
48
-
49
- # main class
50
-
51
-
52
- class Attend(nn.Module):
53
- def __init__(self, dropout=0.0, causal=False, use_flash=False):
54
- super().__init__()
55
- self.dropout = dropout
56
- self.attn_dropout = nn.Dropout(dropout)
57
-
58
- self.causal = causal
59
- self.register_buffer("mask", None, persistent=False)
60
-
61
- self.use_flash = use_flash
62
- assert not (
63
- use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
64
- ), "in order to use flash attention, you must be using pytorch 2.0 or above"
65
-
66
- # determine efficient attention configs for cuda and cpu
67
- self.config = namedtuple(
68
- "EfficientAttentionConfig",
69
- ["enable_flash", "enable_math", "enable_mem_efficient"],
70
- )
71
- self.cpu_config = self.config(True, True, True)
72
- self.cuda_config = None
73
-
74
- if not torch.cuda.is_available() or not use_flash:
75
- return
76
-
77
- device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
78
-
79
- if device_properties.major == 8 and device_properties.minor == 0:
80
- print_once(
81
- "A100 GPU detected, using flash attention if input tensor is on cuda"
82
- )
83
- self.cuda_config = self.config(True, False, False)
84
- else:
85
- print_once(
86
- "Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda"
87
- )
88
- self.cuda_config = self.config(False, True, True)
89
-
90
- def get_mask(self, n, device):
91
- if exists(self.mask) and self.mask.shape[-1] >= n:
92
- return self.mask[:n, :n]
93
-
94
- mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1)
95
- self.register_buffer("mask", mask, persistent=False)
96
- return mask
97
-
98
- def flash_attn(self, q, k, v, mask=None):
99
- _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
100
-
101
- # Recommended for multi-query single-key-value attention by Tri Dao
102
- # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
103
-
104
- if k.ndim == 3:
105
- k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
106
-
107
- if v.ndim == 3:
108
- v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
109
-
110
- # Check if mask exists and expand to compatible shape
111
- # The mask is B L, so it would have to be expanded to B H N L
112
-
113
- if exists(mask):
114
- mask = rearrange(mask, "b j -> b 1 1 j")
115
- mask = mask.expand(-1, heads, q_len, -1)
116
-
117
- # Check if there is a compatible device for flash attention
118
-
119
- config = self.cuda_config if is_cuda else self.cpu_config
120
-
121
- # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
122
-
123
- with torch.backends.cuda.sdp_kernel(**config._asdict()):
124
- out = F.scaled_dot_product_attention(
125
- q,
126
- k,
127
- v,
128
- attn_mask=mask,
129
- dropout_p=self.dropout if self.training else 0.0,
130
- is_causal=self.causal,
131
- )
132
-
133
- return out
134
-
135
- def forward(self, q, k, v, mask=None):
136
- """
137
- einstein notation
138
- b - batch
139
- h - heads
140
- n, i, j - sequence length (base sequence length, source, target)
141
- d - feature dimension
142
- """
143
-
144
- n, device = q.shape[-2], q.device
145
-
146
- scale = q.shape[-1] ** -0.5
147
-
148
- if self.use_flash:
149
- return self.flash_attn(q, k, v, mask=mask)
150
-
151
- kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
152
-
153
- # similarity
154
-
155
- sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale
156
-
157
- # key padding mask
158
-
159
- if exists(mask):
160
- mask = rearrange(mask, "b j -> b 1 1 j")
161
- sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
162
-
163
- # causal mask
164
-
165
- if self.causal:
166
- causal_mask = self.get_mask(n, device)
167
- sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
168
-
169
- # attention
170
-
171
- attn = sim.softmax(dim=-1)
172
- attn = self.attn_dropout(attn)
173
-
174
- # aggregate values
175
-
176
- out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v)
177
-
178
- return out
179
-
180
-
181
- def Sequential(*mods):
182
- return nn.Sequential(*filter(exists, mods))
183
-
184
-
185
- def exists(x):
186
- return x is not None
187
-
188
-
189
- def default(val, d):
190
- if exists(val):
191
- return val
192
- return d() if callable(d) else d
193
-
194
-
195
- class RMSNorm(nn.Module):
196
- def __init__(self, dim, scale=True, dim_cond=None):
197
- super().__init__()
198
- self.cond = exists(dim_cond)
199
- self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
200
-
201
- self.scale = dim**0.5
202
- self.gamma = nn.Parameter(torch.ones(dim)) if scale else None
203
-
204
- def forward(self, x, cond=None):
205
- gamma = default(self.gamma, 1)
206
- out = F.normalize(x, dim=-1) * self.scale * gamma
207
-
208
- if not self.cond:
209
- return out
210
-
211
- assert exists(cond)
212
- gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1)
213
- gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta))
214
- return out * gamma + beta
215
-
216
-
217
- class CausalConv1d(nn.Conv1d):
218
- def __init__(self, *args, **kwargs):
219
- super().__init__(*args, **kwargs)
220
- (kernel_size,) = self.kernel_size
221
- (dilation,) = self.dilation
222
- (stride,) = self.stride
223
-
224
- assert stride == 1
225
- self.causal_padding = dilation * (kernel_size - 1)
226
-
227
- def forward(self, x):
228
- causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0)
229
- return super().forward(causal_padded_x)
230
-
231
-
232
- class GEGLU(nn.Module):
233
- def forward(self, x):
234
- x, gate = x.chunk(2, dim=-1)
235
- return F.gelu(gate) * x
236
-
237
-
238
- def FeedForward(dim, mult=4, causal_conv=False):
239
- dim_inner = int(dim * mult * 2 / 3)
240
-
241
- conv = None
242
- if causal_conv:
243
- conv = nn.Sequential(
244
- Rearrange("b n d -> b d n"),
245
- CausalConv1d(dim_inner, dim_inner, 3),
246
- Rearrange("b d n -> b n d"),
247
- )
248
-
249
- return Sequential(
250
- nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim)
251
- )
252
-
253
-
254
- class Attention(nn.Module):
255
- def __init__(
256
- self,
257
- dim,
258
- *,
259
- dim_context=None,
260
- causal=False,
261
- dim_head=64,
262
- heads=8,
263
- dropout=0.0,
264
- use_flash=False,
265
- cross_attn_include_queries=False,
266
- ):
267
- super().__init__()
268
- self.scale = dim_head**-0.5
269
- self.heads = heads
270
- self.cross_attn_include_queries = cross_attn_include_queries
271
-
272
- dim_inner = dim_head * heads
273
- dim_context = default(dim_context, dim)
274
-
275
- self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash)
276
- self.to_q = nn.Linear(dim, dim_inner, bias=False)
277
- self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
278
- self.to_out = nn.Linear(dim_inner, dim, bias=False)
279
-
280
- def forward(self, x, context=None, mask=None):
281
- h, has_context = self.heads, exists(context)
282
-
283
- context = default(context, x)
284
-
285
- if has_context and self.cross_attn_include_queries:
286
- context = torch.cat((x, context), dim=-2)
287
-
288
- q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
289
- q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
290
-
291
- out = self.attend(q, k, v, mask=mask)
292
-
293
- out = rearrange(out, "b h n d -> b n (h d)")
294
- return self.to_out(out)
295
-
296
-
297
- class PerceiverResampler(nn.Module):
298
- def __init__(
299
- self,
300
- *,
301
- dim,
302
- depth=2,
303
- dim_context=None,
304
- num_latents=32,
305
- dim_head=64,
306
- heads=8,
307
- ff_mult=4,
308
- use_flash_attn=False,
309
- ):
310
- super().__init__()
311
- dim_context = default(dim_context, dim)
312
-
313
- self.proj_context = (
314
- nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity()
315
- )
316
-
317
- self.latents = nn.Parameter(torch.randn(num_latents, dim))
318
- nn.init.normal_(self.latents, std=0.02)
319
-
320
- self.layers = nn.ModuleList([])
321
- for _ in range(depth):
322
- self.layers.append(
323
- nn.ModuleList(
324
- [
325
- Attention(
326
- dim=dim,
327
- dim_head=dim_head,
328
- heads=heads,
329
- use_flash=use_flash_attn,
330
- cross_attn_include_queries=True,
331
- ),
332
- FeedForward(dim=dim, mult=ff_mult),
333
- ]
334
- )
335
- )
336
-
337
- self.norm = RMSNorm(dim)
338
-
339
- def forward(self, x, mask=None):
340
- batch = x.shape[0]
341
-
342
- x = self.proj_context(x)
343
-
344
- latents = repeat(self.latents, "n d -> b n d", b=batch)
345
-
346
- for attn, ff in self.layers:
347
- latents = attn(latents, x, mask=mask) + latents
348
- latents = ff(latents) + latents
349
-
350
- return self.norm(latents)
351
-
352
-
353
- if __name__ == "__main__":
354
- model = PerceiverResampler(dim=256, dim_context=80)
355
- x = torch.randn(8, 200, 80)
356
- out = model(x)
357
- print(out.shape) # [8, 32, 80]
358
-
359
- num_params = sum(param.numel() for param in model.parameters())
360
- print("{} M".format(num_params / 1e6))
sparktts/modules/speaker/pooling_layers.py DELETED
@@ -1,298 +0,0 @@
1
- # Copyright (c) 2021 Shuai Wang ([email protected])
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- """
15
- Pooling functions to aggregate frame-level deep features
16
- into segment-level speaker embeddings
17
-
18
- High-order statistics are surprisingly effective, TSDP acts similarly to TSTP,
19
- even though we remove the mean statistic, on Voxceleb.
20
- """
21
-
22
- import torch
23
- import torch.nn as nn
24
- import torch.nn.functional as F
25
-
26
-
27
- class TAP(nn.Module):
28
- """
29
- Temporal average pooling, only first-order mean is considered
30
- """
31
-
32
- def __init__(self, in_dim=0, **kwargs):
33
- super(TAP, self).__init__()
34
- self.in_dim = in_dim
35
-
36
- def forward(self, x):
37
- pooling_mean = x.mean(dim=-1)
38
- # To be compatible with 2D input
39
- pooling_mean = pooling_mean.flatten(start_dim=1)
40
- return pooling_mean
41
-
42
- def get_out_dim(self):
43
- self.out_dim = self.in_dim
44
- return self.out_dim
45
-
46
-
47
- class TSDP(nn.Module):
48
- """
49
- Temporal standard deviation pooling, only second-order std is considered
50
- """
51
-
52
- def __init__(self, in_dim=0, **kwargs):
53
- super(TSDP, self).__init__()
54
- self.in_dim = in_dim
55
-
56
- def forward(self, x):
57
- # The last dimension is the temporal axis
58
- pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
59
- pooling_std = pooling_std.flatten(start_dim=1)
60
- return pooling_std
61
-
62
- def get_out_dim(self):
63
- self.out_dim = self.in_dim
64
- return self.out_dim
65
-
66
-
67
- class TSTP(nn.Module):
68
- """
69
- Temporal statistics pooling, concatenate mean and std, which is used in
70
- x-vector
71
- Comment: simple concatenation can not make full use of both statistics
72
- """
73
-
74
- def __init__(self, in_dim=0, **kwargs):
75
- super(TSTP, self).__init__()
76
- self.in_dim = in_dim
77
-
78
- def forward(self, x):
79
- # The last dimension is the temporal axis
80
- pooling_mean = x.mean(dim=-1)
81
- pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
82
- pooling_mean = pooling_mean.flatten(start_dim=1)
83
- pooling_std = pooling_std.flatten(start_dim=1)
84
- stats = torch.cat((pooling_mean, pooling_std), 1)
85
- return stats
86
-
87
- def get_out_dim(self):
88
- self.out_dim = self.in_dim * 2
89
- return self.out_dim
90
-
91
-
92
- class ASTP(nn.Module):
93
- """ Attentive statistics pooling: Channel- and context-dependent
94
- statistics pooling, first used in ECAPA_TDNN.
95
- """
96
-
97
- def __init__(self,
98
- in_dim,
99
- bottleneck_dim=128,
100
- global_context_att=False,
101
- **kwargs):
102
- super(ASTP, self).__init__()
103
- self.in_dim = in_dim
104
- self.global_context_att = global_context_att
105
-
106
- # Use Conv1d with stride == 1 rather than Linear, then we don't
107
- # need to transpose inputs.
108
- if global_context_att:
109
- self.linear1 = nn.Conv1d(
110
- in_dim * 3, bottleneck_dim,
111
- kernel_size=1) # equals W and b in the paper
112
- else:
113
- self.linear1 = nn.Conv1d(
114
- in_dim, bottleneck_dim,
115
- kernel_size=1) # equals W and b in the paper
116
- self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
117
- kernel_size=1) # equals V and k in the paper
118
-
119
- def forward(self, x):
120
- """
121
- x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
122
- or a 4-dimensional tensor in resnet architecture (B,C,F,T)
123
- 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
124
- """
125
- if len(x.shape) == 4:
126
- x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
127
- assert len(x.shape) == 3
128
-
129
- if self.global_context_att:
130
- context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
131
- context_std = torch.sqrt(
132
- torch.var(x, dim=-1, keepdim=True) + 1e-7).expand_as(x)
133
- x_in = torch.cat((x, context_mean, context_std), dim=1)
134
- else:
135
- x_in = x
136
-
137
- # DON'T use ReLU here! ReLU may be hard to converge.
138
- alpha = torch.tanh(
139
- self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
140
- alpha = torch.softmax(self.linear2(alpha), dim=2)
141
- mean = torch.sum(alpha * x, dim=2)
142
- var = torch.sum(alpha * (x**2), dim=2) - mean**2
143
- std = torch.sqrt(var.clamp(min=1e-7))
144
- return torch.cat([mean, std], dim=1)
145
-
146
- def get_out_dim(self):
147
- self.out_dim = 2 * self.in_dim
148
- return self.out_dim
149
-
150
-
151
- class MHASTP(torch.nn.Module):
152
- """ Multi head attentive statistics pooling
153
- Reference:
154
- Self Multi-Head Attention for Speaker Recognition
155
- https://arxiv.org/pdf/1906.09890.pdf
156
- """
157
-
158
- def __init__(self,
159
- in_dim,
160
- layer_num=2,
161
- head_num=2,
162
- d_s=1,
163
- bottleneck_dim=64,
164
- **kwargs):
165
- super(MHASTP, self).__init__()
166
- assert (in_dim % head_num
167
- ) == 0 # make sure that head num can be divided by input_dim
168
- self.in_dim = in_dim
169
- self.head_num = head_num
170
- d_model = int(in_dim / head_num)
171
- channel_dims = [bottleneck_dim for i in range(layer_num + 1)]
172
- if d_s > 1:
173
- d_s = d_model
174
- else:
175
- d_s = 1
176
- self.d_s = d_s
177
- channel_dims[0], channel_dims[-1] = d_model, d_s
178
- heads_att_trans = []
179
- for i in range(self.head_num):
180
- att_trans = nn.Sequential()
181
- for i in range(layer_num - 1):
182
- att_trans.add_module(
183
- 'att_' + str(i),
184
- nn.Conv1d(channel_dims[i], channel_dims[i + 1], 1, 1))
185
- att_trans.add_module('tanh' + str(i), nn.Tanh())
186
- att_trans.add_module(
187
- 'att_' + str(layer_num - 1),
188
- nn.Conv1d(channel_dims[layer_num - 1], channel_dims[layer_num],
189
- 1, 1))
190
- heads_att_trans.append(att_trans)
191
- self.heads_att_trans = nn.ModuleList(heads_att_trans)
192
-
193
- def forward(self, input):
194
- """
195
- input: a 3-dimensional tensor in xvector architecture
196
- or a 4-dimensional tensor in resnet architecture
197
- 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
198
- """
199
- if len(input.shape) == 4: # B x F x T
200
- input = input.reshape(input.shape[0],
201
- input.shape[1] * input.shape[2],
202
- input.shape[3])
203
- assert len(input.shape) == 3
204
- bs, f_dim, t_dim = input.shape
205
- chunks = torch.chunk(input, self.head_num, 1)
206
- # split
207
- chunks_out = []
208
- # for i in range(self.head_num):
209
- # att_score = self.heads_att_trans[i](chunks[i])
210
- for i, layer in enumerate(self.heads_att_trans):
211
- att_score = layer(chunks[i])
212
- alpha = F.softmax(att_score, dim=-1)
213
- mean = torch.sum(alpha * chunks[i], dim=2)
214
- var = torch.sum(alpha * chunks[i]**2, dim=2) - mean**2
215
- std = torch.sqrt(var.clamp(min=1e-7))
216
- chunks_out.append(torch.cat((mean, std), dim=1))
217
- out = torch.cat(chunks_out, dim=1)
218
- return out
219
-
220
- def get_out_dim(self):
221
- self.out_dim = 2 * self.in_dim
222
- return self.out_dim
223
-
224
-
225
- class MQMHASTP(torch.nn.Module):
226
- """ An attentive pooling
227
- Reference:
228
- multi query multi head attentive statistics pooling
229
- https://arxiv.org/pdf/2110.05042.pdf
230
- Args:
231
- in_dim: the feature dimension of input
232
- layer_num: the number of layer in the pooling layer
233
- query_num: the number of querys
234
- head_num: the number of heads
235
- bottleneck_dim: the bottleneck dimension
236
-
237
- SA (H = 1, Q = 1, n = 2, d_s = 1) ref:
238
- https://www.danielpovey.com/files/2018_interspeech_xvector_attention.pdf
239
- MHA (H > 1, Q = 1, n = 1, d_s = 1) ref:
240
- https://arxiv.org/pdf/1906.09890.pdf
241
- AS (H = 1, Q > 1, n = 2, d_s = 1) ref:
242
- https://arxiv.org/pdf/1803.10963.pdf
243
- VSA (H = 1, Q > 1, n = 2, d_s = d_h) ref:
244
- http://www.interspeech2020.org/uploadfile/pdf/Mon-2-10-5.pdf
245
- """
246
-
247
- def __init__(self,
248
- in_dim,
249
- layer_num=2,
250
- query_num=2,
251
- head_num=8,
252
- d_s=2,
253
- bottleneck_dim=64,
254
- **kwargs):
255
- super(MQMHASTP, self).__init__()
256
- self.n_query = nn.ModuleList([
257
- MHASTP(in_dim,
258
- layer_num=layer_num,
259
- head_num=head_num,
260
- d_s=d_s,
261
- bottleneck_dim=bottleneck_dim) for i in range(query_num)
262
- ])
263
- self.query_num = query_num
264
- self.in_dim = in_dim
265
-
266
- def forward(self, input):
267
- """
268
- input: a 3-dimensional tensor in xvector architecture
269
- or a 4-dimensional tensor in resnet architecture
270
- 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
271
- """
272
- if len(input.shape) == 4: # B x F x T
273
- input = input.reshape(input.shape[0],
274
- input.shape[1] * input.shape[2],
275
- input.shape[3])
276
- assert len(input.shape) == 3
277
- res = []
278
- for i, layer in enumerate(self.n_query):
279
- res.append(layer(input))
280
- out = torch.cat(res, dim=-1)
281
- return out
282
-
283
- def get_out_dim(self):
284
- self.out_dim = self.in_dim * 2 * self.query_num
285
- return self.out_dim
286
-
287
-
288
- if __name__ == '__main__':
289
- data = torch.randn(16, 512, 10, 35)
290
- # model = StatisticsPooling()
291
- model = MQMHASTP(512 * 10)
292
- model = MHASTP(512 * 10)
293
- model = MQMHASTP(512 * 10, context=False)
294
- print(model)
295
-
296
- out = model(data)
297
- print(out.shape)
298
- print(model.get_out_dim())
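
The demo above only exercises the multi-head variants; below is a minimal sketch of ASTP, the pooling that the ECAPA_TDNN in this commit instantiates (shapes are illustrative):

import torch
from sparktts.modules.speaker.pooling_layers import ASTP

pool = ASTP(in_dim=512 * 3, global_context_att=True)  # ECAPA_TDNN feeds it 512 * 3 channels
feats = torch.randn(4, 512 * 3, 200)                  # (batch, channels, frames)
emb = pool(feats)                                     # attention-weighted mean and std, concatenated
print(emb.shape, pool.get_out_dim())                  # torch.Size([4, 3072]) 3072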
sparktts/modules/speaker/speaker_encoder.py DELETED
@@ -1,136 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import torch
17
- import torch.nn as nn
18
-
19
- from typing import List, Tuple
20
- from sparktts.modules.fsq.residual_fsq import ResidualFSQ
21
- from sparktts.modules.speaker.ecapa_tdnn import ECAPA_TDNN_GLOB_c512
22
- from sparktts.modules.speaker.perceiver_encoder import PerceiverResampler
23
-
24
- """
25
- x-vector + d-vector
26
- """
27
-
28
-
29
- class SpeakerEncoder(nn.Module):
30
- """
31
-
32
- Args:
33
- input_dim (int): acoustic feature dimension
34
- out_dim (int): output dimension of x-vector and d-vector
35
- latent_dim (int): latent dimension before quantization
36
- token_num (int): sequence length of speaker tokens
37
- fsq_levels (List[int]): number of levels for each quantizer
38
- fsq_num_quantizers (int): number of quantizers
39
-
40
- Return:
41
- speaker_embs: (B, T2, out_dim)
42
- """
43
-
44
- def __init__(
45
- self,
46
- input_dim: int = 100,
47
- out_dim: int = 512,
48
- latent_dim: int = 128,
49
- token_num: int = 32,
50
- fsq_levels: List[int] = [4, 4, 4, 4, 4, 4],
51
- fsq_num_quantizers: int = 1,
52
- ):
53
- super(SpeakerEncoder, self).__init__()
54
-
55
- self.speaker_encoder = ECAPA_TDNN_GLOB_c512(
56
- feat_dim=input_dim, embed_dim=out_dim
57
- )
58
- self.perceiver_sampler = PerceiverResampler(
59
- dim=latent_dim, dim_context=512 * 3, num_latents=token_num
60
- )
61
- self.quantizer = ResidualFSQ(
62
- levels=fsq_levels,
63
- num_quantizers=fsq_num_quantizers,
64
- dim=latent_dim,
65
- is_channel_first=True,
66
- quantize_dropout=False,
67
- )
68
-
69
- self.project = nn.Linear(latent_dim * token_num, out_dim)
70
-
71
- def get_codes_from_indices(self, indices: torch.Tensor) -> torch.Tensor:
72
- zq = self.quantizer.get_codes_from_indices(indices.transpose(1, 2))
73
- return zq.transpose(1, 2)
74
-
75
- def get_indices(self, mels: torch.Tensor) -> torch.Tensor:
76
- mels = mels.transpose(1, 2)
77
- x = self.perceiver_sampler(mels).transpose(1, 2)
78
- zq, indices = self.quantizer(x)
79
- return indices
80
-
81
- def forward(self, mels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
82
- """
83
- Args:
84
- mels: (B, D_mel, T1)
85
-
86
- Return:
87
- x_vector: (B, out_dim)
88
- d_vector: (B, out_dim)
89
- """
90
- # mels = mels.transpose(1,2)
91
-
92
- x_vector, features = self.speaker_encoder(mels, True)
93
- x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
94
- zq, indices = self.quantizer(x)  # zq: (B, latent_dim, T2)
95
- x = zq.reshape(zq.shape[0], -1)
96
- d_vector = self.project(x)
97
-
98
- return x_vector, d_vector
99
-
100
- def tokenize(self, mels: torch.Tensor) -> torch.Tensor:
101
- """tokenize the input mel spectrogram"""
102
- _, features = self.speaker_encoder(mels, True)
103
- x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
104
- zq, indices = self.quantizer(x)
105
- return indices
106
-
107
- def detokenize(self, indices: torch.Tensor) -> torch.Tensor:
108
- """detokenize the input indices to d-vector"""
109
- zq = self.quantizer.get_output_from_indices(indices.transpose(1, 2)).transpose(1, 2)
110
- x = zq.reshape(zq.shape[0], -1)
111
- d_vector = self.project(x)
112
- return d_vector
113
-
114
- if __name__ == "__main__":
115
- model = SpeakerEncoder(
116
- input_dim=100,
117
- latent_dim=128,
118
- token_num=32,
119
- fsq_levels=[4, 4, 4, 4, 4, 4],
120
- fsq_num_quantizers=1,
121
- )
122
- mel = torch.randn(8, 200, 100)
123
- x_vector, d_vector = model(mel)
124
- print("x-vector shape", x_vector.shape)
125
- print("d-vector shape", d_vector.shape)
126
-
127
- indices = model.tokenize(mel)
128
- print("indices shape", indices.shape)
129
- d_vector_post = model.detokenize(indices)
130
- print("d-vector shape", d_vector_post.shape)
131
- if torch.allclose(d_vector_post, d_vector):
132
- print("d-vector post and d-vector are the same")
133
- else:
134
- print("d-vector post and d-vector are different")
135
- num_params = sum(param.numel() for param in model.parameters())
136
- print("{} M".format(num_params / 1e6))
sparktts/modules/vq/factorized_vector_quantize.py DELETED
@@ -1,187 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # Heavily based on https://github.com/lucidrains/vector-quantize-pytorch
17
-
18
-
19
- from typing import Any, Dict
20
-
21
- import torch
22
- import torch.nn as nn
23
- import torch.nn.functional as F
24
- from einops import rearrange
25
- from torch.nn.utils import weight_norm
26
-
27
-
28
- def WNConv1d(*args, **kwargs):
29
- return weight_norm(nn.Conv1d(*args, **kwargs))
30
-
31
-
32
- def ema_inplace(moving_avg, new, decay):
33
- moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
34
-
35
-
36
- class FactorizedVectorQuantize(nn.Module):
37
- def __init__(
38
- self,
39
- input_dim: int,
40
- codebook_size: int,
41
- codebook_dim: int,
42
- commitment: float,
43
- codebook_loss_weight: float = 1.0,
44
- decay: float = 0.99,
45
- threshold_ema_dead_code: float = 2,
46
- momentum: float = 0.99,
47
- **kwargs,
48
- ):
49
- super().__init__()
50
- self.input_dim = input_dim
51
- self.codebook_size = codebook_size
52
- self.codebook_dim = codebook_dim
53
- self.commitment = commitment
54
- self.codebook_loss_weight = codebook_loss_weight
55
- self.decay = decay
56
- self.threshold_ema_dead_code = threshold_ema_dead_code
57
- self.momentum = momentum
58
-
59
- if input_dim != self.codebook_dim:
60
- self.in_project = WNConv1d(input_dim, self.codebook_dim, kernel_size=1)
61
- self.out_project = WNConv1d(self.codebook_dim, input_dim, kernel_size=1)
62
-
63
- else:
64
- self.in_project = nn.Identity()
65
- self.out_project = nn.Identity()
66
-
67
- self.codebook = nn.Embedding(self.codebook_size, self.codebook_dim)
68
- self.register_buffer("cluster_size", torch.zeros(self.codebook_size))
69
-
70
- def forward(self, z: torch.Tensor) -> Dict[str, Any]:
71
- """Quantized the input tensor using a fixed codebook and returns
72
- the corresponding codebook vectors
73
-
74
- Parameters
75
- ----------
76
- z : Tensor[B x D x T]
77
-
78
- Returns
79
- -------
80
- Tensor[B x D x T]
81
- Quantized continuous representation of input
82
- Tensor[1]
83
- Commitment loss to train encoder to predict vectors closer to codebook
84
- entries
85
- Tensor[1]
86
- Codebook loss to update the codebook
87
- Tensor[B x T]
88
- Codebook indices (quantized discrete representation of input)
89
- Tensor[B x D x T]
90
- Projected latents (continuous representation of input before quantization)
91
- """
92
- # transpose since we use linear
93
-
94
- # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
95
- z_e = self.in_project(z)
96
- z_q, indices, dists = self.decode_latents(z_e)
97
-
98
- # track codebook usage statistics
99
- embed_onehot = F.one_hot(indices, self.codebook_size).type(z_e.dtype)
100
- avg_probs = torch.mean(embed_onehot.reshape(-1, self.codebook_size), dim=0)
101
- perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
102
-
103
- active_num = (embed_onehot.sum(0).sum(0) > 0).sum()
104
- if self.training:
105
- # We do the expiry of code at that point as buffers are in sync
106
- # and all the workers will take the same decision.
107
- ema_inplace(self.cluster_size, embed_onehot.sum(0).sum(0), self.decay)
108
- active_num = sum(self.cluster_size > self.threshold_ema_dead_code)
109
-
110
- if self.training:
111
- commit_loss = (
112
- F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
113
- * self.commitment
114
- )
115
-
116
- codebook_loss = (
117
- F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
118
- * self.codebook_loss_weight
119
- )
120
-
121
- else:
122
- commit_loss = torch.zeros(0, device=z.device)
123
- codebook_loss = torch.zeros(0, device=z.device)
124
-
125
- z_q = (
126
- z_e + (z_q - z_e).detach()
127
- ) # noop in forward pass, straight-through gradient estimator in backward pass
128
-
129
- z_q = self.out_project(z_q)
130
-
131
- vq_loss = (commit_loss + codebook_loss).mean()
132
-
133
- return {
134
- "z_q": z_q,
135
- "indices": indices,
136
- "dists": dists,
137
- "vq_loss": vq_loss,
138
- "perplexity": perplexity,
139
- "active_num": active_num.float(),
140
- }
141
-
142
- def vq2emb(self, vq, out_proj=True):
143
- emb = self.embed_code(vq)
144
- if out_proj:
145
- emb = self.out_project(emb)
146
- return emb
147
-
148
- def tokenize(self, z: torch.Tensor) -> torch.Tensor:
149
- """tokenize the input tensor"""
150
- z_e = self.in_project(z)
151
- _, indices, _ = self.decode_latents(z_e)
152
- return indices
153
-
154
- def detokenize(self, indices):
155
- """detokenize the input indices"""
156
- z_q = self.decode_code(indices)
157
- z_q = self.out_project(z_q)
158
- return z_q
159
-
160
- def get_emb(self):
161
- return self.codebook.weight
162
-
163
- def embed_code(self, embed_id):
164
- return F.embedding(embed_id, self.codebook.weight)
165
-
166
- def decode_code(self, embed_id):
167
- return self.embed_code(embed_id).transpose(1, 2)
168
-
169
- def decode_latents(self, latents):
170
- encodings = rearrange(latents, "b d t -> (b t) d")
171
- codebook = self.codebook.weight
172
-
173
- # L2 normalize encodings and codebook
174
- encodings = F.normalize(encodings)
175
- codebook = F.normalize(codebook)
176
-
177
- # Compute euclidean distance between encodings and codebook,
178
- # with L2 normalization, the distance is equal to cosine distance
179
- dist = (
180
- encodings.pow(2).sum(1, keepdim=True)
181
- - 2 * encodings @ codebook.t()
182
- + codebook.pow(2).sum(1, keepdim=True).t()
183
- )
184
- indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
185
- z_q = self.decode_code(indices)
186
-
187
- return z_q, indices, dist
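
A minimal tokenize/detokenize sketch for the quantizer above (constructor values are illustrative, not the released BiCodec configuration):

import torch
from sparktts.modules.vq.factorized_vector_quantize import FactorizedVectorQuantize

vq = FactorizedVectorQuantize(input_dim=256, codebook_size=1024, codebook_dim=8, commitment=0.25)
vq.eval()

z = torch.randn(2, 256, 100)      # (B, D, T) encoder output
tokens = vq.tokenize(z)           # (B, T) codebook indices
recon = vq.detokenize(tokens)     # (B, D, T) reconstruction from the codebook
print(tokens.shape, recon.shape)

During training, forward() additionally returns the commitment and codebook losses, perplexity, and active-code count in its output dict.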
sparktts/utils/__init__.py DELETED
File without changes
sparktts/utils/audio.py DELETED
@@ -1,271 +0,0 @@
1
- # Copyright (c) 2025 SparkAudio
2
- # 2025 Xinsheng Wang ([email protected])
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """
16
- Description:
17
- This script contains a collection of functions designed to handle various
18
- audio processing.
19
- """
20
-
21
- import random
22
- import soxr
23
- import soundfile
24
- import torch
25
- import torchaudio
26
- import numpy as np
27
-
28
- from pathlib import Path
29
- from typing import Tuple
30
- from numpy.lib.stride_tricks import sliding_window_view
31
-
32
-
33
- def audio_volume_normalize(audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
34
- """
35
- Normalize the volume of an audio signal.
36
-
37
- Parameters:
38
- audio (numpy array): Input audio signal array.
39
- coeff (float): Target coefficient for normalization, default is 0.2.
40
-
41
- Returns:
42
- numpy array: The volume-normalized audio signal.
43
- """
44
- # Sort the absolute values of the audio signal
45
- temp = np.sort(np.abs(audio))
46
-
47
- # If the maximum value is less than 0.1, scale the array to have a maximum of 0.1
48
- if temp[-1] < 0.1:
49
- scaling_factor = max(
50
- temp[-1], 1e-3
51
- ) # Prevent division by zero with a small constant
52
- audio = audio / scaling_factor * 0.1
53
-
54
- # Filter out values less than 0.01 from temp
55
- temp = temp[temp > 0.01]
56
- L = temp.shape[0] # Length of the filtered array
57
-
58
- # If there are fewer than or equal to 10 significant values, return the audio without further processing
59
- if L <= 10:
60
- return audio
61
-
62
- # Compute the average of the top 10% to 1% of values in temp
63
- volume = np.mean(temp[int(0.9 * L) : int(0.99 * L)])
64
-
65
- # Normalize the audio to the target coefficient level, clamping the scale factor between 0.1 and 10
66
- audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)
67
-
68
- # Ensure the maximum absolute value in the audio does not exceed 1
69
- max_value = np.max(np.abs(audio))
70
- if max_value > 1:
71
- audio = audio / max_value
72
-
73
- return audio
74
-
75
-
76
- def load_audio(
77
- adfile: Path,
78
- sampling_rate: int = None,
79
- length: int = None,
80
- volume_normalize: bool = False,
81
- segment_duration: int = None,
82
- ) -> np.ndarray:
83
- r"""Load audio file with target sampling rate and lsength
84
-
85
- Args:
86
- adfile (Path): path to audio file.
87
- sampling_rate (int, optional): target sampling rate. Defaults to None.
88
- length (int, optional): target audio length. Defaults to None.
89
- volume_normalize (bool, optional): whether to perform volume normalization. Defaults to False.
90
- segment_duration (int): randomly select a segment with duration of {segment_duration}s.
91
- Defaults to None, which means the whole audio will be used.
92
-
93
- Returns:
94
- audio (np.ndarray): audio
95
- """
96
-
97
- audio, sr = soundfile.read(adfile)
98
- if len(audio.shape) > 1:
99
- audio = audio[:, 0]
100
-
101
- if sampling_rate is not None and sr != sampling_rate:
102
- audio = soxr.resample(audio, sr, sampling_rate, quality="VHQ")
103
- sr = sampling_rate
104
-
105
- if segment_duration is not None:
106
- seg_length = int(sr * segment_duration)
107
- audio = random_select_audio_segment(audio, seg_length)
108
-
109
- # Audio volume normalize
110
- if volume_normalize:
111
- audio = audio_volume_normalize(audio)
112
- # check the audio length
113
- if length is not None:
114
- assert abs(audio.shape[0] - length) < 1000
115
- if audio.shape[0] > length:
116
- audio = audio[:length]
117
- else:
118
- audio = np.pad(audio, (0, int(length - audio.shape[0])))
119
- return audio
120
-
121
-
122
- def random_select_audio_segment(audio: np.ndarray, length: int) -> np.ndarray:
123
- """get an audio segment given the length
124
-
125
- Args:
126
- audio (np.ndarray):
127
- length (int): audio length = sampling_rate * duration
128
- """
129
- if audio.shape[0] < length:
130
- audio = np.pad(audio, (0, int(length - audio.shape[0])))
131
- start_index = random.randint(0, audio.shape[0] - length)
132
- end_index = int(start_index + length)
133
-
134
- return audio[start_index:end_index]
135
-
136
-
137
- def audio_highpass_filter(audio, sample_rate, highpass_cutoff_freq):
138
- """apply highpass fileter to audio
139
-
140
- Args:
141
- audio (np.ndarray):
142
- sample_rate (int):
143
- highpass_cutoff_freq (int):
144
- """
145
-
146
- audio = torchaudio.functional.highpass_biquad(
147
- torch.from_numpy(audio), sample_rate, cutoff_freq=highpass_cutoff_freq
148
- )
149
- return audio.numpy()
150
-
151
-
152
- def stft(
153
- x: torch.Tensor,
154
- fft_size: int,
155
- hop_size: int,
156
- win_length: int,
157
- window: str,
158
- use_complex: bool = False,
159
- ) -> torch.Tensor:
160
- """Perform STFT and convert to magnitude spectrogram.
161
- Args:
162
- x (Tensor): Input signal tensor (B, T).
163
- fft_size (int): FFT size.
164
- hop_size (int): Hop size.
165
- win_length (int): Window length.
166
- window (str): Window function type.
167
- Returns:
168
- Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
169
- """
170
-
171
- x_stft = torch.stft(
172
- x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True
173
- )
174
-
175
- # clamp is needed to avoid nan or inf
176
- if not use_complex:
177
- return torch.sqrt(
178
- torch.clamp(x_stft.real**2 + x_stft.imag**2, min=1e-7, max=1e3)
179
- ).transpose(2, 1)
180
- else:
181
- res = torch.cat([x_stft.real.unsqueeze(1), x_stft.imag.unsqueeze(1)], dim=1)
182
- res = res.transpose(2, 3) # [B, 2, T, F]
183
- return res
184
-
185
-
186
- def detect_speech_boundaries(
187
- wav: np.ndarray,
188
- sample_rate: int,
189
- window_duration: float = 0.1,
190
- energy_threshold: float = 0.01,
191
- margin_factor: int = 2
192
- ) -> Tuple[int, int]:
193
- """Detect the start and end points of speech in an audio signal using RMS energy.
194
-
195
- Args:
196
- wav: Input audio signal array with values in [-1, 1]
197
- sample_rate: Audio sample rate in Hz
198
- window_duration: Duration of detection window in seconds
199
- energy_threshold: RMS energy threshold for speech detection
200
- margin_factor: Factor to determine extra margin around detected boundaries
201
-
202
- Returns:
203
- tuple: (start_index, end_index) of speech segment
204
-
205
- Raises:
206
- ValueError: If the audio contains only silence
207
- """
208
- window_size = int(window_duration * sample_rate)
209
- margin = margin_factor * window_size
210
- step_size = window_size // 10
211
-
212
- # Create sliding windows using stride tricks to avoid loops
213
- windows = sliding_window_view(wav, window_size)[::step_size]
214
-
215
- # Calculate RMS energy for each window
216
- energy = np.sqrt(np.mean(windows ** 2, axis=1))
217
- speech_mask = energy >= energy_threshold
218
-
219
- if not np.any(speech_mask):
220
- raise ValueError("No speech detected in audio (only silence)")
221
-
222
- start = max(0, np.argmax(speech_mask) * step_size - margin)
223
- end = min(len(wav), (len(speech_mask) - 1 - np.argmax(speech_mask[::-1])) * step_size + margin)
224
-
225
- return start, end
226
-
227
-
228
- def remove_silence_on_both_ends(
229
- wav: np.ndarray,
230
- sample_rate: int,
231
- window_duration: float = 0.1,
232
- volume_threshold: float = 0.01
233
- ) -> np.ndarray:
234
- """Remove silence from both ends of an audio signal.
235
-
236
- Args:
237
- wav: Input audio signal array
238
- sample_rate: Audio sample rate in Hz
239
- window_duration: Duration of detection window in seconds
240
- volume_threshold: Amplitude threshold for silence detection
241
-
242
- Returns:
243
- np.ndarray: Audio signal with silence removed from both ends
244
-
245
- Raises:
246
- ValueError: If the audio contains only silence
247
- """
248
- start, end = detect_speech_boundaries(
249
- wav,
250
- sample_rate,
251
- window_duration,
252
- volume_threshold
253
- )
254
- return wav[start:end]
255
-
256
-
257
-
258
- def hertz_to_mel(pitch: float) -> float:
259
- """
260
- Converts a frequency from the Hertz scale to the Mel scale.
261
-
262
- Parameters:
263
- - pitch: float or ndarray
264
- Frequency in Hertz.
265
-
266
- Returns:
267
- - mel: float or ndarray
268
- Frequency in Mel scale.
269
- """
270
- mel = 2595 * np.log10(1 + pitch / 700)
271
- return mel
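
The deleted audio helpers above cover reference-clip padding, random segment selection, highpass filtering, magnitude STFT, RMS-based silence trimming, and the Hz-to-mel conversion `mel = 2595 * log10(1 + f / 700)`. For readers of this commit, the snippet below is a small self-contained sketch (not code from the repository) that reproduces the two simplest pieces on a synthetic signal; the `mel_to_hertz` inverse and the test tone are illustrative additions.

```python
# Standalone illustration: the O'Shaughnessy mel formula used by hertz_to_mel
# above (plus an inverse added here for illustration), and the RMS-energy
# boundary detection idea from detect_speech_boundaries.
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

def hertz_to_mel(pitch):
    return 2595 * np.log10(1 + pitch / 700)

def mel_to_hertz(mel):
    # Inverse of the formula above; not part of the deleted file.
    return 700 * (10 ** (mel / 2595) - 1)

sample_rate = 16000
t = np.linspace(0, 1.0, sample_rate, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 220 * t)
wav = np.concatenate([np.zeros(4000), tone, np.zeros(4000)])  # silence-padded signal

# RMS energy per 100 ms window, stepped at one tenth of the window size.
window_size = int(0.1 * sample_rate)
step = window_size // 10
windows = sliding_window_view(wav, window_size)[::step]
energy = np.sqrt(np.mean(windows ** 2, axis=1))
mask = energy >= 0.01
start = np.argmax(mask) * step                              # first energetic window
end = (len(mask) - 1 - np.argmax(mask[::-1])) * step        # last energetic window
print(f"trimmed span: [{start}, {end}) of {len(wav)} samples")
print(f"220 Hz -> {hertz_to_mel(220):.1f} mel -> {mel_to_hertz(hertz_to_mel(220)):.1f} Hz")
```
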
sparktts/utils/file.py DELETED
@@ -1,221 +0,0 @@
- # Copyright (c) 2025 SparkAudio
- #               2025 Xinsheng Wang ([email protected])
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Description:
-     This script contains a collection of functions designed to handle various
-     file reading and writing operations. It provides utilities to read from files,
-     write data to files, and perform file manipulation tasks.
- """
-
-
- import os
- import json
- import csv
-
- from tqdm import tqdm
- from typing import List, Dict, Any, Set, Union
- from pathlib import Path
- from omegaconf import OmegaConf, DictConfig
-
-
- def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
-     """
-     Resolves the absolute path of a symbolic link.
-
-     Args:
-         symbolic_link_path (Path): The path to the symbolic link.
-
-     Returns:
-         Path: The absolute path that the symbolic link points to.
-     """
-
-     link_directory = os.path.dirname(symbolic_link_path)
-     target_path_relative = os.readlink(symbolic_link_path)
-     return os.path.join(link_directory, target_path_relative)
-
-
- def write_jsonl(metadata: List[dict], file_path: Path) -> None:
-     """Writes a list of dictionaries to a JSONL file.
-
-     Args:
-         metadata : List[dict]
-             A list of dictionaries, each representing a piece of metadata.
-         file_path : Path
-             The file path to save the JSONL file.
-
-     This function writes each dictionary in the list to a new line in the specified file.
-     """
-     with open(file_path, "w", encoding="utf-8") as f:
-         for meta in tqdm(metadata, desc="writing jsonl"):
-             # Convert dictionary to JSON string and write it to the file with a newline
-             json_str = json.dumps(meta, ensure_ascii=False) + "\n"
-             f.write(json_str)
-     print(f"jsonl saved to {file_path}")
-
-
- def read_jsonl(file_path: Path) -> List[dict]:
-     """
-     Reads a JSONL file and returns a list of dictionaries.
-
-     Args:
-         file_path : Path
-             The path to the JSONL file to be read.
-
-     Returns:
-         List[dict]
-             A list of dictionaries parsed from each line of the JSONL file.
-     """
-     metadata = []
-     # Open the file for reading
-     with open(file_path, "r", encoding="utf-8") as f:
-         # Split the file into lines
-         lines = f.read().splitlines()
-     # Process each line
-     for line in lines:
-         # Convert JSON string back to dictionary and append to list
-         meta = json.loads(line)
-         metadata.append(meta)
-     # Return the list of metadata
-     return metadata
-
-
- def read_json_as_jsonl(file_path: Path) -> List[dict]:
-     """Read a JSON file keyed by id and return its records as a list of dicts, sorted by key."""
-     metadata = []
-     with open(file_path, 'r', encoding='utf-8') as infile:
-         data = json.load(infile)
-     for k in sorted(data.keys()):
-         meta = {'index': k}
-         meta.update(data[k])
-         metadata.append(meta)
-     return metadata
-
-
- def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
-     """Decode escaped unicode sequences in all string values of a metadata dict."""
-     processed_meta = {}
-     for k, v in meta.items():
-         if isinstance(v, str):
-             processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
-         else:
-             processed_meta[k] = v
-     return processed_meta
-
-
- def load_config(config_path: Path) -> DictConfig:
-     """Loads a configuration file and optionally merges it with a base configuration.
-
-     Args:
-         config_path (Path): Path to the configuration file.
-     """
-     # Load the initial configuration from the given path
-     config = OmegaConf.load(config_path)
-
-     # Check if there is a base configuration specified and merge if necessary
-     if config.get("base_config", None) is not None:
-         base_config = OmegaConf.load(config["base_config"])
-         config = OmegaConf.merge(base_config, config)
-
-     return config
-
-
- def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
-     """
-     Converts a JSONL file to a CSV file.
-
-     This function reads a JSONL file, determines all unique keys present in the file,
-     and writes the data to a CSV file with columns for all these keys.
-     """
-     all_keys = set()
-     data_rows = []
-
-     # Read the JSONL file once to extract keys and collect data
-     with open(jsonl_file_path, 'r') as file:
-         for line in file:
-             data = json.loads(line.strip())
-             data_rows.append(data)
-             all_keys.update(data.keys())
-
-     # Convert the set of keys to a sorted list for consistent column order
-     sorted_keys = sorted(all_keys)
-
-     # Write the data to a CSV file
-     with open(csv_file_path, 'w', newline='') as csvfile:
-         writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
-
-         # Write the header row
-         writer.writeheader()
-
-         # Write each row of data
-         for data in data_rows:
-             writer.writerow(data)
-
-     print(f"CSV file has been created at {csv_file_path}")
-
-
- def save_metadata(data, filename, headers=None):
-     """
-     Save metadata to a file.
-
-     Args:
-         data (list of dict): Metadata to be saved.
-         filename (str): Name of the file to save the metadata.
-         headers (list of str): The order of column names to be saved; defaults to the keys from the first dictionary in data if not provided.
-     """
-     # Set headers to keys from the first dictionary in data if not explicitly provided
-     if headers is None:
-         headers = list(data[0].keys())
-
-     with open(filename, "w", encoding="utf-8") as file:
-         # Write the headers to the file
-         file.write("|".join(headers) + "\n")
-         for entry in data:
-             # Retrieve values in the order of headers, replacing any '|' characters with a space to prevent formatting errors
-             formatted_values = [str(entry.get(key, "")).replace("|", " ") for key in headers]
-             # Write the formatted values to the file
-             file.write("|".join(formatted_values) + "\n")
-
-
- def read_metadata(filename, headers=None):
-     """
-     Read metadata from a file.
-
-     Args:
-         filename (str): The file from which to read the metadata.
-
-     Returns:
-         list of dict: The metadata read from the file.
-         list of str: The headers used in the file.
-     """
-     with open(filename, "r", encoding="utf-8") as file:
-         lines = file.readlines()
-
-     data = []
-     # Set headers from the first line of the file if not provided
-     if headers is None:
-         headers = lines[0].strip().split("|")
-         lines = lines[1:]
-
-     for line in lines:
-         line = line.strip()
-         # Skip empty lines
-         if not line:
-             continue
-         # Split the line by '|' and pair with headers to form a dictionary
-         entry_data = dict(zip(headers, line.split("|")))
-         data.append(entry_data)
-
-     return data, headers
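
The removed sparktts/utils/file.py bundled several small I/O helpers: JSONL read/write, JSONL-to-CSV conversion, OmegaConf config loading with optional `base_config` merging, and a simple pipe-separated metadata format. For reference, the following self-contained sketch reproduces the pipe-separated round trip that `save_metadata` writes and `read_metadata` parses; the file name and example records are invented for this illustration.

```python
# Illustration only: the pipe-separated metadata layout used by the deleted
# save_metadata/read_metadata helpers, re-implemented with the stdlib so the
# example runs without the removed module.
records = [
    {"utt_id": "0001", "text": "hello world", "speaker": "spk_a"},
    {"utt_id": "0002", "text": "good morning", "speaker": "spk_b"},
]
headers = list(records[0].keys())

# Write: one header row, then one '|'-joined row per record (as save_metadata did).
with open("metadata.txt", "w", encoding="utf-8") as f:
    f.write("|".join(headers) + "\n")
    for entry in records:
        f.write("|".join(str(entry.get(k, "")).replace("|", " ") for k in headers) + "\n")

# Read back: split the header line, then zip each data row into a dict (as read_metadata did).
with open("metadata.txt", "r", encoding="utf-8") as f:
    lines = f.read().splitlines()
parsed = [dict(zip(lines[0].split("|"), line.split("|"))) for line in lines[1:] if line]
assert parsed == records  # the round trip preserves the string fields
```
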
sparktts/utils/parse_options.sh DELETED
@@ -1,97 +0,0 @@
- #!/bin/bash
-
- # Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
- #                Arnab Ghoshal, Karel Vesely
-
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #   http://www.apache.org/licenses/LICENSE-2.0
- #
- # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
- # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
- # MERCHANTABLITY OR NON-INFRINGEMENT.
- # See the Apache 2 License for the specific language governing permissions and
- # limitations under the License.
-
-
- # Parse command-line options.
- # To be sourced by another script (as in ". parse_options.sh").
- # Option format is: --option-name arg
- # and shell variable "option_name" gets set to value "arg."
- # The exception is --help, which takes no arguments, but prints the
- # $help_message variable (if defined).
-
-
- ###
- ### The --config file options have lower priority to command line
- ### options, so we need to import them first...
- ###
-
- # Now import all the configs specified by command-line, in left-to-right order
- # for ((argpos=1; argpos<$#; argpos++)); do
- #   if [ "${!argpos}" == "--config" ]; then
- #     argpos_plus1=$((argpos+1))
- #     config=${!argpos_plus1}
- #     [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
- #     . $config  # source the config file.
- #   fi
- # done
-
-
- ###
- ### Now we process the command line options
- ###
- while true; do
-   [ -z "${1:-}" ] && break;  # break if there are no arguments
-   case "$1" in
-     # If the enclosing script is called with --help option, print the help
-     # message and exit. Scripts should put help messages in $help_message
-     --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-                else printf "$help_message\n" 1>&2 ; fi;
-                exit 0 ;;
-     --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-            exit 1 ;;
-     # If the first command-line argument begins with "--" (e.g. --foo-bar),
-     # then work out the variable name as $name, which will equal "foo_bar".
-     --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-       # Next we test whether the variable in question is undefined -- if so it's
-       # an invalid option and we die. Note: $0 evaluates to the name of the
-       # enclosing script.
-       # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-       # is undefined. We then have to wrap this test inside "eval" because
-       # foo_bar is itself inside a variable ($name).
-       eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-       oldval="`eval echo \\$$name`";
-       # Work out whether we seem to be expecting a Boolean argument.
-       if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-         was_bool=true;
-       else
-         was_bool=false;
-       fi
-
-       # Set the variable to the right value-- the escaped quotes make it work if
-       # the option had spaces, like --cmd "queue.pl -sync y"
-       eval $name=\"$2\";
-
-       # Check that Boolean-valued arguments are really Boolean.
-       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-         exit 1;
-       fi
-       shift 2;
-       ;;
-     *) break;
-   esac
- done
-
-
- # Check for an empty argument to the --cmd option, which can easily occur as a
- # result of scripting errors.
- [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
- true; # so this script returns exit code 0.
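
parse_options.sh is the Kaldi-style helper that shell scripts source so that `--option-name value` arguments set a matching `option_name` variable, with special handling for `--help` and `--config`. With the shell recipes removed, the same convention on the Python side is usually covered by argparse; the sketch below is a generic illustration, and its flag names and defaults are not taken from this repository.

```python
# Generic argparse sketch of the "--option-name value" convention that
# parse_options.sh implemented for shell scripts. The flags and defaults
# below are illustrative only.
import argparse

parser = argparse.ArgumentParser(description="Example of --name value options")
parser.add_argument("--model-dir", type=str, default="pretrained_models",
                    help="becomes args.model_dir, like option_name in the shell parser")
parser.add_argument("--device", type=int, default=0)
parser.add_argument("--share", action="store_true",
                    help="boolean flag, analogous to the parser's true/false options")
args = parser.parse_args()
print(args.model_dir, args.device, args.share)
```
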
sparktts/utils/token_parser.py DELETED
@@ -1,187 +0,0 @@
- TASK_TOKEN_MAP = {
-     "vc": "<|task_vc|>",
-     "tts": "<|task_tts|>",
-     "asr": "<|task_asr|>",
-     "s2s": "<|task_s2s|>",
-     "t2s": "<|task_t2s|>",
-     "understand": "<|task_understand|>",
-     "caption": "<|task_cap|>",
-     "controllable_tts": "<|task_controllable_tts|>",
-     "prompt_tts": "<|task_prompt_tts|>",
-     "speech_edit": "<|task_edit|>",
- }
-
- LEVELS_MAP = {
-     "very_low": 0,
-     "low": 1,
-     "moderate": 2,
-     "high": 3,
-     "very_high": 4,
- }
-
- LEVELS_MAP_UI = {
-     1: 'very_low',
-     2: 'low',
-     3: 'moderate',
-     4: 'high',
-     5: 'very_high'
- }
-
- GENDER_MAP = {
-     "female": 0,
-     "male": 1,
- }
-
- AGE_MAP = {"Child": 0, "Teenager": 1, "Youth-Adult": 2, "Middle-aged": 3, "Elderly": 4}
-
- EMO_MAP = {
-     "UNKNOWN": 0,
-     "NEUTRAL": 1,
-     "ANGRY": 2,
-     "HAPPY": 3,
-     "SAD": 4,
-     "FEARFUL": 5,
-     "DISGUSTED": 6,
-     "SURPRISED": 7,
-     "SARCASTIC": 8,
-     "EXCITED": 9,
-     "SLEEPY": 10,
-     "CONFUSED": 11,
-     "EMPHASIS": 12,
-     "LAUGHING": 13,
-     "SINGING": 14,
-     "WORRIED": 15,
-     "WHISPER": 16,
-     "ANXIOUS": 17,
-     "NO-AGREEMENT": 18,
-     "APOLOGETIC": 19,
-     "CONCERNED": 20,
-     "ENUNCIATED": 21,
-     "ASSERTIVE": 22,
-     "ENCOURAGING": 23,
-     "CONTEMPT": 24,
- }
-
-
- class TokenParser:
-     """Turn attribute labels of a person into special tokens."""
-
-     def __init__(self):
-         pass
-
-     @staticmethod
-     def age(age: str) -> str:
-         """Turn age label into special token."""
-         age_id = AGE_MAP[age]
-         return f"<|age_{age_id}|>"
-
-     @staticmethod
-     def gender(gender: str) -> str:
-         """Turn gender label into special token."""
-         gender_id = GENDER_MAP[gender]
-         return f"<|gender_{gender_id}|>"
-
-     @staticmethod
-     def mel_value(mel: int):
-         """Turn special token of mel scale pitch."""
-         mel = max(0, int(mel))
-         mel = min(1000, int(mel))
-         return f"<|pitch_value_{mel}|>"
-
-     @staticmethod
-     def mel_level(level: str):
-         """Turn special token of mel level."""
-         level_tag = LEVELS_MAP[level]
-         return f"<|pitch_label_{level_tag}|>"
-
-     @staticmethod
-     def pitch_var_value(pitch_std: int):
-         """Turn special token of pitch_std value."""
-         assert isinstance(pitch_std, int)
-         pitch_std = max(0, int(pitch_std))
-         pitch_std = min(10, int(pitch_std))
-         return f"<|pitch_var_value_{pitch_std}|>"
-
-     @staticmethod
-     def pitch_var_level(level: str):
-         """Turn special token of pitch std level."""
-         level_tag = LEVELS_MAP[level]
-         return f"<|pitch_var_label_{level_tag}|>"
-
-     @staticmethod
-     def loudness_value(loudness: int):
-         """Turn special token of loudness value [0, 30]."""
-         assert loudness >= 0
-         loudness = max(0, int(loudness))
-         loudness = min(30, int(loudness))
-         return f"<|loudness_value_{loudness}|>"
-
-     @staticmethod
-     def loudness_level(level: str):
-         """Turn special token of loudness level."""
-         level_tag = LEVELS_MAP[level]
-         return f"<|loudness_label_{level_tag}|>"
-
-     @staticmethod
-     def speed_value(speed: int):
-         """Turn special token of speed value."""
-         speed = max(0, int(speed))
-         speed = min(10, int(speed))
-         return f"<|speed_value_{speed}|>"
-
-     @staticmethod
-     def speed_level(level: str):
-         """Turn special token of speed level."""
-         level_tag = LEVELS_MAP[level]
-         return f"<|speed_label_{level_tag}|>"
-
-     @staticmethod
-     def task(task: str) -> str:
-         """Turn special token of task."""
-         assert task in TASK_TOKEN_MAP.keys()
-
-         return TASK_TOKEN_MAP[task]
-
-     @staticmethod
-     def emotion(emotion: str):
-         emo_id = EMO_MAP[emotion]
-
-         return f"<|emotion_{emo_id}|>"
-
-
- # test
- if __name__ == "__main__":
-     from transformers import AutoTokenizer
-
-     tokenizer = AutoTokenizer.from_pretrained(
-         "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
-     )
-
-     tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
-     ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
-     genders = ["female", "female", "female", "male", "male"]
-     mels = [100, 200, 300, 400, 500]
-     mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
-     loudnesses = [1, 10, 23, 19, 30]
-     loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
-     emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]
-
-     for i in range(5):
-         task = TokenParser.task(tasks[i])
-         age = TokenParser.age(ages[i])
-         gender = TokenParser.gender(genders[i])
-         mel = TokenParser.mel_value(mels[i])
-         mel_level = TokenParser.mel_level(mel_levels[i])
-         loudness = TokenParser.loudness_value(loudnesses[i])
-         loudness_level = TokenParser.loudness_level(loudness_levels[i])
-         emotion = TokenParser.emotion(emotions[i])
-         inputs = [task, age, gender, mel, mel_level, loudness, loudness_level, emotion]
-         inputs = "".join(inputs)
-         ids = tokenizer.encode(inputs, add_special_tokens=False)
-         print(ids)
-         print("decode", tokenizer.decode(ids))
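
The removed token_parser.py defined the label-to-special-token maps (task, gender, age, pitch, loudness, speed, emotion) used to condition the language model, plus a `TokenParser` with one static method per attribute. The sketch below is a self-contained illustration of how such a control prefix is assembled; the token spellings follow the maps and f-strings in the deleted file, while the attribute values and the helper name are made up for this example.

```python
# Illustration: assembling a control prefix from the special-token formats
# defined in the deleted file. Only a subset of the maps is inlined here.
TASK_TOKEN_MAP = {"controllable_tts": "<|task_controllable_tts|>"}
GENDER_MAP = {"female": 0, "male": 1}
LEVELS_MAP = {"very_low": 0, "low": 1, "moderate": 2, "high": 3, "very_high": 4}

def control_prefix(task: str, gender: str, pitch_level: str, speed_level: str) -> str:
    # Concatenate the per-attribute tokens in a fixed order.
    return "".join([
        TASK_TOKEN_MAP[task],
        f"<|gender_{GENDER_MAP[gender]}|>",
        f"<|pitch_label_{LEVELS_MAP[pitch_level]}|>",
        f"<|speed_label_{LEVELS_MAP[speed_level]}|>",
    ])

print(control_prefix("controllable_tts", "female", "moderate", "high"))
# -> <|task_controllable_tts|><|gender_0|><|pitch_label_2|><|speed_label_3|>
```
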
webui.py CHANGED
@@ -265,5 +265,6 @@ if __name__ == "__main__":
     # Launch Gradio with the specified server name and port
     demo.launch(
         server_name=args.server_name,
-        server_port=args.server_port
-    )
+        server_port=args.server_port,
+        share=True
+    )
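
The only functional change to webui.py is the added `share=True`, which asks Gradio to open a temporary public share link in addition to serving locally on `server_name:server_port`. If the public tunnel should be opt-in rather than always on, a common pattern is to gate it behind an environment variable; the sketch below is a standalone illustration, and the `SPARKTTS_SHARE` variable is invented for this example rather than something the repository reads.

```python
# Sketch only: a minimal Gradio app where the public share link is opt-in.
# "SPARKTTS_SHARE" is an invented environment variable, not part of the repo.
import os
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("Spark-TTS demo placeholder")

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=os.environ.get("SPARKTTS_SHARE", "0") == "1",  # public link only when requested
)
```

This keeps local runs private by default while still allowing a one-line opt-in to the public link.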