Spaces: Running on T4
update to the current version
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- Architectures/ToucanTTS/StochasticToucanTTS/README.md +0 -1
- Architectures/ToucanTTS/StochasticToucanTTS/StochasticToucanTTS.py +0 -493
- Architectures/ToucanTTS/StochasticToucanTTS/StochasticVariancePredictor.py +0 -440
- Architectures/__init__.py +0 -0
- InferenceInterfaces/ControllableInterface.py +25 -18
- InferenceInterfaces/ToucanTTSInterface.py +73 -72
- InferenceInterfaces/UtteranceCloner.py +8 -6
- InferenceInterfaces/audioseal_wm_16bits.yaml +0 -39
- {Architectures → Modules}/Aligner/Aligner.py +27 -31
- {Architectures → Modules}/Aligner/CodecAlignerDataset.py +57 -14
- {Architectures → Modules}/Aligner/README.md +0 -0
- {Architectures → Modules}/Aligner/Reconstructor.py +8 -15
- {Architectures → Modules}/Aligner/__init__.py +0 -0
- {Architectures → Modules}/Aligner/autoaligner_train_loop.py +4 -2
- {Architectures → Modules}/ControllabilityGAN/GAN.py +23 -10
- {Architectures → Modules}/ControllabilityGAN/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/dataset/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/__init__.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py +0 -0
- {Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py +2 -2
- {Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py +2 -2
- {Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py +4 -4
- {Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py +6 -11
- {Architectures → Modules}/EmbeddingModel/GST.py +1 -1
- {Architectures → Modules}/EmbeddingModel/README.md +0 -0
- {Architectures → Modules}/EmbeddingModel/StyleEmbedding.py +2 -2
- {Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py +0 -0
- {Architectures → Modules}/EmbeddingModel/__init__.py +0 -0
- {Architectures → Modules}/GeneralLayers/Attention.py +0 -0
- {Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py +0 -0
- {Architectures → Modules}/GeneralLayers/Conformer.py +29 -18
- {Architectures → Modules}/GeneralLayers/Convolution.py +1 -1
- {Architectures → Modules}/GeneralLayers/DurationPredictor.py +3 -3
- {Architectures → Modules}/GeneralLayers/EncoderLayer.py +1 -1
- {Architectures → Modules}/GeneralLayers/LayerNorm.py +0 -0
- {Architectures → Modules}/GeneralLayers/LengthRegulator.py +0 -0
- {Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py +0 -0
- {Architectures → Modules}/GeneralLayers/MultiSequential.py +0 -0
- {Architectures → Modules}/GeneralLayers/PositionalEncoding.py +0 -0
- {Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py +0 -0
- {Architectures → Modules}/GeneralLayers/README.md +0 -0
- {Architectures → Modules}/GeneralLayers/ResidualBlock.py +0 -0
- {Architectures → Modules}/GeneralLayers/ResidualStack.py +0 -0
- {Architectures → Modules}/GeneralLayers/STFT.py +0 -0
- {Architectures → Modules}/GeneralLayers/Swish.py +0 -0
- {Architectures → Modules}/GeneralLayers/VariancePredictor.py +3 -3
- {Architectures → Modules}/GeneralLayers/__init__.py +0 -0
- {Architectures → Modules}/README.md +0 -0
- {Architectures → Modules}/ToucanTTS/CodecDiscriminator.py +0 -0
Architectures/ToucanTTS/StochasticToucanTTS/README.md
DELETED
@@ -1 +0,0 @@
This is an experimental version of the TTS that uses normalizing flows to predict the prosody explicitly, so that we can still have the controllability of the explicit prosody predictors, however a much better naturalness and livelyness than what we get from a deterministic predictor.
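The deleted README above describes the core idea of this experimental branch: the variance predictors are normalizing flows, so prosody is sampled rather than regressed, while the per-phone values stay exposed for control. As a rough illustration of what that means in use (not part of this commit; `model`, `phones`, `spk_emb` and `lang_id` are placeholder names for a trained StochasticToucanTTS instance, a prepared articulatory phone tensor, a speaker embedding and a language ID tensor), two inference calls draw two different prosody samples:

import torch

# Each call samples fresh duration, pitch and energy values from the flows, so
# repeated synthesis of the same sentence varies naturally instead of collapsing
# to one average prosody contour, which is the point the README makes.
with torch.inference_mode():
    for _ in range(2):
        before, after, durations, pitch, energy = model.inference(phones,
                                                                  utterance_embedding=spk_emb,
                                                                  lang_id=lang_id,
                                                                  return_duration_pitch_energy=True)
        # the sampled per-phone values are returned explicitly, so they can still
        # be inspected, plotted, or rescaled for controllable prosody
        print(durations.sum().item(), pitch.mean().item(), energy.mean().item())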
Architectures/ToucanTTS/StochasticToucanTTS/StochasticToucanTTS.py
DELETED
@@ -1,493 +0,0 @@
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.nn import Tanh

from Architectures.GeneralLayers.Conformer import Conformer
from Architectures.GeneralLayers.LengthRegulator import LengthRegulator
from Architectures.ToucanTTS.Glow import Glow
from Architectures.ToucanTTS.StochasticToucanTTS.StochasticToucanTTSLoss import StochasticToucanTTSLoss
from Architectures.ToucanTTS.StochasticToucanTTS.StochasticVariancePredictor import StochasticVariancePredictor
from Preprocessing.articulatory_features import get_feature_to_index_lookup
from Utility.utils import initialize
from Utility.utils import make_non_pad_mask
from Utility.utils import make_pad_mask


class StochasticToucanTTS(torch.nn.Module):
    """
    StochasticToucanTTS module, which is mostly just a FastSpeech 2 module,
    but with lots of designs from different architectures accumulated
    and some major components added to put a large focus on multilinguality.

    Original contributions:
    - Inputs are configurations of the articulatory tract
    - Word boundaries are modeled explicitly in the encoder end removed before the decoder
    - Speaker embedding conditioning is derived from GST and Adaspeech 4
    - Responsiveness of variance predictors to utterance embedding is increased through conditional layer norm
    - The final output receives a GAN discriminator feedback signal
    - Stochastic Duration Prediction through a normalizing flow
    - Stochastic Pitch Prediction through a normalizing flow
    - Stochastic Energy prediction through a normalizing flow

    Contributions inspired from elsewhere:
    - The PostNet is also a normalizing flow, like in PortaSpeech
    - Pitch and energy values are averaged per-phone, as in FastPitch to enable great controllability
    - The encoder and decoder are Conformers

    """

    def __init__(self,
                 # network structure related
                 input_feature_dimensions=62,
                 output_spectrogram_channels=80,
                 attention_dimension=192,
                 attention_heads=4,
                 positionwise_conv_kernel_size=1,
                 use_scaled_positional_encoding=True,
                 init_type="xavier_uniform",
                 use_macaron_style_in_conformer=True,
                 use_cnn_in_conformer=True,

                 # encoder
                 encoder_layers=6,
                 encoder_units=1536,
                 encoder_normalize_before=True,
                 encoder_concat_after=False,
                 conformer_encoder_kernel_size=7,
                 transformer_enc_dropout_rate=0.2,
                 transformer_enc_positional_dropout_rate=0.2,
                 transformer_enc_attn_dropout_rate=0.2,

                 # decoder
                 decoder_layers=6,
                 decoder_units=1536,
                 decoder_concat_after=False,
                 conformer_decoder_kernel_size=31,
                 decoder_normalize_before=True,
                 transformer_dec_dropout_rate=0.2,
                 transformer_dec_positional_dropout_rate=0.2,
                 transformer_dec_attn_dropout_rate=0.2,

                 # duration predictor
                 duration_predictor_layers=3,
                 duration_predictor_chans=256,
                 duration_predictor_kernel_size=3,
                 duration_predictor_dropout_rate=0.2,

                 # pitch predictor
                 pitch_embed_kernel_size=1,
                 pitch_embed_dropout=0.0,

                 # energy predictor
                 energy_embed_kernel_size=1,
                 energy_embed_dropout=0.0,

                 # additional features
                 utt_embed_dim=192,
                 lang_embs=8000):
        super().__init__()

        self.input_feature_dimensions = input_feature_dimensions
        self.output_spectrogram_channels = output_spectrogram_channels
        self.attention_dimension = attention_dimension
        self.use_scaled_pos_enc = use_scaled_positional_encoding
        self.multilingual_model = lang_embs is not None
        self.multispeaker_model = utt_embed_dim is not None

        articulatory_feature_embedding = Sequential(Linear(input_feature_dimensions, 100), Tanh(), Linear(100, attention_dimension))
        self.encoder = Conformer(conformer_type="encoder",
                                 attention_dim=attention_dimension,
                                 attention_heads=attention_heads,
                                 linear_units=encoder_units,
                                 num_blocks=encoder_layers,
                                 input_layer=articulatory_feature_embedding,
                                 dropout_rate=transformer_enc_dropout_rate,
                                 positional_dropout_rate=transformer_enc_positional_dropout_rate,
                                 attention_dropout_rate=transformer_enc_attn_dropout_rate,
                                 normalize_before=encoder_normalize_before,
                                 concat_after=encoder_concat_after,
                                 positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                                 macaron_style=use_macaron_style_in_conformer,
                                 use_cnn_module=use_cnn_in_conformer,
                                 cnn_module_kernel=conformer_encoder_kernel_size,
                                 zero_triu=False,
                                 utt_embed=utt_embed_dim,
                                 lang_embs=lang_embs,
                                 use_output_norm=True)

        self.duration_flow = StochasticVariancePredictor(in_channels=attention_dimension,
                                                         kernel_size=3,
                                                         p_dropout=0.5,
                                                         n_flows=5,
                                                         conditioning_signal_channels=utt_embed_dim)

        self.pitch_flow = StochasticVariancePredictor(in_channels=attention_dimension,
                                                      kernel_size=5,
                                                      p_dropout=0.5,
                                                      n_flows=6,
                                                      conditioning_signal_channels=utt_embed_dim)

        self.energy_flow = StochasticVariancePredictor(in_channels=attention_dimension,
                                                       kernel_size=3,
                                                       p_dropout=0.5,
                                                       n_flows=3,
                                                       conditioning_signal_channels=utt_embed_dim)

        self.pitch_embed = Sequential(torch.nn.Conv1d(in_channels=1,
                                                      out_channels=attention_dimension,
                                                      kernel_size=pitch_embed_kernel_size,
                                                      padding=(pitch_embed_kernel_size - 1) // 2),
                                      torch.nn.Dropout(pitch_embed_dropout))

        self.energy_embed = Sequential(torch.nn.Conv1d(in_channels=1, out_channels=attention_dimension, kernel_size=energy_embed_kernel_size,
                                                       padding=(energy_embed_kernel_size - 1) // 2),
                                       torch.nn.Dropout(energy_embed_dropout))

        self.length_regulator = LengthRegulator()

        self.decoder = Conformer(conformer_type="decoder",
                                 attention_dim=attention_dimension,
                                 attention_heads=attention_heads,
                                 linear_units=decoder_units,
                                 num_blocks=decoder_layers,
                                 input_layer=None,
                                 dropout_rate=transformer_dec_dropout_rate,
                                 positional_dropout_rate=transformer_dec_positional_dropout_rate,
                                 attention_dropout_rate=transformer_dec_attn_dropout_rate,
                                 normalize_before=decoder_normalize_before,
                                 concat_after=decoder_concat_after,
                                 positionwise_conv_kernel_size=positionwise_conv_kernel_size,
                                 macaron_style=use_macaron_style_in_conformer,
                                 use_cnn_module=use_cnn_in_conformer,
                                 cnn_module_kernel=conformer_decoder_kernel_size,
                                 use_output_norm=False,
                                 utt_embed=utt_embed_dim)

        self.feat_out = Linear(attention_dimension, output_spectrogram_channels)

        self.post_flow = Glow(
            in_channels=output_spectrogram_channels,
            hidden_channels=192,  # post_glow_hidden
            kernel_size=3,  # post_glow_kernel_size
            dilation_rate=1,
            n_blocks=12,  # post_glow_n_blocks (original 12 in paper)
            n_layers=3,  # post_glow_n_block_layers (original 3 in paper)
            n_split=4,
            n_sqz=2,
            text_condition_channels=attention_dimension,
            share_cond_layers=False,  # post_share_cond_layers
            share_wn_layers=4,
            sigmoid_scale=False,
            condition_integration_projection=torch.nn.Conv1d(output_spectrogram_channels + attention_dimension, attention_dimension, 5, padding=2)
        )

        # initialize parameters
        self._reset_parameters(init_type=init_type)
        if lang_embs is not None:
            torch.nn.init.normal_(self.encoder.language_embedding.weight, mean=0, std=attention_dimension ** -0.5)

        self.criterion = StochasticToucanTTSLoss()

    def forward(self,
                text_tensors,
                text_lengths,
                gold_speech,
                speech_lengths,
                gold_durations,
                gold_pitch,
                gold_energy,
                utterance_embedding,
                return_feats=False,
                lang_ids=None,
                run_glow=True
                ):
        """
        Args:
            return_feats (Boolean): whether to return the predicted spectrogram
            text_tensors (LongTensor): Batch of padded text vectors (B, Tmax).
            text_lengths (LongTensor): Batch of lengths of each input (B,).
            gold_speech (Tensor): Batch of padded target features (B, Lmax, odim).
            speech_lengths (LongTensor): Batch of the lengths of each target (B,).
            gold_durations (LongTensor): Batch of padded durations (B, Tmax + 1).
            gold_pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
            gold_energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
            run_glow (Boolean): Whether to run the PostNet. There should be a warmup phase in the beginning.
            lang_ids (LongTensor): The language IDs used to access the language embedding table, if the model is multilingual
            utterance_embedding (Tensor): Batch of embeddings to condition the TTS on, if the model is multispeaker
        """
        before_outs, \
        after_outs, \
        duration_loss, \
        pitch_loss, \
        energy_loss, \
        glow_loss = self._forward(text_tensors=text_tensors,
                                  text_lengths=text_lengths,
                                  gold_speech=gold_speech,
                                  speech_lengths=speech_lengths,
                                  gold_durations=gold_durations,
                                  gold_pitch=gold_pitch,
                                  gold_energy=gold_energy,
                                  utterance_embedding=utterance_embedding,
                                  is_inference=False,
                                  lang_ids=lang_ids,
                                  run_glow=run_glow)

        # calculate loss
        l1_loss = self.criterion(after_outs=after_outs,
                                 before_outs=before_outs,
                                 gold_spectrograms=gold_speech,
                                 spectrogram_lengths=speech_lengths,
                                 text_lengths=text_lengths)

        if return_feats:
            if after_outs is None:
                after_outs = before_outs
            return l1_loss, duration_loss, pitch_loss, energy_loss, glow_loss, after_outs
        return l1_loss, duration_loss, pitch_loss, energy_loss, glow_loss

    def _forward(self,
                 text_tensors,
                 text_lengths,
                 gold_speech=None,
                 speech_lengths=None,
                 gold_durations=None,
                 gold_pitch=None,
                 gold_energy=None,
                 is_inference=False,
                 utterance_embedding=None,
                 lang_ids=None,
                 run_glow=True):

        if not self.multilingual_model:
            lang_ids = None

        if not self.multispeaker_model:
            utterance_embedding = None

        # encoding the texts
        text_masks = make_non_pad_mask(text_lengths, device=text_lengths.device).unsqueeze(-2)
        padding_masks = make_pad_mask(text_lengths, device=text_lengths.device)
        encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids)

        if is_inference:
            variance_mask = torch.ones(size=[text_tensors.size(1)], device=text_tensors.device)

            # predicting pitch
            pitch_predictions = self.pitch_flow(encoded_texts.transpose(1, 2), variance_mask, w=None, g=utterance_embedding.unsqueeze(-1), reverse=True).squeeze(-1).transpose(1, 2)
            for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):
                if phoneme_vector[get_feature_to_index_lookup()["voiced"]] == 0:
                    pitch_predictions[0][phoneme_index] = 0.0
            embedded_pitch_curve = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2)
            encoded_texts = encoded_texts + embedded_pitch_curve

            # predicting energy
            energy_predictions = self.energy_flow(encoded_texts.transpose(1, 2), variance_mask, w=None, g=utterance_embedding.unsqueeze(-1), reverse=True).squeeze(-1).transpose(1, 2)
            embedded_energy_curve = self.energy_embed(energy_predictions.transpose(1, 2)).transpose(1, 2)
            encoded_texts = encoded_texts + embedded_energy_curve

            # predicting durations
            predicted_durations = self.duration_flow(encoded_texts.transpose(1, 2), variance_mask, w=None, g=utterance_embedding.unsqueeze(-1), reverse=True).squeeze(-1).transpose(1, 2).squeeze(-1)
            predicted_durations = torch.ceil(torch.exp(predicted_durations)).long()
            for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):
                if phoneme_vector[get_feature_to_index_lookup()["word-boundary"]] == 1:
                    predicted_durations[0][phoneme_index] = 0

            # predicting durations for text and upsampling accordingly
            upsampled_enriched_encoded_texts = self.length_regulator(encoded_texts, predicted_durations)

        else:
            # learning to predict pitch
            idx = gold_pitch != 0
            pitch_mask = torch.logical_and(text_masks, idx.transpose(1, 2))
            scaled_pitch_targets = gold_pitch.detach().clone()
            scaled_pitch_targets[idx] = torch.exp(gold_pitch[idx])  # we scale up, so that the log in the flow can handle the value ranges better.
            pitch_flow_loss = torch.sum(self.pitch_flow(encoded_texts.transpose(1, 2).detach(), pitch_mask, w=scaled_pitch_targets.transpose(1, 2), g=utterance_embedding.unsqueeze(-1), reverse=False))
            pitch_flow_loss = torch.sum(pitch_flow_loss / torch.sum(pitch_mask))  # weighted masking
            embedded_pitch_curve = self.pitch_embed(gold_pitch.transpose(1, 2)).transpose(1, 2)
            encoded_texts = encoded_texts + embedded_pitch_curve

            # learning to predict energy
            idx = gold_energy != 0
            energy_mask = torch.logical_and(text_masks, idx.transpose(1, 2))
            scaled_energy_targets = gold_energy.detach().clone()
            scaled_energy_targets[idx] = torch.exp(gold_energy[idx])  # we scale up, so that the log in the flow can handle the value ranges better.
            energy_flow_loss = torch.sum(self.energy_flow(encoded_texts.transpose(1, 2).detach(), energy_mask, w=scaled_energy_targets.transpose(1, 2), g=utterance_embedding.unsqueeze(-1), reverse=False))
            energy_flow_loss = torch.sum(energy_flow_loss / torch.sum(energy_mask))  # weighted masking
            embedded_energy_curve = self.energy_embed(gold_energy.transpose(1, 2)).transpose(1, 2)
            encoded_texts = encoded_texts + embedded_energy_curve

            # learning to predict durations
            idx = gold_durations.unsqueeze(-1) != 0
            duration_mask = torch.logical_and(text_masks, idx.transpose(1, 2))
            duration_targets = gold_durations.unsqueeze(-1).detach().clone().float()
            duration_flow_loss = torch.sum(self.duration_flow(encoded_texts.transpose(1, 2).detach(), duration_mask, w=duration_targets.transpose(1, 2), g=utterance_embedding.unsqueeze(-1), reverse=False))
            duration_flow_loss = torch.sum(duration_flow_loss / torch.sum(duration_mask))  # weighted masking

            upsampled_enriched_encoded_texts = self.length_regulator(encoded_texts, gold_durations)

        # decoding spectrogram
        decoder_masks = make_non_pad_mask(speech_lengths, device=speech_lengths.device).unsqueeze(-2) if speech_lengths is not None and not is_inference else None
        decoded_speech, _ = self.decoder(upsampled_enriched_encoded_texts, decoder_masks, utterance_embedding=utterance_embedding)
        decoded_spectrogram = self.feat_out(decoded_speech).view(decoded_speech.size(0), -1, self.output_spectrogram_channels)

        # refine spectrogram further with a normalizing flow (requires warmup, so it's not always on)
        glow_loss = None
        if run_glow:
            if is_inference:
                refined_spectrogram = self.post_flow(tgt_mels=None,
                                                     infer=is_inference,
                                                     mel_out=decoded_spectrogram,
                                                     encoded_texts=upsampled_enriched_encoded_texts,
                                                     tgt_nonpadding=None).squeeze()
            else:
                glow_loss = self.post_flow(tgt_mels=gold_speech,
                                           infer=is_inference,
                                           mel_out=decoded_spectrogram.detach().clone(),
                                           encoded_texts=upsampled_enriched_encoded_texts.detach().clone(),
                                           tgt_nonpadding=decoder_masks)
        if is_inference:
            return decoded_spectrogram.squeeze(), \
                   refined_spectrogram.squeeze(), \
                   predicted_durations.squeeze(), \
                   pitch_predictions.squeeze(), \
                   energy_predictions.squeeze()
        else:
            return decoded_spectrogram, \
                   None, \
                   duration_flow_loss, \
                   pitch_flow_loss, \
                   energy_flow_loss, \
                   glow_loss

    @torch.inference_mode()
    def inference(self,
                  text,
                  speech=None,
                  utterance_embedding=None,
                  return_duration_pitch_energy=False,
                  lang_id=None,
                  run_postflow=True):
        """
        Args:
            text (LongTensor): Input sequence of characters (T,).
            speech (Tensor, optional): Feature sequence to extract style (N, idim).
            return_duration_pitch_energy (Boolean): whether to return the list of predicted durations for nicer plotting
            run_postflow (Boolean): Whether to run the PostNet. There should be a warmup phase in the beginning.
            lang_id (LongTensor): The language ID used to access the language embedding table, if the model is multilingual
            utterance_embedding (Tensor): Embedding to condition the TTS on, if the model is multispeaker
        """
        self.eval()
        x, y = text, speech

        # setup batch axis
        ilens = torch.tensor([x.shape[0]], dtype=torch.long, device=x.device)
        xs, ys = x.unsqueeze(0), None
        if y is not None:
            ys = y.unsqueeze(0)
        if lang_id is not None:
            lang_id = lang_id.unsqueeze(0)
        utterance_embeddings = utterance_embedding.unsqueeze(0) if utterance_embedding is not None else None

        before_outs, \
        after_outs, \
        duration_predictions, \
        pitch_predictions, \
        energy_predictions = self._forward(xs,
                                           ilens,
                                           ys,
                                           is_inference=True,
                                           utterance_embedding=utterance_embeddings,
                                           lang_ids=lang_id,
                                           run_glow=run_postflow)  # (1, L, odim)
        self.train()
        if after_outs is None:
            after_outs = before_outs
        if return_duration_pitch_energy:
            return before_outs, after_outs, duration_predictions, pitch_predictions, energy_predictions
        return after_outs

    def _reset_parameters(self, init_type):
        # initialize parameters
        if init_type != "pytorch":
            initialize(self, init_type)


if __name__ == '__main__':
    print(sum(p.numel() for p in StochasticToucanTTS().parameters() if p.requires_grad))

    print(" TESTING TRAINING ")

    print(" batchsize 3 ")
    dummy_text_batch = torch.randint(low=0, high=2, size=[3, 3, 62]).float()  # [Batch, Sequence Length, Features per Phone]
    dummy_text_lens = torch.LongTensor([2, 3, 3])

    dummy_speech_batch = torch.randn([3, 30, 80])  # [Batch, Sequence Length, Spectrogram Buckets]
    dummy_speech_lens = torch.LongTensor([10, 30, 20])

    dummy_durations = torch.LongTensor([[10, 0, 0], [10, 15, 5], [5, 5, 10]])
    dummy_pitch = torch.Tensor([[[1.0], [0.], [0.]], [[1.1], [1.2], [0.8]], [[1.1], [1.2], [0.8]]])
    dummy_energy = torch.Tensor([[[1.0], [1.3], [0.]], [[1.1], [1.4], [0.8]], [[1.1], [1.2], [0.8]]])

    dummy_utterance_embed = torch.randn([3, 192])  # [Batch, Dimensions of Speaker Embedding]
    dummy_language_id = torch.LongTensor([5, 3, 2]).unsqueeze(1)

    model = StochasticToucanTTS()
    l1, dl, pl, el, gl = model(dummy_text_batch,
                               dummy_text_lens,
                               dummy_speech_batch,
                               dummy_speech_lens,
                               dummy_durations,
                               dummy_pitch,
                               dummy_energy,
                               utterance_embedding=dummy_utterance_embed,
                               lang_ids=dummy_language_id)

    loss = l1 + gl + dl + pl + el
    print(loss)
    loss.backward()

    # from Utility.utils import plot_grad_flow

    # plot_grad_flow(model.encoder.named_parameters())
    # plot_grad_flow(model.decoder.named_parameters())
    # plot_grad_flow(model.pitch_predictor.named_parameters())
    # plot_grad_flow(model.duration_predictor.named_parameters())
    # plot_grad_flow(model.post_flow.named_parameters())

    print(" batchsize 2 ")
    dummy_text_batch = torch.randint(low=0, high=2, size=[2, 3, 62]).float()  # [Batch, Sequence Length, Features per Phone]
    dummy_text_lens = torch.LongTensor([2, 3])

    dummy_speech_batch = torch.randn([2, 30, 80])  # [Batch, Sequence Length, Spectrogram Buckets]
    dummy_speech_lens = torch.LongTensor([10, 30])

    dummy_durations = torch.LongTensor([[10, 0, 0], [10, 15, 5]])
    dummy_pitch = torch.Tensor([[[1.0], [0.], [0.]], [[1.1], [1.2], [0.8]]])
    dummy_energy = torch.Tensor([[[1.0], [1.3], [0.]], [[1.1], [1.4], [0.8]]])

    dummy_utterance_embed = torch.randn([2, 192])  # [Batch, Dimensions of Speaker Embedding]
    dummy_language_id = torch.LongTensor([5, 3]).unsqueeze(1)

    model = StochasticToucanTTS()
    l1, dl, pl, el, gl = model(dummy_text_batch,
                               dummy_text_lens,
                               dummy_speech_batch,
                               dummy_speech_lens,
                               dummy_durations,
                               dummy_pitch,
                               dummy_energy,
                               utterance_embedding=dummy_utterance_embed,
                               lang_ids=dummy_language_id)

    loss = l1 + gl + dl + el + pl
    print(loss)
    loss.backward()

    print(" TESTING INFERENCE ")
    dummy_text_batch = torch.randint(low=0, high=2, size=[12, 62]).float()  # [Sequence Length, Features per Phone]
    dummy_utterance_embed = torch.randn([192])  # [Dimensions of Speaker Embedding]
    dummy_language_id = torch.LongTensor([2])
    print(StochasticToucanTTS().inference(dummy_text_batch,
                                          utterance_embedding=dummy_utterance_embed,
                                          lang_id=dummy_language_id).shape)
Architectures/ToucanTTS/StochasticToucanTTS/StochasticVariancePredictor.py
DELETED
@@ -1,440 +0,0 @@
"""
Code taken and adapted from https://github.com/jaywalnut310/vits

MIT License

Copyright (c) 2021 Jaehyeon Kim

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import math

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3


class StochasticVariancePredictor(nn.Module):
    def __init__(self, in_channels, kernel_size, p_dropout, n_flows=4, conditioning_signal_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = in_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = conditioning_signal_channels if conditioning_signal_channels is not None else 0

        self.log_flow = Log()
        self.flows = nn.ModuleList()
        self.flows.append(ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(ConvFlow(2, in_channels, kernel_size, n_layers=3))
            self.flows.append(Flip())

        self.post_pre = nn.Conv1d(1, in_channels, 1)
        self.post_proj = nn.Conv1d(in_channels, in_channels, 1)
        self.post_convs = DDSConv(in_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(ConvFlow(2, in_channels, kernel_size, n_layers=3))
            self.post_flows.append(Flip())

        self.pre = nn.Conv1d(in_channels, in_channels, 1)
        self.proj = nn.Conv1d(in_channels, in_channels, 1)
        self.convs = DDSConv(in_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if self.gin_channels != 0:
            self.cond = nn.Conv1d(self.gin_channels, in_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=0.3):
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            # noise scale 0.8 derived from coqui implementation, but dropped to 0.3 during testing. Might not be ideal yet.
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-6)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class DDSConv(nn.Module):
    """
    Dialted and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding
                                            ))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class ConvFlow(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_derivatives = h[..., 2 * self.num_bins:]

        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
                                                               unnormalized_widths,
                                                               unnormalized_heights,
                                                               unnormalized_derivatives,
                                                               inverse=reverse,
                                                               tails='linear',
                                                               tail_bound=self.tail_bound
                                                               )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if not reverse:
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


def piecewise_rational_quadratic_transform(inputs,
                                           unnormalized_widths,
                                           unnormalized_heights,
                                           unnormalized_derivatives,
                                           inverse=False,
                                           tails=None,
                                           tail_bound=1.,
                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
    if tails is None:
        spline_fn = rational_quadratic_spline
        spline_kwargs = {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
        spline_kwargs = {
            'tails'     : tails,
            'tail_bound': tail_bound
        }

    outputs, logabsdet = spline_fn(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **spline_kwargs
    )
    return outputs, logabsdet


def rational_quadratic_spline(inputs,
                              unnormalized_widths,
                              unnormalized_heights,
                              unnormalized_derivatives,
                              inverse=False,
                              left=0., right=1., bottom=0., top=1.,
                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                              min_derivative=DEFAULT_MIN_DERIVATIVE):
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError('Input to a transform is not within its domain')

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError('Minimal bin width too large for the number of bins')
    if min_bin_height * num_bins > 1.0:
        raise ValueError('Minimal bin height too large for the number of bins')

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (((inputs - input_cumheights) * (input_derivatives
                                             + input_derivatives_plus_one
                                             - 2 * input_delta)
              + input_heights * (input_delta - input_derivatives)))
        b = (input_heights * input_derivatives
             - (inputs - input_cumheights) * (input_derivatives
                                              + input_derivatives_plus_one
                                              - 2 * input_delta))
        c = - input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - root).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (input_delta * theta.pow(2)
                                     + input_derivatives * theta_one_minus_theta)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - theta).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet


def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations[..., -1] += eps
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1


def unconstrained_rational_quadratic_spline(inputs,
                                            unnormalized_widths,
                                            unnormalized_heights,
                                            unnormalized_derivatives,
                                            inverse=False,
                                            tails='linear',
                                            tail_bound=1.,
                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == 'linear':
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError('{} tails are not implemented.'.format(tails))

    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative
    )

    return outputs, logabsdet
Architectures/__init__.py
DELETED
File without changes
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -2,8 +2,8 @@ import os
 
 import torch
 
-from Architectures.ControllabilityGAN.GAN import GanWrapper
 from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
+from Modules.ControllabilityGAN.GAN import GanWrapper
 from Utility.storage_config import MODELS_DIR
 
 
@@ -15,7 +15,7 @@ class ControllableInterface:
         else:
             os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
             os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
-        self.device = "cuda" if
+        self.device = "cuda" if gpu_id != "cpu" else "cpu"
         self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta")
         self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
         self.generated_speaker_embeds = list()
@@ -25,9 +25,11 @@ class ControllableInterface:
 
     def read(self,
              prompt,
+             reference_audio,
              language,
              accent,
             voice_seed,
+             prosody_creativity,
              duration_scaling_factor,
              pause_duration_scaling_factor,
              pitch_variance_scale,
@@ -37,24 +39,29 @@ class ControllableInterface:
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
-            emb_slider_6
+            emb_slider_6,
+            loudness_in_db
             ):
        if self.current_language != language:
            self.model.set_phonemizer_language(language)
+            print(f"switched phonemizer language to {language}")
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(accent)
+            print(f"switched accent language to {accent}")
            self.current_accent = accent
+        if reference_audio is None:
+            self.wgan.set_latent(voice_seed)
+            controllability_vector = torch.tensor([emb_slider_1,
+                                                   emb_slider_2,
+                                                   emb_slider_3,
+                                                   emb_slider_4,
+                                                   emb_slider_5,
+                                                   emb_slider_6], dtype=torch.float32)
+            embedding = self.wgan.modify_embed(controllability_vector)
+            self.model.set_utterance_embedding(embedding=embedding)
+        else:
+            self.model.set_utterance_embedding(reference_audio)

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > 1800:
@@ -92,15 +99,15 @@ class ControllableInterface:
        if self.current_accent != "eng":
            self.model.set_accent_language("eng")
            self.current_accent = "eng"
-        print(prompt)
-        print(language)
-        print("\n\n")
+
+        print(prompt + "\n\n")
        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
-                                  return_plot_as_filepath=True
+                                  return_plot_as_filepath=True,
+                                  prosody_creativity=prosody_creativity,
+                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig
InferenceInterfaces/ToucanTTSInterface.py
CHANGED
@@ -1,19 +1,17 @@
 import itertools
 import os
-import warnings
 
+import librosa
 import matplotlib.pyplot as plt
 import pyloudnorm
 import sounddevice
 import soundfile
 import torch
-
-
-from speechbrain.pretrained import EncoderClassifier
-from torchaudio.transforms import Resample
+from speechbrain.pretrained import EncoderClassifier
+from torchaudio.transforms import Resample
 
+from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS
+from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
 from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
 from Preprocessing.TextFrontend import get_language_id
@@ -29,7 +27,6 @@ class ToucanTTSInterface(torch.nn.Module):
                  tts_model_path=os.path.join(MODELS_DIR, f"ToucanTTS_Meta", "best.pt"),  # path to the ToucanTTS checkpoint or just a shorthand if run standalone
                  vocoder_model_path=os.path.join(MODELS_DIR, f"Vocoder", "best.pt"),  # path to the Vocoder checkpoint
                  language="eng",  # initial language of the model, can be changed later with the setter methods
-                 enhance=None  # legacy argument
                  ):
         super().__init__()
         self.device = device
@@ -40,7 +37,7 @@ class ToucanTTSInterface(torch.nn.Module):
         ################################
         #   build text to phone        #
         ################################
-        self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True)
+        self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True, device=device)
 
         #####################################
         #   load phone to features model    #
@@ -92,8 +89,12 @@ class ToucanTTSInterface(torch.nn.Module):
             speaker_embs = list()
             for path in path_to_reference_audio:
                 wave, sr = soundfile.read(path)
+                if len(wave.shape) > 1:  # oh no, we found a stereo audio!
+                    if len(wave[0]) == 2:  # let's figure out whether we need to switch the axes
+                        wave = wave.transpose()  # if yes, we switch the axes.
+                    wave = librosa.to_mono(wave)
                 wave = Resample(orig_freq=sr, new_freq=16000).to(self.device)(torch.tensor(wave, device=self.device, dtype=torch.float32))
-                speaker_embedding = self.speaker_embedding_func_ecapa.encode_batch(wavs=wave.to(self.device).unsqueeze(0)).squeeze()
+                speaker_embedding = self.speaker_embedding_func_ecapa.encode_batch(wavs=wave.to(self.device).squeeze().unsqueeze(0)).squeeze()
                 speaker_embs.append(speaker_embedding)
             self.default_utterance_embedding = sum(speaker_embs) / len(speaker_embs)
 
@@ -105,10 +106,10 @@ class ToucanTTSInterface(torch.nn.Module):
         self.set_accent_language(lang_id=lang_id)
 
     def set_phonemizer_language(self, lang_id):
-        self.text2phone
+        self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, device=self.device)
 
     def set_accent_language(self, lang_id):
-        if lang_id in
+        if lang_id in {'ajp', 'ajt', 'lak', 'lno', 'nul', 'pii', 'plj', 'slq', 'smd', 'snb', 'tpw', 'wya', 'zua', 'en-us', 'en-sc', 'fr-be', 'fr-sw', 'pt-br', 'spa-lat', 'vi-ctr', 'vi-so'}:
             if lang_id == 'vi-so' or lang_id == 'vi-ctr':
                 lang_id = 'vie'
             elif lang_id == 'spa-lat':
@@ -120,7 +121,7 @@ class ToucanTTSInterface(torch.nn.Module):
             elif lang_id == 'en-sc' or lang_id == 'en-us':
                 lang_id = 'eng'
             else:
-                # no clue where these others are even coming from, they are not in ISO 639-
+                # no clue where these others are even coming from, they are not in ISO 639-3
                 lang_id = 'eng'
 
         self.lang_id = get_language_id(lang_id).to(self.device)
@@ -138,7 +139,7 @@ class ToucanTTSInterface(torch.nn.Module):
                 input_is_phones=False,
                 return_plot_as_filepath=False,
                 loudness_in_db=-24.0,
+                prosody_creativity=0.1):
        """
        duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
                                 1.0 means no scaling happens, higher values increase durations for the whole
@@ -154,16 +155,16 @@ class ToucanTTSInterface(torch.nn.Module):
         phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
         mel, durations, pitch, energy = self.phone2mel(phones,
                                                        return_duration_pitch_energy=True,
-                                                       utterance_embedding=self.default_utterance_embedding
+                                                       utterance_embedding=self.default_utterance_embedding,
                                                        durations=durations,
                                                        pitch=pitch,
                                                        energy=energy,
-                                                       lang_id=self.lang_id
+                                                       lang_id=self.lang_id,
                                                        duration_scaling_factor=duration_scaling_factor,
                                                        pitch_variance_scale=pitch_variance_scale,
                                                        energy_variance_scale=energy_variance_scale,
                                                        pause_duration_scaling_factor=pause_duration_scaling_factor,
+                                                       prosody_creativity=prosody_creativity)
 
         wave, _, _ = self.vocoder(mel.unsqueeze(0))
         wave = wave.squeeze().cpu()
@@ -177,63 +178,56 @@ class ToucanTTSInterface(torch.nn.Module):
             pass
 
         if view or return_plot_as_filepath:
-            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9, 5))
-            except RuntimeError:
-                ax.set_title(text)
-            except:
-                pass
+            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9, 5))
+            ax.imshow(mel.cpu().numpy(), origin="lower", cmap='GnBu')
+            ax.yaxis.set_visible(False)
+            duration_splits, label_positions = cumsum_durations(durations.cpu().numpy())
+            ax.xaxis.grid(True, which='minor')
+            ax.set_xticks(label_positions, minor=False)
+            if input_is_phones:
+                phones = text.replace(" ", "|")
+            else:
+                phones = self.text2phone.get_phone_string(text, for_plot_labels=True)
+            try:
+                ax.set_xticklabels(phones)
+            except IndexError:
+                pass
+            except ValueError:
+                pass
+            word_boundaries = list()
+            for label_index, phone in enumerate(phones):
+                if phone == "|":
+                    word_boundaries.append(label_positions[label_index])
 
+            try:
+                prev_word_boundary = 0
+                word_label_positions = list()
+                for word_boundary in word_boundaries:
+                    word_label_positions.append((word_boundary + prev_word_boundary) / 2)
+                    prev_word_boundary = word_boundary
+                word_label_positions.append((duration_splits[-1] + prev_word_boundary) / 2)
 
+                secondary_ax = ax.secondary_xaxis('bottom')
+                secondary_ax.tick_params(axis="x", direction="out", pad=24)
+                secondary_ax.set_xticks(word_label_positions, minor=False)
+                secondary_ax.set_xticklabels(text.split())
+                secondary_ax.tick_params(axis='x', colors='orange')
+                secondary_ax.xaxis.label.set_color('orange')
+            except ValueError:
+                ax.set_title(text)
+            except IndexError:
+                ax.set_title(text)
+
+            ax.vlines(x=duration_splits, colors="green", linestyles="solid", ymin=0, ymax=120, linewidth=0.5)
+            ax.vlines(x=word_boundaries, colors="orange", linestyles="solid", ymin=0, ymax=120, linewidth=1.0)
+            plt.subplots_adjust(left=0.02, bottom=0.2, right=0.98, top=.9, wspace=0.0, hspace=0.0)
+            ax.set_aspect("auto")
 
         if return_plot_as_filepath:
-            plt.close()
-            except:
-                pass
+            plt.savefig("tmp.png")
+            plt.close()
             return wave, sr, "tmp.png"
-
         return wave, sr
 
     def read_to_file(self,
@@ -247,7 +241,7 @@ class ToucanTTSInterface(torch.nn.Module):
                      dur_list=None,
                      pitch_list=None,
                      energy_list=None,
+                     prosody_creativity=0.1):
         """
         Args:
             silent: Whether to be verbose about the process
@@ -259,12 +253,19 @@ class ToucanTTSInterface(torch.nn.Module):
             duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
                                      1.0 means no scaling happens, higher values increase durations for the whole
                                      utterance, lower values decrease durations for the whole utterance.
+            pause_duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
+                                           1.0 means no scaling happens, higher values increase durations for the pauses,
+                                           lower values decrease durations for the whole utterance.
             pitch_variance_scale: reasonable values are 0.6 < scale < 1.4.
                                   1.0 means no scaling happens, higher values increase variance of the pitch curve,
                                   lower values decrease variance of the pitch curve.
             energy_variance_scale: reasonable values are 0.6 < scale < 1.4.
                                    1.0 means no scaling happens, higher values increase variance of the energy curve,
                                    lower values decrease variance of the energy curve.
+            prosody_creativity: sampling temperature of the generative model that comes up with the pitch, energy and
+                                durations. Higher values mena more variance, lower temperature means less variance across
+                                generations. reasonable values are between 0.0 and 1.2, anything higher makes the voice
+                                sound very weird.
         """
         if not dur_list:
             dur_list = []
@@ -272,7 +273,7 @@ class ToucanTTSInterface(torch.nn.Module):
             pitch_list = []
         if not energy_list:
             energy_list = []
-        silence = torch.zeros([
+        silence = torch.zeros([400])
         wav = silence.clone()
         for (text, durations, pitch, energy) in itertools.zip_longest(text_list, dur_list, pitch_list, energy_list):
             if text.strip() != "":
@@ -286,7 +287,7 @@ class ToucanTTSInterface(torch.nn.Module):
                                         pitch_variance_scale=pitch_variance_scale,
                                         energy_variance_scale=energy_variance_scale,
                                         pause_duration_scaling_factor=pause_duration_scaling_factor,
+                                        prosody_creativity=prosody_creativity)
                 spoken_sentence = torch.tensor(spoken_sentence).cpu()
                 wav = torch.cat((wav, spoken_sentence, silence), 0)
         soundfile.write(file=file_location, data=float2pcm(wav), samplerate=sr, subtype="PCM_16")
@@ -298,7 +299,7 @@ class ToucanTTSInterface(torch.nn.Module):
                        pitch_variance_scale=1.0,
                        energy_variance_scale=1.0,
                        blocking=False,
+                       prosody_creativity=0.1):
         if text.strip() == "":
             return
         wav, sr = self(text,
@@ -306,7 +307,7 @@ class ToucanTTSInterface(torch.nn.Module):
                        duration_scaling_factor=duration_scaling_factor,
                        pitch_variance_scale=pitch_variance_scale,
                        energy_variance_scale=energy_variance_scale,
+                       prosody_creativity=prosody_creativity)
         silence = torch.zeros([sr // 2])
         wav = torch.cat((silence, torch.tensor(wav), silence), 0).numpy()
         sounddevice.play(float2pcm(wav), samplerate=sr)
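For orientation on the new prosody_creativity argument, a hypothetical usage sketch follows. It assumes the pretrained checkpoints referenced by MODELS_DIR are available locally and that the read_to_file keyword arguments match the excerpt above; treat it as an assumption-laden sketch rather than a verified recipe.

    import torch
    from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = ToucanTTSInterface(device=device, language="eng")  # loads the ToucanTTS_Meta and Vocoder checkpoints
    tts.read_to_file(text_list=["Prosody creativity acts like a sampling temperature."],
                     file_location="test_sentence.wav",
                     duration_scaling_factor=1.0,
                     prosody_creativity=0.4)  # higher values -> more variation between generations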
InferenceInterfaces/UtteranceCloner.py
CHANGED
@@ -4,11 +4,11 @@ import numpy
 import soundfile as sf
 import torch
 
-from Architectures.Aligner.Aligner import Aligner
-from Architectures.ToucanTTS.DurationCalculator import DurationCalculator
-from Architectures.ToucanTTS.EnergyCalculator import EnergyCalculator
-from Architectures.ToucanTTS.PitchCalculator import Parselmouth
 from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
+from Modules.Aligner.Aligner import Aligner
+from Modules.ToucanTTS.DurationCalculator import DurationCalculator
+from Modules.ToucanTTS.EnergyCalculator import EnergyCalculator
+from Modules.ToucanTTS.PitchCalculator import Parselmouth
 from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
 from Preprocessing.articulatory_features import get_feature_to_index_lookup
@@ -26,7 +26,7 @@ class UtteranceCloner:
     def __init__(self, model_id, device, language="eng"):
         self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id)
         self.ap = AudioPreprocessor(input_sr=100, output_sr=16000, cut_silence=False)
-        self.tf = ArticulatoryCombinedTextFrontend(language=language)
+        self.tf = ArticulatoryCombinedTextFrontend(language=language, device=device)
         self.device = device
         acoustic_checkpoint_path = os.path.join(MODELS_DIR, "Aligner", "aligner.pt")
         self.aligner_weights = torch.load(acoustic_checkpoint_path, map_location=device)["asr_model"]
@@ -43,6 +43,7 @@ class UtteranceCloner:
         self.acoustic_model = Aligner()
         self.acoustic_model = self.acoustic_model.to(self.device)
         self.acoustic_model.load_state_dict(self.aligner_weights)
+        self.acoustic_model.eval()
         self.parsel = Parselmouth(reduction_factor=1, fs=16000)
         self.energy_calc = EnergyCalculator(reduction_factor=1, fs=16000)
         self.dc = DurationCalculator(reduction_factor=1)
@@ -50,10 +51,11 @@ class UtteranceCloner:
     def extract_prosody(self, transcript, ref_audio_path, lang="eng", on_line_fine_tune=True):
         if on_line_fine_tune:
             self.acoustic_model.load_state_dict(self.aligner_weights)
+            self.acoustic_model.eval()
 
         wave, sr = sf.read(ref_audio_path)
         if self.tf.language != lang:
-            self.tf = ArticulatoryCombinedTextFrontend(language=lang)
+            self.tf = ArticulatoryCombinedTextFrontend(language=lang, device=self.device)
         if self.ap.input_sr != sr:
             self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000, cut_silence=False)
         try:
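The newly added eval() calls matter because loading a state dict does not change a module's train/eval mode; a tiny self-contained demonstration of that behavior (generic torch modules, not the actual Aligner):

    import torch

    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Dropout(p=0.5))
    weights = model.state_dict()
    model.load_state_dict(weights)   # loading weights leaves the module in training mode
    print(model.training)            # True -> dropout would still be active
    model.eval()
    print(model.training)            # False -> deterministic inference, as in the diff above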
InferenceInterfaces/audioseal_wm_16bits.yaml
DELETED
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-name: audioseal_wm_16bits
-model_type: seanet
-checkpoint: "https://dl.fbaipublicfiles.com/audioseal/6edcf62f/generator.pth"
-nbits: 16
-seanet:
-  activation: ELU
-  activation_params:
-    alpha: 1.0
-  causal: false
-  channels: 1
-  compress: 2
-  dilation_base: 2
-  dimension: 128
-  disable_norm_outer_blocks: 0
-  kernel_size: 7
-  last_kernel_size: 7
-  lstm: 2
-  n_filters: 32
-  n_residual_layers: 1
-  norm: weight_norm
-  norm_params: { }
-  pad_mode: constant
-  ratios:
-  - 8
-  - 5
-  - 4
-  - 2
-  residual_kernel_size: 3
-  true_skip: true
-  decoder:
-    final_activation: null
-    final_activation_params: null
-    trim_right_ratio: 1.0
{Architectures → Modules}/Aligner/Aligner.py
RENAMED
@@ -1,27 +1,31 @@
 """
 taken and adapted from https://github.com/as-ideas/DeepForcedAligner
+
+refined with insights from https://www.audiolabs-erlangen.de/resources/NLUI/2023-ICASSP-eval-alignment-tts
+EVALUATING SPEECH-PHONEME ALIGNMENT AND ITS IMPACT ON NEURAL TEXT-TO-SPEECH SYNTHESIS
+by Frank Zalkow, Prachi Govalkar, Meinard Muller, Emanuel A. P. Habets, Christian Dittmar
 """
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
 import torch.multiprocessing
-import torch.nn as nn
 from torch.nn import CTCLoss
 from torch.nn.utils.rnn import pack_padded_sequence
 from torch.nn.utils.rnn import pad_packed_sequence
 
 from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
+from Utility.utils import make_non_pad_mask
 
 
-class BatchNormConv(nn.Module):
+class BatchNormConv(torch.nn.Module):
 
     def __init__(self, in_channels: int, out_channels: int, kernel_size: int):
         super().__init__()
-        self.conv = nn.Conv1d(
+        self.conv = torch.nn.Conv1d(
             in_channels, out_channels, kernel_size,
             stride=1, padding=kernel_size // 2, bias=False)
-        self.bnorm = nn.BatchNorm1d(out_channels)
-        self.relu = nn.ReLU()
+        self.bnorm = torch.nn.SyncBatchNorm.convert_sync_batchnorm(torch.nn.BatchNorm1d(out_channels))
+        self.relu = torch.nn.ReLU()
 
     def forward(self, x):
         x = x.transpose(1, 2)
@@ -37,22 +41,23 @@ class Aligner(torch.nn.Module):
     def __init__(self,
                  n_features=128,
                  num_symbols=145,
+                 conv_dim=512,
+                 lstm_dim=512):
         super().__init__()
-        self.convs = nn.ModuleList([
+        self.convs = torch.nn.ModuleList([
             BatchNormConv(n_features, conv_dim, 3),
-            nn.Dropout(p=0.5),
+            torch.nn.Dropout(p=0.5),
             BatchNormConv(conv_dim, conv_dim, 3),
-            nn.Dropout(p=0.5),
+            torch.nn.Dropout(p=0.5),
             BatchNormConv(conv_dim, conv_dim, 3),
-            nn.Dropout(p=0.5),
+            torch.nn.Dropout(p=0.5),
             BatchNormConv(conv_dim, conv_dim, 3),
-            nn.Dropout(p=0.5),
+            torch.nn.Dropout(p=0.5),
             BatchNormConv(conv_dim, conv_dim, 3),
-            nn.Dropout(p=0.5),
+            torch.nn.Dropout(p=0.5),
         ])
+        self.rnn1 = torch.nn.LSTM(conv_dim, lstm_dim, batch_first=True, bidirectional=True)
+        self.rnn2 = torch.nn.LSTM(2 * lstm_dim, lstm_dim, batch_first=True, bidirectional=True)
         self.proj = torch.nn.Linear(2 * lstm_dim, num_symbols)
         self.tf = ArticulatoryCombinedTextFrontend(language="eng")
         self.ctc_loss = CTCLoss(blank=144, zero_infinity=True)
@@ -61,14 +66,17 @@ class Aligner(torch.nn.Module):
     def forward(self, x, lens=None):
         for conv in self.convs:
             x = conv(x)
         if lens is not None:
             x = pack_padded_sequence(x, lens.cpu(), batch_first=True, enforce_sorted=False)
+        x, _ = self.rnn1(x)
+        x, _ = self.rnn2(x)
         if lens is not None:
             x, _ = pad_packed_sequence(x, batch_first=True)
 
         x = self.proj(x)
+        if lens is not None:
+            out_masks = make_non_pad_mask(lens).unsqueeze(-1).to(x.device)
+            x = x * out_masks.float()
 
         return x
 
@@ -88,15 +96,12 @@ class Aligner(torch.nn.Module):
         pred_max = pred[:, tokens]
 
         # run monotonic alignment search
-
         alignment_matrix = binarize_alignment(pred_max)
 
         if save_img_for_debug is not None:
             phones = list()
             for index in tokens:
-                if self.tf.phone_to_id[phone] == index:
-                    phones.append(phone)
+                phones.append(self.tf.id_to_phone[index])
             fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
 
             ax.imshow(alignment_matrix, interpolation='nearest', aspect='auto', origin="lower", cmap='cividis')
@@ -115,7 +120,6 @@ class Aligner(torch.nn.Module):
         return alignment_matrix
 
 
-
 def binarize_alignment(alignment_prob):
     """
     # Implementation by:
@@ -152,13 +156,5 @@ def binarize_alignment(alignment_prob):
 
 
 if __name__ == '__main__':
-    cap = CodecAudioPreprocessor(input_sr=-1)
-    dummy_codebook_indexes = torch.randint(low=0, high=1023, size=[9, 20])
-    codebook_frames = cap.indexes_to_codec_frames(dummy_codebook_indexes)
-    alignment = Aligner().inference(codebook_frames.transpose(0, 1), tokens=tf.string_to_tensor("Hello world"))
-    print(alignment.shape)
-    plt.imshow(alignment, origin="lower", cmap="GnBu")
-    plt.show()
+    print(sum(p.numel() for p in Aligner().parameters() if p.requires_grad))
+    print(Aligner()(x=torch.randn(size=[3, 30, 128]), lens=torch.LongTensor([20, 30, 10])).shape)
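The new masking step after the projection zeroes out frames beyond each sequence's true length. A self-contained approximation, with make_non_pad_mask reimplemented under its assumed semantics (True for real frames, False for padding):

    import torch

    def make_non_pad_mask(lengths):
        # assumed semantics of the project's helper: boolean mask of valid frames
        max_len = int(lengths.max())
        positions = torch.arange(max_len).unsqueeze(0)   # [1, T]
        return positions < lengths.unsqueeze(1)           # [B, T]

    lens = torch.LongTensor([20, 30, 10])
    x = torch.randn(3, 30, 145)                            # [batch, frames, num_symbols]
    out_masks = make_non_pad_mask(lens).unsqueeze(-1)      # [B, T, 1]
    x = x * out_masks.float()                              # padded frames become exactly zero
    print(x[2, 10:].abs().sum())                           # tensor(0.) -> frames past length 10 are masked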
{Architectures → Modules}/Aligner/CodecAlignerDataset.py
RENAMED
@@ -32,6 +32,7 @@ class CodecAlignerDataset(Dataset):
                  allow_unknown_symbols=False,
                  gpu_count=1,
                  rank=0):
+
         self.gpu_count = gpu_count
         self.rank = rank
         if not os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache:
@@ -50,9 +51,10 @@ class CodecAlignerDataset(Dataset):
         self.lang = lang
         self.device = device
         self.cache_dir = cache_dir
-        self.tf = ArticulatoryCombinedTextFrontend(language=self.lang)
+        self.tf = ArticulatoryCombinedTextFrontend(language=self.lang, device=device)
         cache = torch.load(os.path.join(self.cache_dir, "aligner_train_cache.pt"), map_location='cpu')
         self.speaker_embeddings = cache[2]
+        self.filepaths = cache[3]
         self.datapoints = cache[0]
         if self.gpu_count > 1:
             # we only keep a chunk of the dataset in memory to avoid redundancy. Which chunk, we figure out using the rank.
@@ -85,6 +87,7 @@ class CodecAlignerDataset(Dataset):
         if type(path_to_transcript_dict) != dict:
             path_to_transcript_dict = path_to_transcript_dict()  # in this case we passed a function instead of the dict, so that the function isn't executed if not necessary.
         torch.multiprocessing.set_start_method('spawn', force=True)
+        torch.multiprocessing.set_sharing_strategy('file_system')
         resource_manager = Manager()
         self.path_to_transcript_dict = resource_manager.dict(path_to_transcript_dict)
         key_list = list(self.path_to_transcript_dict.keys())
@@ -93,6 +96,13 @@ class CodecAlignerDataset(Dataset):
             fisher_yates_shuffle(key_list)
         # build cache
         print("... building dataset cache ...")
+        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True  # torch 1.9 has a bug in the hub loading, this is a workaround
+        # careful: assumes 16kHz or 8kHz audio
+        _, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',  # make sure it gets downloaded during single-processing first, if it's not already downloaded
+                              model='silero_vad',
+                              force_reload=False,
+                              onnx=False,
+                              verbose=False)
         self.result_pool = resource_manager.list()
         # make processes
         key_splits = list()
@@ -176,8 +186,8 @@ class CodecAlignerDataset(Dataset):
         torch.set_grad_enabled(True)  # finding this issue was very infuriating: silero sets
         # this to false globally during model loading rather than using inference mode or no_grad
         silero_model = silero_model.to(device)
-        silence = torch.zeros([16000 //
-        tf = ArticulatoryCombinedTextFrontend(language=lang)
+        silence = torch.zeros([16000 // 8]).to(device)
+        tf = ArticulatoryCombinedTextFrontend(language=lang, device=device)
         _, sr = sf.read(path_list[0])
         assumed_sr = sr
         ap = CodecAudioPreprocessor(input_sr=assumed_sr, device=device)
@@ -186,13 +196,15 @@ class CodecAlignerDataset(Dataset):
         for path in tqdm(path_list):
             if self.path_to_transcript_dict[path].strip() == "":
                 continue
-
             try:
                 wave, sr = sf.read(path)
             except:
                 print(f"Problem with an audio file: {path}")
                 continue
 
+            if len(wave.shape) > 1:  # oh no, we found a stereo audio!
+                if len(wave[0]) == 2:  # let's figure out whether we need to switch the axes
+                    wave = wave.transpose()  # if yes, we switch the axes.
             wave = librosa.to_mono(wave)
 
             if sr != assumed_sr:
@@ -210,16 +222,19 @@ class CodecAlignerDataset(Dataset):
                 if verbose:
                     print(f"Excluding {path} because of its duration of {round(dur_in_seconds, 2)} seconds.")
                 continue
-
-            # remove silences from front and back, then add constant 1/4th second silences back to front and back
-            with torch.no_grad():
+            with torch.inference_mode():
                 speech_timestamps = get_speech_timestamps(norm_wave, silero_model, sampling_rate=16000)
             try:
+                silence_timestamps = invert_segments(speech_timestamps, len(norm_wave))
+                for silence_timestamp in silence_timestamps:
+                    begin = silence_timestamp['start']
+                    end = silence_timestamp['end']
+                    norm_wave = torch.cat([norm_wave[:begin], torch.zeros([end - begin], device=device), norm_wave[end:]])
                 result = norm_wave[speech_timestamps[0]['start']:speech_timestamps[-1]['end']]
            except IndexError:
                print("Audio might be too short to cut silences from front and back.")
                continue
+            norm_wave = torch.cat([silence, result, silence])
 
             # raw audio preprocessing is done
             transcript = self.path_to_transcript_dict[path]
@@ -238,10 +253,10 @@ class CodecAlignerDataset(Dataset):
                 # this can happen for Mandarin Chinese, when the syllabification of pinyin doesn't work. In that case, we just skip the sample.
                 continue
 
-            cached_speech = ap.audio_to_codebook_indexes(audio=
+            cached_speech = ap.audio_to_codebook_indexes(audio=norm_wave, current_sampling_rate=16000).transpose(0, 1).cpu().numpy()
             process_internal_dataset_chunk.append([cached_text,
                                                    cached_speech,
+                                                   norm_wave.cpu().detach().numpy(),
                                                    path])
         self.result_pool.append(process_internal_dataset_chunk)
@@ -256,16 +271,44 @@ class CodecAlignerDataset(Dataset):
         codes = codes.transpose(0, 1)
 
         return tokens, \
+               token_len, \
+               codes, \
+               None, \
+               self.speaker_embeddings[index]
 
     def __len__(self):
         return len(self.datapoints)
 
+    def remove_samples(self, list_of_samples_to_remove):
+        for remove_id in sorted(list_of_samples_to_remove, reverse=True):
+            self.datapoints.pop(remove_id)
+            self.speaker_embeddings.pop(remove_id)
+            self.filepaths.pop(remove_id)
+        torch.save((self.datapoints, None, self.speaker_embeddings, self.filepaths),
+                   os.path.join(self.cache_dir, "aligner_train_cache.pt"))
+        print("Dataset updated!")
+
 
 def fisher_yates_shuffle(lst):
     for i in range(len(lst) - 1, 0, -1):
         j = random.randint(0, i)
         lst[i], lst[j] = lst[j], lst[i]
+
+
+def invert_segments(segments, total_duration):
+    if not segments:
+        return [{'start': 0, 'end': total_duration}]
+
+    inverted_segments = []
+    previous_end = 0
+
+    for segment in segments:
+        start = segment['start']
+        if previous_end < start:
+            inverted_segments.append({'start': previous_end, 'end': start})
+        previous_end = segment['end']
+
+    if previous_end < total_duration:
+        inverted_segments.append({'start': previous_end, 'end': total_duration})
+
+    return inverted_segments
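The added invert_segments helper turns the VAD's speech segments into the complementary silence segments so they can be zeroed out. A quick check of its behavior, using the logic exactly as added above and toy timestamps:

    # invert_segments copied from the diff above, exercised on toy sample indices
    def invert_segments(segments, total_duration):
        if not segments:
            return [{'start': 0, 'end': total_duration}]
        inverted_segments = []
        previous_end = 0
        for segment in segments:
            start = segment['start']
            if previous_end < start:
                inverted_segments.append({'start': previous_end, 'end': start})
            previous_end = segment['end']
        if previous_end < total_duration:
            inverted_segments.append({'start': previous_end, 'end': total_duration})
        return inverted_segments

    speech = [{'start': 160, 'end': 4000}, {'start': 4800, 'end': 9000}]
    print(invert_segments(speech, total_duration=10000))
    # [{'start': 0, 'end': 160}, {'start': 4000, 'end': 4800}, {'start': 9000, 'end': 10000}]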
{Architectures → Modules}/Aligner/README.md
RENAMED
File without changes
{Architectures → Modules}/Aligner/Reconstructor.py
RENAMED
@@ -1,7 +1,5 @@
 import torch
 import torch.multiprocessing
-from torch.nn.utils.rnn import pack_padded_sequence
-from torch.nn.utils.rnn import pad_packed_sequence
 
 from Utility.utils import make_non_pad_mask
 
@@ -12,28 +10,23 @@ class Reconstructor(torch.nn.Module):
                  n_features=128,
                  num_symbols=145,
                  speaker_embedding_dim=192,
+                 hidden_dim=256):
         super().__init__()
-        self.in_proj = torch.nn.Linear(num_symbols + speaker_embedding_dim,
-        self.out_proj = torch.nn.Linear(2 * lstm_dim, n_features)
+        self.in_proj = torch.nn.Linear(num_symbols + speaker_embedding_dim, hidden_dim)
+        self.hidden_proj = torch.nn.Linear(hidden_dim, hidden_dim)
+        self.out_proj = torch.nn.Linear(hidden_dim, n_features)
         self.l1_criterion = torch.nn.L1Loss(reduction="none")
-        self.l2_criterion = torch.nn.MSELoss(reduction="none")
 
     def forward(self, x, lens, ys):
         x = self.in_proj(x)
-        x, _ = pad_packed_sequence(x, batch_first=True)
+        x = torch.nn.functional.leaky_relu(x)
+        x = self.hidden_proj(x)
+        x = torch.nn.functional.leaky_relu(x)
         x = self.out_proj(x)
         out_masks = make_non_pad_mask(lens).unsqueeze(-1).to(ys.device)
         out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float()
         out_weights /= ys.size(0) * ys.size(2)
-        l2_loss = self.l2_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum()
-        return l1_loss + l2_loss
+        return self.l1_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum()
 
 
 if __name__ == '__main__':
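The reworked Reconstructor is now a plain feed-forward stack instead of a recurrent one. A minimal stand-alone sketch of the new data flow with the default dimensions from the diff (the tensors here are random placeholders):

    import torch

    # two linear layers with leaky ReLU, mirroring the new Reconstructor
    in_proj = torch.nn.Linear(145 + 192, 256)      # symbols + speaker embedding -> hidden
    hidden_proj = torch.nn.Linear(256, 256)
    out_proj = torch.nn.Linear(256, 128)           # hidden -> n_features

    x = torch.randn(4, 50, 145 + 192)              # [batch, frames, symbols + speaker embedding]
    x = torch.nn.functional.leaky_relu(in_proj(x))
    x = torch.nn.functional.leaky_relu(hidden_proj(x))
    print(out_proj(x).shape)                       # torch.Size([4, 50, 128]) -> predicted feature frames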
{Architectures → Modules}/Aligner/__init__.py
RENAMED
File without changes
{Architectures → Modules}/Aligner/autoaligner_train_loop.py
RENAMED
@@ -8,8 +8,8 @@ from torch.optim import RAdam
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 
+from Modules.Aligner.Aligner import Aligner
+from Modules.Aligner.Reconstructor import Reconstructor
 from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from Preprocessing.EnCodecAudioPreprocessor import CodecAudioPreprocessor
@@ -152,6 +152,8 @@ def train_loop(train_dataset,
             optim_asr.zero_grad()
             if use_reconstruction:
                 optim_tts.zero_grad()
+            if gpu_count > 1:
+                torch.distributed.barrier()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(asr_model.parameters(), 1.0)
             if use_reconstruction:
{Architectures → Modules}/ControllabilityGAN/GAN.py
RENAMED
@@ -1,12 +1,11 @@
 import torch
 
+from Modules.ControllabilityGAN.wgan.init_wgan import create_wgan
 
 
-class GanWrapper
+class GanWrapper:
 
-    def __init__(self, path_wgan, device
-        super().__init__(*args, **kwargs)
+    def __init__(self, path_wgan, device):
         self.device = device
         self.path_wgan = path_wgan
 
@@ -20,27 +19,41 @@ class GanWrapper:
         self.U = self.compute_controllability()
 
         self.z_list = list()
+
         for _ in range(1100):
-            self.z_list.append(self.wgan.G.
+            self.z_list.append(self.wgan.G.sample_latent(1, self.wgan.G.z_dim, temperature=0.8))
         self.z = self.z_list[0]
 
     def set_latent(self, seed):
         self.z = self.z = self.z_list[seed]
 
     def reset_default_latent(self):
-        self.z = self.wgan.G.
+        self.z = self.wgan.G.sample_latent(1, self.wgan.G.z_dim, temperature=0.8)
 
     def load_model(self, path):
         gan_checkpoint = torch.load(path, map_location="cpu")
 
         self.wgan = create_wgan(parameters=gan_checkpoint['model_parameters'], device=self.device)
+        # Create a new state dict without 'module.' prefix
+        new_state_dict_G = {}
+        for key, value in gan_checkpoint['generator_state_dict'].items():
+            # Remove 'module.' prefix
+            new_key = key.replace('module.', '')
+            new_state_dict_G[new_key] = value
+
+        new_state_dict_D = {}
+        for key, value in gan_checkpoint['critic_state_dict'].items():
+            # Remove 'module.' prefix
+            new_key = key.replace('module.', '')
+            new_state_dict_D[new_key] = value
+
+        self.wgan.G.load_state_dict(new_state_dict_G)
+        self.wgan.D.load_state_dict(new_state_dict_D)
 
         self.mean = gan_checkpoint["dataset_mean"]
         self.std = gan_checkpoint["dataset_std"]
 
-    def compute_controllability(self, n_samples=
+    def compute_controllability(self, n_samples=100000):
         _, intermediate, z = self.wgan.sample_generator(num_samples=n_samples, nograd=True, return_intermediate=True)
         intermediate = intermediate.cpu()
         z = z.cpu()
@@ -69,7 +82,7 @@ class GanWrapper:
     def modify_embed(self, x):
         self.wgan.G.eval()
         z_new = self.z.squeeze() + torch.matmul(self.U.solution.t(), x)
-        embed_modified = self.wgan.G.
+        embed_modified = self.wgan.G.forward(z_new.unsqueeze(0).to(self.device))
         if self.normalize:
             embed_modified = inverse_normalize(
                 embed_modified.cpu(),
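The new load_model logic strips the 'module.' prefix that DataParallel/DistributedDataParallel adds to state-dict keys, so the checkpoint can be loaded into a plain module. A small self-contained demonstration of the same trick with a toy model:

    import torch

    # toy model saved as if it had been wrapped in DataParallel (keys get a 'module.' prefix)
    model = torch.nn.Linear(4, 2)
    wrapped_state = {f"module.{k}": v for k, v in model.state_dict().items()}

    # same trick as in load_model above: drop the prefix so a plain module can load it
    clean_state = {k.replace("module.", ""): v for k, v in wrapped_state.items()}
    model.load_state_dict(clean_state)
    print(list(clean_state.keys()))  # ['weight', 'bias']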
{Architectures → Modules}/ControllabilityGAN/__init__.py
RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/dataset/__init__.py
RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py
RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/__init__.py
RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py
RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py
RENAMED
@@ -1,7 +1,7 @@
 import torch
 
+from Modules.ControllabilityGAN.wgan.resnet_init import init_resnet
+from Modules.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost
 
 
 def create_wgan(parameters, device, optimizer='adam'):
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py
RENAMED
@@ -76,8 +76,8 @@ class ResNet_G(nn.Module):
             return out, l_1
         return out
 
-    def sample_latent(self, n_samples, z_size):
-        return torch.randn((n_samples, z_size))
+    def sample_latent(self, n_samples, z_size, temperature=0.7):
+        return torch.randn((n_samples, z_size)) * temperature
 
 
 class ResNet_D(nn.Module):
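The change above scales the standard-normal latent by a temperature, so lower values keep samples closer to the mean of the latent distribution (more conservative generations). A tiny self-contained sketch of that effect:

    import torch

    def sample_latent(n_samples, z_size, temperature=0.7):
        # as in the diff above: scale standard-normal noise by a temperature
        return torch.randn((n_samples, z_size)) * temperature

    torch.manual_seed(0)
    print(sample_latent(64, 32, temperature=0.7).std())  # roughly 0.7
    print(sample_latent(64, 32, temperature=0.1).std())  # roughly 0.1 -> much tighter around zero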
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py
RENAMED
@@ -1,7 +1,7 @@
+from Modules.ControllabilityGAN.wgan.init_weights import weights_init_D
+from Modules.ControllabilityGAN.wgan.init_weights import weights_init_G
+from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_D
+from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_G
 
 
 def init_resnet(parameters):
{Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py
RENAMED
@@ -3,7 +3,6 @@ import time
 
 import numpy as np
 import torch
-import torch.nn as nn
 import torch.optim as optim
 from cvxopt import matrix
 from cvxopt import solvers
@@ -11,13 +10,12 @@ from cvxopt import sparse
 from cvxopt import spmatrix
 from torch.autograd import grad as torch_grad
 from tqdm import tqdm
-import spaces
 
 
-class WassersteinGanQuadraticCost(torch.nn.Module):
+class WassersteinGanQuadraticCost:
 
-    def __init__(self, generator, discriminator, gen_optimizer, dis_optimizer, criterion, epochs, n_max_iterations,
+    def __init__(self, generator, discriminator, gen_optimizer, dis_optimizer, criterion, epochs, n_max_iterations,
+                 data_dimensions, batch_size, device, gamma=0.1, K=-1, milestones=[150000, 250000], lr_anneal=1.0):
         self.G = generator
         self.G_opt = gen_optimizer
         self.D = discriminator
@@ -46,8 +44,8 @@ class WassersteinGanQuadraticCost(torch.nn.Module):
         self.Kr = np.sqrt(self.K)
         self.LAMBDA = 2 * self.Kr * gamma * 2
 
-        self.G =
-        self.D =
+        self.G = self.G.to(self.device)
+        self.D = self.D.to(self.device)
 
         self.schedulerD = self._build_lr_scheduler_(self.D_opt, milestones, lr_anneal)
         self.schedulerG = self._build_lr_scheduler_(self.G_opt, milestones, lr_anneal)
@@ -245,10 +243,7 @@ class WassersteinGanQuadraticCost(torch.nn.Module):
         latent_samples = latent_samples.to(self.device)
         if nograd:
             with torch.no_grad():
-                generated_data = self.G.module(latent_samples, return_intermediate=return_intermediate)
-                else:
-                generated_data = self.G(latent_samples, return_intermediate=return_intermediate)
+                generated_data = self.G(latent_samples, return_intermediate=return_intermediate)
         else:
             generated_data = self.G(latent_samples)
         self.G.train()
{Architectures → Modules}/EmbeddingModel/GST.py
RENAMED
@@ -3,7 +3,7 @@
 
 import torch
 
+from Modules.GeneralLayers.Attention import MultiHeadedAttention as BaseMultiHeadedAttention
 
 
 class GSTStyleEncoder(torch.nn.Module):
{Architectures → Modules}/EmbeddingModel/README.md
RENAMED
File without changes
{Architectures → Modules}/EmbeddingModel/StyleEmbedding.py
RENAMED
@@ -1,7 +1,7 @@
 import torch
 
+from Modules.EmbeddingModel.GST import GSTStyleEncoder
+from Modules.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder
 
 
 class StyleEmbedding(torch.nn.Module):
{Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py
RENAMED
File without changes
{Architectures → Modules}/EmbeddingModel/__init__.py
RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/Attention.py
RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py
RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/Conformer.py
RENAMED
@@ -4,16 +4,16 @@ Taken from ESPNet, but heavily modified
 
 import torch
 
+from Modules.GeneralLayers.Attention import RelPositionMultiHeadedAttention
+from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
+from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
+from Modules.GeneralLayers.Convolution import ConvolutionModule
+from Modules.GeneralLayers.EncoderLayer import EncoderLayer
+from Modules.GeneralLayers.LayerNorm import LayerNorm
+from Modules.GeneralLayers.MultiLayeredConv1d import MultiLayeredConv1d
+from Modules.GeneralLayers.MultiSequential import repeat
+from Modules.GeneralLayers.PositionalEncoding import RelPositionalEncoding
+from Modules.GeneralLayers.Swish import Swish
 from Utility.utils import integrate_with_utt_embed
 
 
@@ -84,8 +84,12 @@ class Conformer(torch.nn.Module):
         self.decoder_embedding_projections = repeat(num_blocks, lambda lnum: torch.nn.Linear(attention_dim + utt_embed, attention_dim))
         if lang_embs is not None:
             self.language_embedding = torch.nn.Embedding(num_embeddings=lang_embs, embedding_dim=lang_emb_size)
+            if lang_emb_size == attention_dim:
+                self.language_embedding_projection = lambda x: x
+            else:
+                self.language_embedding_projection = torch.nn.Linear(lang_emb_size, attention_dim)
             self.language_emb_norm = LayerNorm(attention_dim)
+
         # self-attention module definition
         encoder_selfattn_layer = RelPositionMultiHeadedAttention
         encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
@@ -138,21 +142,28 @@ class Conformer(torch.nn.Module):
             if isinstance(xs, tuple):
                 x, pos_emb = xs[0], xs[1]
                 if self.conformer_type != "encoder":
-                    x = integrate_with_utt_embed(hs=x,
+                    x = integrate_with_utt_embed(hs=x,
+                                                 utt_embeddings=utterance_embedding,
+                                                 projection=self.decoder_embedding_projections[encoder_index],
+                                                 embedding_training=self.use_conditional_layernorm_embedding_integration)
                 xs = (x, pos_emb)
            else:
                 if self.conformer_type != "encoder":
-                    xs = integrate_with_utt_embed(hs=xs,
+                    xs = integrate_with_utt_embed(hs=xs,
+                                                  utt_embeddings=utterance_embedding,
+                                                  projection=self.decoder_embedding_projections[encoder_index],
+                                                  embedding_training=self.use_conditional_layernorm_embedding_integration)
             xs, masks = encoder(xs, masks)
 
         if isinstance(xs, tuple):
             xs = xs[0]
 
-        if self.use_output_norm and not (self.utt_embed and self.conformer_type == "encoder"):
-            xs = self.output_norm(xs)
-
         if self.utt_embed and self.conformer_type == "encoder":
-            xs = integrate_with_utt_embed(hs=xs,
+            xs = integrate_with_utt_embed(hs=xs,
+                                          utt_embeddings=utterance_embedding,
+                                          projection=self.encoder_embedding_projection,
+                                          embedding_training=self.use_conditional_layernorm_embedding_integration)
+        elif self.use_output_norm:
+            xs = self.output_norm(xs)
 
         return xs, masks
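The new language-embedding branch only inserts a Linear projection when the embedding size differs from the attention dimension; otherwise it uses an identity function. A small self-contained sketch of that pattern (the dimensions here are arbitrary placeholders, not the model's actual configuration):

    import torch

    def make_language_embedding_projection(lang_emb_size, attention_dim):
        # mirrors the branch added in the diff: identity if the sizes already match
        if lang_emb_size == attention_dim:
            return lambda x: x
        return torch.nn.Linear(lang_emb_size, attention_dim)

    proj = make_language_embedding_projection(lang_emb_size=16, attention_dim=384)
    print(proj(torch.randn(2, 16)).shape)  # torch.Size([2, 384])
    identity = make_language_embedding_projection(lang_emb_size=384, attention_dim=384)
    print(identity(torch.randn(2, 384)).shape)  # unchanged, no extra parameters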
{Architectures → Modules}/GeneralLayers/Convolution.py
RENAMED
@@ -24,7 +24,7 @@ class ConvolutionModule(nn.Module):

         self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, )
         self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, )
-        self.norm = nn.BatchNorm1d(channels)
+        self.norm = nn.SyncBatchNorm.convert_sync_batchnorm(nn.BatchNorm1d(channels))
         self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, )
         self.activation = activation

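The convolution module's batch norm is now wrapped with nn.SyncBatchNorm.convert_sync_batchnorm, which replaces the BatchNorm1d with a SyncBatchNorm layer so that batch statistics are synchronized across processes when the model is trained with DistributedDataParallel. A small standalone sketch of what the conversion does (the channel count is chosen arbitrarily):

    import torch
    from torch import nn

    channels = 4  # arbitrary illustrative size

    # convert_sync_batchnorm swaps BatchNorm layers for SyncBatchNorm in place of the given module.
    norm = nn.SyncBatchNorm.convert_sync_batchnorm(nn.BatchNorm1d(channels))
    print(type(norm).__name__)   # SyncBatchNorm

    # Without an initialized process group (e.g. in evaluation, as here) the layer still behaves
    # like a regular batch norm over (batch, channels, time) inputs.
    norm.eval()
    x = torch.randn(2, channels, 10)
    y = norm(x)
    print(y.shape)               # torch.Size([2, 4, 10])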
{Architectures → Modules}/GeneralLayers/DurationPredictor.py
RENAMED
@@ -5,9 +5,9 @@
(the three removed import lines are truncated to "from" in this view)

 import torch

-from ...
-from ...
-from ...
+from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
+from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
+from Modules.GeneralLayers.LayerNorm import LayerNorm
 from Utility.utils import integrate_with_utt_embed

{Architectures → Modules}/GeneralLayers/EncoderLayer.py
RENAMED
@@ -7,7 +7,7 @@
(the removed import line is truncated to "from" in this view)

 import torch
 from torch import nn

-from ...
+from Modules.GeneralLayers.LayerNorm import LayerNorm


 class EncoderLayer(nn.Module):

{Architectures → Modules}/GeneralLayers/LayerNorm.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/LengthRegulator.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/MultiSequential.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/PositionalEncoding.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/README.md (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/ResidualBlock.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/ResidualStack.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/STFT.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/Swish.py (RENAMED, file without changes)
{Architectures → Modules}/GeneralLayers/VariancePredictor.py
RENAMED
@@ -6,9 +6,9 @@ from abc import ABC
(the three removed import lines are truncated to "from" in this view)

 import torch

-from ...
-from ...
-from ...
+from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
+from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
+from Modules.GeneralLayers.LayerNorm import LayerNorm
 from Utility.utils import integrate_with_utt_embed

{Architectures → Modules}/GeneralLayers/__init__.py (RENAMED, file without changes)
{Architectures → Modules}/README.md (RENAMED, file without changes)
{Architectures → Modules}/ToucanTTS/CodecDiscriminator.py (RENAMED, file without changes)