# File size: 6,939 Bytes

import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig , CharactersConfig
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import  CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.speakers import SpeakerManager
# from TTS.tts.datasets.formatters import mozilla_with_speaker

output_path = os.path.dirname(os.path.abspath(__file__))




def mozilla_with_speaker(root_path, meta_file, **kwargs):
    """Load three Kaggle datasets in Mozilla format as one multi-speaker dataset.

    The Kaggle datasets, and the speaker name assigned to each, are:
        magnoliasis/persian-tts-dataset-famale -> "dilara"
        magnoliasis/persian-tts-dataset        -> "changiz"
        magnoliasis/persian-tts-dataset-male   -> "farid"

    This function is very useful while using Kaggle notebooks.

    Args:
        root_path (str): root folder where all three datasets are downloaded,
            for example on Kaggle notebooks: /kaggle/input
        meta_file (str): name of the metadata file, resolved inside each
            dataset folder. Every non-blank line is expected to look like
            ``text|wav_file``.
        **kwargs: ignored by this formatter.

    Returns:
        list[dict]: one entry per non-blank metadata line, with the keys
        ``text``, ``audio_file``, ``speaker_name`` and ``root_path``.

    Raises:
        OSError: if a metadata file cannot be opened.
        IndexError: if a non-blank line contains no ``|`` separator.
    """
    dataset_names = {
        "persian-tts-dataset-famale": "dilara",
        "persian-tts-dataset": "changiz",
        "persian-tts-dataset-male": "farid",
    }
    items = []
    for dataset_dir, speaker_name in dataset_names.items():
        new_root_path = os.path.join(root_path, dataset_dir)
        txt_file = os.path.join(new_root_path, meta_file)
        print(speaker_name)
        with open(txt_file, "r", encoding="utf-8") as ttf:
            for line in ttf:
                # Blank lines (typically a trailing newline at the end of the
                # file) carry no sample and would otherwise fail on cols[1].
                if not line.strip():
                    continue
                cols = line.split("|")
                text = cols[0].strip()
                wav_file = os.path.join(new_root_path, "wavs", cols[1].strip())
                items.append(
                    {
                        "text": text,
                        "audio_file": wav_file,
                        "speaker_name": speaker_name,
                        "root_path": new_root_path,
                    }
                )
    return items






# Description of the multi-speaker Persian corpus stored under /kaggle/input.
dataset_config = BaseDatasetConfig(
    dataset_name="multi_persian",
    path="/kaggle/input",
    meta_file_train="metadata.csv",
    formatter="mozilla",  # custom alternative: "mozilla_with_speaker"
    language="fa",
    phonemizer="espeak",
)




# Audio settings: 22.05 kHz input, with resampling and silence trimming
# both switched off.
audio_config = BaseAudioConfig(
    resample=False,
    do_trim_silence=False,
    sample_rate=22050,
)


# Speaker encoder release assets (config + checkpoint). Both URLs are passed
# to VitsArgs as `speaker_encoder_config_path` / `speaker_encoder_model_path`.
SPEAKER_ENCODER_CONFIG_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
)
SPEAKER_ENCODER_CHECKPOINT_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"


# Symbol inventory handed to VitsConfig through its `characters=` argument.
# The string literals below contain combining marks and a zero-width
# non-joiner; edit them with care, since an editor may silently normalise or
# drop such code points.
character_config=CharactersConfig(
  # Arabic-script letters and diacritics.
  characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
  # ASCII and Arabic-script punctuation, including the space character.
  punctuations='!(),-.:;? ̠،؛؟‌<>',
  # IPA symbols followed by digits, ASCII symbols and uppercase Latin letters.
  # NOTE(review): the uppercase run has no "Q" - confirm this is intentional.
  phonemes='ˈˌːˑpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟaegiouwyɪʊ̩æɑɔəɚɛɝɨ̃ʉʌʍ0123456789"#$%*+/=ABCDEFGHIJKLMNOPRSTUVWXYZ[]^_{}',
  # Special tokens.
  pad="<PAD>",
  eos="<EOS>",
  bos="<BOS>",
  blank="<BLNK>",
  # Dotted path of the character-set class to use.
  characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
  )



# VITS model arguments: a 10-layer text encoder with speaker conditioning
# taken from a d-vector file.
model_args = VitsArgs(
    num_layers_text_encoder=10,
    # Speaker conditioning via the d-vector file.
    use_d_vector_file=True,
    d_vector_file=["/kaggle/working/speakers.pth"],
    d_vector_dim=512,
    # Speaker encoder checkpoint and its config.
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    # Optional settings, currently disabled:
    #   resblock_type_decoder="2"         - YourTTS was (accidentally) trained with
    #                                       ResNet blocks type 2; VITS uses type 1.
    #   use_speaker_encoder_as_loss=True  - enables the Speaker Consistency Loss (SCL)
    #                                       described in the paper.
    #   use_language_embedding=True,
    #   embedded_language_dim=4           - enable multilingual training.
)


# Full training configuration for the multi-speaker Persian VITS run.
config = VitsConfig(
    audio=audio_config,
    run_name="vits_fa_female",
    model_args=model_args,
    batch_size=8,
    eval_batch_size=4,
    batch_group_size=5,
    num_loader_workers=0,
    num_eval_loader_workers=2,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    save_step=1000,
    text_cleaner="basic_cleaners",
    use_phonemes=True,
    phoneme_language="fa",
    characters=character_config,
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    # Each test sentence is [text, speaker_name, <unused: None>, language].
    test_sentences=[
        ["سلطان محمود در زمستانی سخت به طلخک گفت که","dilara",None,"fa"],
        [" با این جامه ی یک لا در این سرما چه می کنی ","farid",None,"fa"],
        ["مردی نزد بقالی آمد و گفت پیاز هم ده تا دهان بدان خو شبوی سازم.","farid",None,"fa"],
        ["از مال خود پاره ای گوشت بستان و زیره بایی معطّر بساز","dilara",None,"fa"],
        ["یک بار هم از جهنم بگویید.","changiz",None,"fa"],
        ["یکی اسبی به عاریت خواست","changiz",None,"fa"],
    ],
    output_path=output_path,
    # Dataset descriptions read later through `config.datasets` by
    # `load_tts_samples`. This must be the dataset config, not the audio config.
    datasets=[dataset_config],

    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
    weighted_sampler_attrs={"speaker_name": 1.0},
    weighted_sampler_multipliers={},
    # It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
    speaker_encoder_loss_alpha=9.0,
)

# INITIALIZE THE AUDIO PROCESSOR
# The audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# The tokenizer converts text to sequences of token IDs.
# The config is updated with the default characters if none are defined in it.
# NOTE: `config` is rebound here to the config object returned by the
# tokenizer factory, so every later use of `config` refers to that object.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# The custom `mozilla_with_speaker` formatter defined in this file is passed
# to `load_tts_samples`; it yields one dict per sample with the keys "text",
# "audio_file", "speaker_name" and "root_path".
# See `TTS.tts.datasets.load_tts_samples` for more details.

# Load the samples of all datasets and split them into training and
# evaluation sets.
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
    formatter=mozilla_with_speaker,
)

# Init the model.
# `Vits.init_from_config` expects (config, samples, verbose) and builds the
# audio processor, tokenizer and speaker manager itself - TODO confirm against
# the installed TTS version. Passing `ap` / `tokenizer` positionally would put
# them into the `samples` / `verbose` slots, so only the config is passed.
model = Vits.init_from_config(config)




# Build the trainer with default trainer arguments and start training 🚀
trainer_args = TrainerArgs()
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()