Kamtera
/

persian-tts-multispeaker-vits

Transformers

TensorBoard

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

Kamtera committed on Mar 18, 2023

Commit

e793253

•

1 Parent(s): f0011fc

Update train_vits-2.py

Browse files

Files changed (1) hide show

train_vits-2.py +36 -42

train_vits-2.py CHANGED Viewed

@@ -15,38 +15,16 @@ from TTS.tts.utils.speakers import SpeakerManager
 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_names={
-    "persian-tts-dataset-famale":"dilara",
-    "persian-tts-dataset":"changiz",
-    "persian-tts-dataset-male":"farid"
-}
-def mozilla_with_speaker(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
-    """Normalizes Mozilla meta data files to TTS format"""
-    txt_file = os.path.join(root_path, meta_file)
-    items = []
-    speaker_name = dataset_names[os.path.basename(root_path)]
-    print(speaker_name)
-    with open(txt_file, "r", encoding="utf-8") as ttf:
-        for line in ttf:
-            cols = line.split("|")
-            wav_file = cols[1].strip()
-            text = cols[0].strip()
-            wav_file = os.path.join(root_path, "wavs", wav_file)
-            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
-    return items
-dataset_config1 = BaseDatasetConfig(
-    formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset"
-)
-dataset_config2 = BaseDatasetConfig(
-    formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset-famale"
 )
-dataset_config3 = BaseDatasetConfig(
-    formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset-male"
-)
@@ -54,9 +32,16 @@ audio_config = BaseAudioConfig(
     sample_rate=22050,
     do_trim_silence=False,
     resample=False,
-    mel_fmin=0,
-    mel_fmax=None
 )
 character_config=CharactersConfig(
   characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
   punctuations='!(),-.:;? ̠،؛؟‌<>',
@@ -97,7 +82,20 @@ config = VitsConfig(
         ["یکی اسبی به عاریت خواست","changiz",null,"fa"]
     ],
     output_path=output_path,
-    datasets=[dataset_config1,dataset_config2,dataset_config3],
 )
 # INITIALIZE THE AUDIO PROCESSOR
@@ -115,26 +113,22 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
 # You can define your custom sample loader returning the list of samples.
 # Or define your custom formatter and pass it to the `load_tts_samples`.
 # Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(
     config.datasets,
-    formatter=mozilla_with_speaker,
     eval_split=True,
     eval_split_max_size=config.eval_split_max_size,
     eval_split_size=config.eval_split_size,
 )
-speaker_manager = SpeakerManager()
-speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
-config.num_speakers = speaker_manager.num_speakers
-print("\n"*10)
-print("#>"*10)
-print(speaker_manager.speaker_names)
-print("\n"*10)
-# init model
-model = Vits(config, ap, tokenizer, speaker_manager=speaker_manager)
 # init the trainer and 🚀
 trainer = Trainer(

 output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(
+    formatter="mozilla_with_speaker",
+    dataset_name="multi_persian",
+    meta_file_train="metadata.csv",
+    language="fa",
+    phonemizer="espeak",
+    path="/kaggle/input",
 )
     sample_rate=22050,
     do_trim_silence=False,
     resample=False,
 )
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
 character_config=CharactersConfig(
   characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
   punctuations='!(),-.:;? ̠،؛؟‌<>',
         ["یکی اسبی به عاریت خواست","changiz",null,"fa"]
     ],
     output_path=output_path,
+    datasets=[audio_config],
+    d_vector_file=['/kaggle/working/speakers.pth'],
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    num_layers_text_encoder=10,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    weighted_sampler_attrs={"speaker_name": 1.0},
+    weighted_sampler_multipliers={},
+    # Sets the Speaker Consistency Loss (SCL) weight α to 9, as in the paper
+    speaker_encoder_loss_alpha=9.0,
 )
 # INITIALIZE THE AUDIO PROCESSOR
 # You can define your custom sample loader returning the list of samples.
 # Or define your custom formatter and pass it to the `load_tts_samples`.
 # Check `TTS.tts.datasets.load_tts_samples` for more details.
+# Load all the dataset samples and split them into training and evaluation sets
 train_samples, eval_samples = load_tts_samples(
     config.datasets,
     eval_split=True,
     eval_split_max_size=config.eval_split_max_size,
     eval_split_size=config.eval_split_size,
 )
+# Init the model
+model = Vits.init_from_config(config,ap, tokenizer)
 # init the trainer and 🚀
 trainer = Trainer(