Kamtera
/

persian-tts-multispeaker-vits

Inference Endpoints

Model card Files Files and versions Metrics Training metrics Community

Kamtera committed on Mar 18, 2023

Commit

ee6552c

•

1 Parent(s): 8d96194

Update train_vits-2.py

Files changed (1) hide show

train_vits-2.py +21 -6

train_vits-2.py CHANGED Viewed

@@ -52,9 +52,29 @@ character_config=CharactersConfig(
   blank="<BLNK>",
   characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
   )
 config = VitsConfig(
     audio=audio_config,
     run_name="vits_fa_female",
     batch_size=8,
     eval_batch_size=4,
     batch_group_size=5,
@@ -83,12 +103,7 @@ config = VitsConfig(
     ],
     output_path=output_path,
     datasets=[audio_config],
-    d_vector_file=['/kaggle/working/speakers.pth'],
-    use_d_vector_file=True,
-    d_vector_dim=512,
-    num_layers_text_encoder=10,
-    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
-    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
     # Enable the weighted sampler
     use_weighted_sampler=True,
     # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has

   blank="<BLNK>",
   characters_class="TTS.tts.utils.text.characters.IPAPhonemes",
   )
+model_args = VitsArgs(
+    d_vector_file=['/kaggle/working/speakers.pth'],
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    num_layers_text_encoder=10,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    # resblock_type_decoder="2",  # In the paper, we accidentally trained YourTTS using ResNet blocks type 2; if you like, you can use ResNet blocks type 1 as in the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    # use_speaker_encoder_as_loss=True,
+    # Useful parameters to enable multilingual training
+    # use_language_embedding=True,
+    # embedded_language_dim=4,
+)
 config = VitsConfig(
     audio=audio_config,
     run_name="vits_fa_female",
+    model_args=model_args,
     batch_size=8,
     eval_batch_size=4,
     batch_group_size=5,
     ],
     output_path=output_path,
     datasets=[audio_config],
     # Enable the weighted sampler
     use_weighted_sampler=True,
     # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has