Kamtera committed on
Commit
e793253
1 Parent(s): f0011fc

Update train_vits-2.py

Browse files
Files changed (1) hide show
  1. train_vits-2.py +36 -42
train_vits-2.py CHANGED
@@ -15,38 +15,16 @@ from TTS.tts.utils.speakers import SpeakerManager
15
  output_path = os.path.dirname(os.path.abspath(__file__))
16
 
17
 
18
- dataset_names={
19
- "persian-tts-dataset-famale":"dilara",
20
- "persian-tts-dataset":"changiz",
21
- "persian-tts-dataset-male":"farid"
22
- }
23
- def mozilla_with_speaker(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
24
- """Normalizes Mozilla meta data files to TTS format"""
25
- txt_file = os.path.join(root_path, meta_file)
26
- items = []
27
- speaker_name = dataset_names[os.path.basename(root_path)]
28
- print(speaker_name)
29
- with open(txt_file, "r", encoding="utf-8") as ttf:
30
- for line in ttf:
31
- cols = line.split("|")
32
- wav_file = cols[1].strip()
33
- text = cols[0].strip()
34
- wav_file = os.path.join(root_path, "wavs", wav_file)
35
- items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
36
- return items
37
-
38
-
39
- dataset_config1 = BaseDatasetConfig(
40
- formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset"
41
- )
42
 
43
- dataset_config2 = BaseDatasetConfig(
44
- formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset-famale"
45
  )
46
 
47
- dataset_config3 = BaseDatasetConfig(
48
- formatter="mozilla" ,meta_file_train="metadata.csv", path="/kaggle/input/persian-tts-dataset-male"
49
- )
50
 
51
 
52
 
@@ -54,9 +32,16 @@ audio_config = BaseAudioConfig(
54
  sample_rate=22050,
55
  do_trim_silence=False,
56
  resample=False,
57
- mel_fmin=0,
58
- mel_fmax=None
59
  )
 
 
 
 
 
 
 
 
 
60
  character_config=CharactersConfig(
61
  characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
62
  punctuations='!(),-.:;? ̠،؛؟‌<>',
@@ -97,7 +82,20 @@ config = VitsConfig(
97
  ["یکی اسبی به عاریت خواست","changiz",null,"fa"]
98
  ],
99
  output_path=output_path,
100
- datasets=[dataset_config1,dataset_config2,dataset_config3],
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  )
102
 
103
  # INITIALIZE THE AUDIO PROCESSOR
@@ -115,26 +113,22 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
115
  # You can define your custom sample loader returning the list of samples.
116
  # Or define your custom formatter and pass it to the `load_tts_samples`.
117
  # Check `TTS.tts.datasets.load_tts_samples` for more details.
 
 
 
 
118
  train_samples, eval_samples = load_tts_samples(
119
  config.datasets,
120
- formatter=mozilla_with_speaker,
121
  eval_split=True,
122
  eval_split_max_size=config.eval_split_max_size,
123
  eval_split_size=config.eval_split_size,
124
  )
125
 
 
 
126
 
127
 
128
- speaker_manager = SpeakerManager()
129
- speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
130
- config.num_speakers = speaker_manager.num_speakers
131
- print("\n"*10)
132
- print("#>"*10)
133
- print(speaker_manager.speaker_names)
134
- print("\n"*10)
135
 
136
- # init model
137
- model = Vits(config, ap, tokenizer, speaker_manager=speaker_manager)
138
 
139
  # init the trainer and 🚀
140
  trainer = Trainer(
 
15
  output_path = os.path.dirname(os.path.abspath(__file__))
16
 
17
 
18
+ dataset_config = BaseDatasetConfig(
19
+ formatter="mozilla_with_speaker",
20
+ dataset_name="multi_persian",
21
+ meta_file_train="metadata.csv",
22
+ language="fa",
23
+ phonemizer="espeak",
24
+ path="/kaggle/input",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
 
 
26
  )
27
 
 
 
 
28
 
29
 
30
 
 
32
  sample_rate=22050,
33
  do_trim_silence=False,
34
  resample=False,
 
 
35
  )
36
+
37
+
38
+ ### Extract speaker embeddings
39
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
40
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
41
+ )
42
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
43
+
44
+
45
  character_config=CharactersConfig(
46
  characters='ءابتثجحخدذرزسشصضطظعغفقلمنهويِپچژکگیآأؤإئًَُّ',
47
  punctuations='!(),-.:;? ̠،؛؟‌<>',
 
82
  ["یکی اسبی به عاریت خواست","changiz",null,"fa"]
83
  ],
84
  output_path=output_path,
85
+ datasets=[audio_config],
86
+ d_vector_file=['/kaggle/working/speakers.pth'],
87
+ use_d_vector_file=True,
88
+ d_vector_dim=512,
89
+ num_layers_text_encoder=10,
90
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
91
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
92
+ # Enable the weighted sampler
93
+ use_weighted_sampler=True,
94
+ # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
95
+ weighted_sampler_attrs={"speaker_name": 1.0},
96
+ weighted_sampler_multipliers={},
97
+ # Sets the Speaker Consistency Loss (SCL) α to 9, as in the paper
98
+ speaker_encoder_loss_alpha=9.0,
99
  )
100
 
101
  # INITIALIZE THE AUDIO PROCESSOR
 
113
  # You can define your custom sample loader returning the list of samples.
114
  # Or define your custom formatter and pass it to the `load_tts_samples`.
115
  # Check `TTS.tts.datasets.load_tts_samples` for more details.
116
+
117
+
118
+
119
+ # Load all the dataset samples and split them into training and evaluation sets
120
  train_samples, eval_samples = load_tts_samples(
121
  config.datasets,
 
122
  eval_split=True,
123
  eval_split_max_size=config.eval_split_max_size,
124
  eval_split_size=config.eval_split_size,
125
  )
126
 
127
+ # Init the model
128
+ model = Vits.init_from_config(config,ap, tokenizer)
129
 
130
 
 
 
 
 
 
 
 
131
 
 
 
132
 
133
  # init the trainer and 🚀
134
  trainer = Trainer(