Update train_vits-2.py
Browse files- train_vits-2.py +45 -4
train_vits-2.py
CHANGED
@@ -10,13 +10,54 @@ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConf
|
|
10 |
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
11 |
from TTS.utils.audio import AudioProcessor
|
12 |
from TTS.tts.utils.speakers import SpeakerManager
|
13 |
-
from TTS.tts.datasets.formatters import mozilla_with_speaker
|
14 |
|
15 |
output_path = os.path.dirname(os.path.abspath(__file__))
|
16 |
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
dataset_config = BaseDatasetConfig(
|
19 |
-
formatter="mozilla_with_speaker",
|
|
|
20 |
dataset_name="multi_persian",
|
21 |
meta_file_train="metadata.csv",
|
22 |
language="fa",
|
@@ -99,7 +140,7 @@ config = VitsConfig(
|
|
99 |
["مردی نزد بقالی آمد و گفت پیاز هم ده تا دهان بدان خو شبوی سازم.","farid",None,"fa"],
|
100 |
["از مال خود پاره ای گوشت بستان و زیره بایی معطّر بساز","dilara",None,"fa"],
|
101 |
["یک بار هم از جهنم بگویید.","changiz",None,"fa"],
|
102 |
-
["یکی اسبی به عاریت خواست","changiz",None,"fa"]
|
103 |
],
|
104 |
output_path=output_path,
|
105 |
datasets=[audio_config],
|
@@ -134,7 +175,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|
134 |
# Load all the dataset samples and split training and evaluation sets
|
135 |
train_samples, eval_samples = load_tts_samples(
|
136 |
config.datasets,
|
137 |
-
|
138 |
eval_split=True,
|
139 |
eval_split_max_size=config.eval_split_max_size,
|
140 |
eval_split_size=config.eval_split_size,
|
|
|
10 |
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
11 |
from TTS.utils.audio import AudioProcessor
|
12 |
from TTS.tts.utils.speakers import SpeakerManager
|
13 |
+
# from TTS.tts.datasets.formatters import mozilla_with_speaker
|
14 |
|
15 |
output_path = os.path.dirname(os.path.abspath(__file__))
|
16 |
|
17 |
|
18 |
+
|
19 |
+
|
20 |
+
def mozilla_with_speaker(root_path, meta_file, **kwargs):
    """Load three Kaggle datasets in Mozilla format as one multi-speaker dataset.

    The Kaggle datasets are:
        magnoliasis/persian-tts-dataset-famale
        magnoliasis/persian-tts-dataset
        magnoliasis/persian-tts-dataset-male

    This function is very useful while using Kaggle notebooks.

    Args:
        root_path (str): root folder where all three datasets are downloaded,
            e.g. on Kaggle notebooks: /kaggle/input
        meta_file (str): name of the metadata file inside each dataset folder
            (each line is expected as ``text|wav_filename``).

    Returns:
        list[dict]: one item per utterance with keys ``text``, ``audio_file``,
        ``speaker_name`` and ``root_path``.
    """
    # Dataset folder name -> speaker label used for the multi-speaker model.
    # NOTE: "famale" is the actual (misspelled) Kaggle dataset folder name —
    # do not "fix" it or the path lookup breaks.
    dataset_names = {
        "persian-tts-dataset-famale": "dilara",
        "persian-tts-dataset": "changiz",
        "persian-tts-dataset-male": "farid",
    }
    items = []
    for data_root_path, speaker_name in dataset_names.items():
        new_root_path = os.path.join(root_path, data_root_path)
        txt_file = os.path.join(new_root_path, meta_file)
        # Progress marker when run interactively (e.g. Kaggle notebook).
        print(speaker_name)
        with open(txt_file, "r", encoding="utf-8") as ttf:
            for line in ttf:
                cols = line.split("|")
                # Skip blank or malformed lines (e.g. a trailing newline at
                # EOF) instead of raising IndexError on cols[1].
                if len(cols) < 2 or not cols[0].strip():
                    continue
                text = cols[0].strip()
                wav_file = os.path.join(new_root_path, "wavs", cols[1].strip())
                items.append(
                    {
                        "text": text,
                        "audio_file": wav_file,
                        "speaker_name": speaker_name,
                        "root_path": new_root_path,
                    }
                )
    return items
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
dataset_config = BaseDatasetConfig(
|
59 |
+
# formatter="mozilla_with_speaker",
|
60 |
+
formatter="mozilla",
|
61 |
dataset_name="multi_persian",
|
62 |
meta_file_train="metadata.csv",
|
63 |
language="fa",
|
|
|
140 |
["مردی نزد بقالی آمد و گفت پیاز هم ده تا دهان بدان خو شبوی سازم.","farid",None,"fa"],
|
141 |
["از مال خود پاره ای گوشت بستان و زیره بایی معطّر بساز","dilara",None,"fa"],
|
142 |
["یک بار هم از جهنم بگویید.","changiz",None,"fa"],
|
143 |
+
["یکی اسبی به عاریت خواست","changiz",None,"fa"],
|
144 |
],
|
145 |
output_path=output_path,
|
146 |
datasets=[audio_config],
|
|
|
175 |
# Load all the dataset samples and split training and evaluation sets
|
176 |
train_samples, eval_samples = load_tts_samples(
|
177 |
config.datasets,
|
178 |
+
formatter=mozilla_with_speaker,
|
179 |
eval_split=True,
|
180 |
eval_split_max_size=config.eval_split_max_size,
|
181 |
eval_split_size=config.eval_split_size,
|