Commit 0eca443 (parent: 7b8b31f)

Update paths and add new training data for XTTS with Modi voice
Changed files:
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_06+55AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_07+01AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/train_gpt_xtts.py (+204, -0)
- recipes/ljspeech/xtts_v2/train_gpt_xtts.py (+76, -10)
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_06+55AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager

# Logging parameters
RUN_NAME = "NewModiSpeech"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Set here the path that the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")

# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
START_WITH_EVAL = True  # if True it will star with evaluation
BATCH_SIZE = 5  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.

# Define here the dataset that you want to use for the fine-tuning on.
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/home/ubuntu/voicetts/modi/hindi",
    meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
    language="hi",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Define the path where XTTS v2.0.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )


# Training sentences generations
SPEAKER_REFERENCE = [
    # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
    "/home/ubuntu/voicetts/ds_path/modispeech.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_8.wav",

    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
]
LANGUAGE = config_dataset.language

BEST_LOSS = None
MAX_PATIENCE = 1  # early stopping patience
CURRENT_PATIENCE = 0  # current patience
def early_stopping_fn(eval_results):
    """
    This function is called after each evaluation step.
    If you want to implement early stopping, you can do it here.

    If model has not imporoved for 25 epoches, it will stop the training.
    """
    global BEST_LOSS
    global CURRENT_PATIENCE
    print(" > Early stopping function called!")
    print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
    print('Best Loss', BEST_LOSS)
    print('Current Best Loss', eval_results.best_loss)
    current_best_loss = eval_results.best_loss['eval_loss']
    if BEST_LOSS is None:
        BEST_LOSS = current_best_loss
    else:
        if CURRENT_PATIENCE <= MAX_PATIENCE:
            print(" > Early stopping!")
            return True
        elif current_best_loss < BEST_LOSS:
            BEST_LOSS = current_best_loss
            CURRENT_PATIENCE = 0
        elif CURRENT_PATIENCE < MAX_PATIENCE:
            CURRENT_PATIENCE += 1
            if CURRENT_PATIENCE >= MAX_PATIENCE:
                print(" > Early stopping!")
                return True
    print('Updated Best Loss', BEST_LOSS)
    return False


def main():
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=300,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=True,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[
            {
                "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मुख्यमंत्री",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "ओलंपिक",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मेरे देशवासियों जब तक हम चलने",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
        eval_split_size=0.05
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
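One note on the early-stopping helper added in this snapshot: as written, the first branch inside the else block (CURRENT_PATIENCE <= MAX_PATIENCE) is checked before the loss comparison, so once BEST_LOSS has been set the function returns True on the very next evaluation whether or not the loss improved, and the docstring's "25 epochs" does not match MAX_PATIENCE = 1. The function is also never passed to the Trainer in this script, so it currently has no effect. A minimal sketch of the presumably intended "stop after MAX_PATIENCE evaluations without improvement" behaviour, keeping the same eval_results.best_loss["eval_loss"] access used above (how to register it as a trainer callback is not shown in this commit and would need to be checked against the trainer's API):

BEST_LOSS = None
MAX_PATIENCE = 1
CURRENT_PATIENCE = 0


def early_stopping_fn(eval_results):
    """Return True once eval loss has failed to improve for MAX_PATIENCE evaluations."""
    global BEST_LOSS, CURRENT_PATIENCE
    current_loss = eval_results.best_loss["eval_loss"]
    if BEST_LOSS is None or current_loss < BEST_LOSS:
        # First evaluation, or an improvement: record it and reset the patience counter.
        BEST_LOSS = current_loss
        CURRENT_PATIENCE = 0
        return False
    # No improvement: spend one unit of patience and stop when it runs out.
    CURRENT_PATIENCE += 1
    if CURRENT_PATIENCE >= MAX_PATIENCE:
        print(" > Early stopping!")
        return True
    return False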
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_07+01AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
Identical to the 06:55AM snapshot above, except that GPTArgs sets max_text_length=250 instead of 300.
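The only change in this 07:01AM snapshot is the tighter max_text_length. Samples whose transcript exceeds the limit are dropped (or fail to load) in the XTTS GPT trainer's data loading, so it can be worth counting how many rows of metadata.txt a given value would affect before launching a run. A small standalone check, assuming the pipe-separated LJSpeech-style rows this recipe's ljspeech formatter reads and that the limit applies to the character length of the text column:

# Count metadata rows whose transcript would exceed GPTArgs.max_text_length.
# Path and limit are taken from this snapshot; the column handling assumes the
# usual LJSpeech layout: <utt_id>|<raw text>|<normalized text>.
MAX_TEXT_LENGTH = 250
META_FILE = "/home/ubuntu/voicetts/modi/hindi/metadata.txt"

too_long = 0
total = 0
with open(META_FILE, encoding="utf-8") as meta:
    for row in meta:
        text = row.rstrip("\n").split("|")[-1]  # last column (normalized text)
        total += 1
        if len(text) > MAX_TEXT_LENGTH:
            too_long += 1
print(f"{too_long}/{total} transcripts longer than {MAX_TEXT_LENGTH} characters")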
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
Identical to the 06:55AM snapshot above, except for four values: BATCH_SIZE = 10 (instead of 5), max_text_length=250 (instead of 300), print_eval=False (instead of True), and eval_split_size=0.03 (instead of 0.05).
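This 09:33AM run doubles BATCH_SIZE to 10 but leaves GRAD_ACUMM_STEPS at 84, so the effective batch per optimizer step grows from 5 × 84 = 420 to 10 × 84 = 840 samples. The recipe's own note only asks for BATCH_SIZE * GRAD_ACUMM_STEPS to be at least 252, so when a larger BATCH_SIZE fits in memory the accumulation factor can usually be lowered to keep the effective batch roughly constant. A small illustrative helper (the 252 target comes from the recipe comment; the helper itself is not part of the commit):

import math


def grad_accum_for(batch_size: int, target_effective_batch: int = 252) -> int:
    """Smallest accumulation factor with batch_size * steps >= target_effective_batch."""
    return max(1, math.ceil(target_effective_batch / batch_size))


for bs in (5, 10, 21, 42):
    steps = grad_accum_for(bs)
    print(f"batch_size={bs:>2}  grad_accum_steps={steps:>2}  effective_batch={bs * steps}")
# With batch_size=10, 26 accumulation steps already reach the recommended 252,
# whereas the committed value of 84 gives an effective batch of 840.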
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,204 @@
An earlier (March-29) snapshot of the same recipe. It matches the 06:55AM snapshot above except for the following:
- config_dataset points at path="/home/ubuntu/voicetts/finalsingle" with meta_file_train="/home/ubuntu/voicetts/finalsingle/metadata.txt".
- CHECKPOINTS_OUT_PATH joins OUT_PATH with "NewModiSpeech-March-28-2024_06+57AM-0000000/".
- SPEAKER_REFERENCE lists only the seven active wav files; the commented-out modienglish clips are not present.
- There are no BEST_LOSS / MAX_PATIENCE / CURRENT_PATIENCE globals and no early_stopping_fn, which is why this file is 204 lines instead of 253.
- GPTArgs uses max_text_length=200.
- GPTTrainerConfig additionally sets epochs=50, uses print_eval=False, and does not set eval_split_size.
- test_sentences holds six short prompts: "प्रतिबद्धता", "भव्य", "मयना", "फाल्गुनिबेन", "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ." and "राम राम फाल्गुनिबेन , मैं नरेंद्र मोदी बात कर रहा हूँ.".
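All four snapshots read the dataset through the stock ljspeech formatter, which expects an LJSpeech-style layout: a pipe-separated metadata file (here metadata.txt) with one row per clip and the audio stored as wavs/<clip_id>.wav next to it. A minimal sketch of producing such a file from (clip id, transcript) pairs; the helper name and the example row are illustrative, not part of the commit:

from pathlib import Path


def write_ljspeech_metadata(root: str, rows) -> None:
    """Write <root>/metadata.txt; each row is (clip_id, transcript), with audio
    expected at <root>/wavs/<clip_id>.wav."""
    root_path = Path(root)
    (root_path / "wavs").mkdir(parents=True, exist_ok=True)
    with open(root_path / "metadata.txt", "w", encoding="utf-8") as meta:
        for clip_id, text in rows:
            # LJSpeech rows carry raw and normalized text; both columns are filled here.
            meta.write(f"{clip_id}|{text}|{text}\n")


write_ljspeech_metadata(
    "/home/ubuntu/voicetts/finalsingle",  # dataset path used by this snapshot
    [("clip_1", "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ.")],  # illustrative row
)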
recipes/ljspeech/xtts_v2/train_gpt_xtts.py
CHANGED
@@ -27,8 +27,8 @@ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
 config_dataset = BaseDatasetConfig(
     formatter="ljspeech",
     dataset_name="ljspeech",
-    path="/home/ubuntu/voicetts/
-    meta_file_train="/home/ubuntu/voicetts/
+    path="/home/ubuntu/voicetts/modi/hindi",
+    meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
     language="hi",
 )
 
@@ -36,7 +36,7 @@ config_dataset = BaseDatasetConfig(
 DATASETS_CONFIG_LIST = [config_dataset]
 
 # Define the path where XTTS v2.0.1 files will be downloaded
-CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "
+CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
 os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
 
 
@@ -73,10 +73,65 @@ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
 # Training sentences generations
 SPEAKER_REFERENCE = [
     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
-    "/home/ubuntu/voicetts/ds_path/
+    "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
 ]
 LANGUAGE = config_dataset.language
 
+BEST_LOSS = None
+MAX_PATIENCE = 1  # early stopping patience
+CURRENT_PATIENCE = 0  # current patience
+def early_stopping_fn(eval_results):
+    """
+    This function is called after each evaluation step.
+    If you want to implement early stopping, you can do it here.
+
+    If model has not imporoved for 25 epoches, it will stop the training.
+    """
+    global BEST_LOSS
+    global CURRENT_PATIENCE
+    print(" > Early stopping function called!")
+    print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+    print('Best Loss', BEST_LOSS)
+    print('Current Best Loss', eval_results.best_loss)
+    current_best_loss = eval_results.best_loss['eval_loss']
+    if BEST_LOSS is None:
+        BEST_LOSS = current_best_loss
+    else:
+        if CURRENT_PATIENCE <= MAX_PATIENCE:
+            print(" > Early stopping!")
+            return True
+        elif current_best_loss < BEST_LOSS:
+            BEST_LOSS = current_best_loss
+            CURRENT_PATIENCE = 0
+        elif CURRENT_PATIENCE < MAX_PATIENCE:
+            CURRENT_PATIENCE += 1
+            if CURRENT_PATIENCE >= MAX_PATIENCE:
+                print(" > Early stopping!")
+                return True
+    print('Updated Best Loss', BEST_LOSS)
+    return False
+
 
 def main():
     # init args and config
@@ -85,7 +140,7 @@ def main():
         min_conditioning_length=66150,  # 3 secs
         debug_loading_failures=False,
         max_wav_length=255995,  # ~11.6 seconds
-        max_text_length=
+        max_text_length=250,
         mel_norm_file=MEL_NORM_FILE,
         dvae_checkpoint=DVAE_CHECKPOINT,
         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
@@ -133,26 +188,37 @@ def main():
         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
         test_sentences=[
             {
-                "text": "
+                "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+            {
+                "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "मुख्यमंत्री",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "ओलंपिक",
                 "speaker_wav": SPEAKER_REFERENCE,
-                "language": 
+                "language": LANGUAGE,
+            },
+            {
+                "text": "मेरे देशवासियों जब तक हम चलने",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
             },
         ],
+        eval_split_size=0.03
     )
 
     # init the model from config
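After trainer.fit() finishes, the run folder created under OUT_PATH holds the fine-tuned GPT checkpoint together with the config it was trained with. A rough sketch of auditioning the voice with the XTTS inference classes follows; the run-directory and file names are assumptions (they depend on when the run was launched and what the trainer saved), and loading a GPT-trainer run this way may need the checkpoint and vocab paths adjusted:

import soundfile as sf

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Assumed locations; check the actual run folder before use.
RUN_DIR = "recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000"
VOCAB_FILE = "recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/vocab.json"

config = XttsConfig()
config.load_json(f"{RUN_DIR}/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=RUN_DIR, vocab_path=VOCAB_FILE, use_deepspeed=False)
model.cuda()

out = model.synthesize(
    "मेरे देशवासियों जब तक हम चलने",  # one of the committed test sentences
    config,
    speaker_wav="/home/ubuntu/voicetts/ds_path/modispeech.wav",  # committed reference clip
    language="hi",
)
sf.write("modi_sample.wav", out["wav"], 24000)  # output_sample_rate from the training config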