praveenchordia committed on
Commit 0eca443 · 1 Parent(s): 7b8b31f

Update paths and add new training data for XTTS with Modi voice

recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_06+55AM-0000000/train_gpt_xtts.py ADDED
@@ -0,0 +1,253 @@
+ import os
+
+ from trainer import Trainer, TrainerArgs
+
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+ from TTS.utils.manage import ModelManager
+
+ # Logging parameters
+ RUN_NAME = "NewModiSpeech"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+ # Training Parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training please set it to False
+ START_WITH_EVAL = True  # if True it will start with evaluation
+ BATCH_SIZE = 5  # set here the batch size
+ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
+ # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
+
+ # Define here the dataset that you want to use for the fine-tuning.
+ config_dataset = BaseDatasetConfig(
+     formatter="ljspeech",
+     dataset_name="ljspeech",
+     path="/home/ubuntu/voicetts/modi/hindi",
+     meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
+     language="hi",
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the path to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+     print(" > Downloading DVAE files!")
+     ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+ # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+
+ # download XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+     print(" > Downloading XTTS v2.0 files!")
+     ModelManager._download_model_files(
+         [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+     )
+
+
+ # Training sentences generations
+ SPEAKER_REFERENCE = [
+     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
+     "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
+ ]
+ LANGUAGE = config_dataset.language
+
+ BEST_LOSS = None
+ MAX_PATIENCE = 1  # early stopping patience
+ CURRENT_PATIENCE = 0  # current patience
+ def early_stopping_fn(eval_results):
+     """
+     This function is called after each evaluation step.
+     If you want to implement early stopping, you can do it here.
+
+     If the model has not improved for MAX_PATIENCE evaluations, it stops the training.
+     """
+     global BEST_LOSS
+     global CURRENT_PATIENCE
+     print(" > Early stopping function called!")
+     print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+     print('Best Loss', BEST_LOSS)
+     print('Current Best Loss', eval_results.best_loss)
+     current_best_loss = eval_results.best_loss['eval_loss']
+     if BEST_LOSS is None:
+         BEST_LOSS = current_best_loss
+     else:
+         if CURRENT_PATIENCE <= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         elif current_best_loss < BEST_LOSS:
+             BEST_LOSS = current_best_loss
+             CURRENT_PATIENCE = 0
+         elif CURRENT_PATIENCE < MAX_PATIENCE:
+             CURRENT_PATIENCE += 1
+         if CURRENT_PATIENCE >= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         print('Updated Best Loss', BEST_LOSS)
+     return False
+
+
+ def main():
+     # init args and config
+     model_args = GPTArgs(
+         max_conditioning_length=132300,  # 6 secs
+         min_conditioning_length=66150,  # 3 secs
+         debug_loading_failures=False,
+         max_wav_length=255995,  # ~11.6 seconds
+         max_text_length=300,
+         mel_norm_file=MEL_NORM_FILE,
+         dvae_checkpoint=DVAE_CHECKPOINT,
+         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+         tokenizer_file=TOKENIZER_FILE,
+         gpt_num_audio_tokens=1026,
+         gpt_start_audio_token=1024,
+         gpt_stop_audio_token=1025,
+         gpt_use_masking_gt_prompt_approach=True,
+         gpt_use_perceiver_resampler=True,
+     )
+     # define audio config
+     audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+     # training parameters config
+     config = GPTTrainerConfig(
+         output_path=OUT_PATH,
+         model_args=model_args,
+         run_name=RUN_NAME,
+         project_name=PROJECT_NAME,
+         run_description="""
+             GPT XTTS training
+             """,
+         dashboard_logger=DASHBOARD_LOGGER,
+         logger_uri=LOGGER_URI,
+         audio=audio_config,
+         batch_size=BATCH_SIZE,
+         batch_group_size=48,
+         eval_batch_size=BATCH_SIZE,
+         num_loader_workers=8,
+         eval_split_max_size=256,
+         print_step=50,
+         plot_step=100,
+         log_model_step=1000,
+         save_step=10000,
+         save_n_checkpoints=1,
+         save_checkpoints=True,
+         # target_loss="loss",
+         print_eval=True,
+         # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+         optimizer="AdamW",
+         optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+         optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+         lr=5e-06,  # learning rate
+         lr_scheduler="MultiStepLR",
+         # it was adjusted accordingly for the new step scheme
+         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+         test_sentences=[
+             {
+                 "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुख्यमंत्री",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "ओलंपिक",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मेरे देशवासियों जब तक हम चलने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+         ],
+         eval_split_size=0.05
+     )
+
+     # init the model from config
+     model = GPTTrainer.init_from_config(config)
+
+     # load training samples
+     train_samples, eval_samples = load_tts_samples(
+         DATASETS_CONFIG_LIST,
+         eval_split=True,
+         eval_split_max_size=config.eval_split_max_size,
+         eval_split_size=config.eval_split_size,
+     )
+
+     # init the trainer and 🚀
+     trainer = Trainer(
+         TrainerArgs(
+             restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+             skip_train_epoch=False,
+             start_with_eval=START_WITH_EVAL,
+             grad_accum_steps=GRAD_ACUMM_STEPS,
+         ),
+         config,
+         output_path=OUT_PATH,
+         model=model,
+         train_samples=train_samples,
+         eval_samples=eval_samples,
+     )
+     trainer.fit()
+
+
+ if __name__ == "__main__":
+     main()
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_07+01AM-0000000/train_gpt_xtts.py ADDED
@@ -0,0 +1,253 @@
+ import os
+
+ from trainer import Trainer, TrainerArgs
+
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+ from TTS.utils.manage import ModelManager
+
+ # Logging parameters
+ RUN_NAME = "NewModiSpeech"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+ # Training Parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training please set it to False
+ START_WITH_EVAL = True  # if True it will start with evaluation
+ BATCH_SIZE = 5  # set here the batch size
+ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
+ # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
+
+ # Define here the dataset that you want to use for the fine-tuning.
+ config_dataset = BaseDatasetConfig(
+     formatter="ljspeech",
+     dataset_name="ljspeech",
+     path="/home/ubuntu/voicetts/modi/hindi",
+     meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
+     language="hi",
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the path to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+     print(" > Downloading DVAE files!")
+     ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+ # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+
+ # download XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+     print(" > Downloading XTTS v2.0 files!")
+     ModelManager._download_model_files(
+         [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+     )
+
+
+ # Training sentences generations
+ SPEAKER_REFERENCE = [
+     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
+     "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
+ ]
+ LANGUAGE = config_dataset.language
+
+ BEST_LOSS = None
+ MAX_PATIENCE = 1  # early stopping patience
+ CURRENT_PATIENCE = 0  # current patience
+ def early_stopping_fn(eval_results):
+     """
+     This function is called after each evaluation step.
+     If you want to implement early stopping, you can do it here.
+
+     If the model has not improved for MAX_PATIENCE evaluations, it stops the training.
+     """
+     global BEST_LOSS
+     global CURRENT_PATIENCE
+     print(" > Early stopping function called!")
+     print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+     print('Best Loss', BEST_LOSS)
+     print('Current Best Loss', eval_results.best_loss)
+     current_best_loss = eval_results.best_loss['eval_loss']
+     if BEST_LOSS is None:
+         BEST_LOSS = current_best_loss
+     else:
+         if CURRENT_PATIENCE <= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         elif current_best_loss < BEST_LOSS:
+             BEST_LOSS = current_best_loss
+             CURRENT_PATIENCE = 0
+         elif CURRENT_PATIENCE < MAX_PATIENCE:
+             CURRENT_PATIENCE += 1
+         if CURRENT_PATIENCE >= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         print('Updated Best Loss', BEST_LOSS)
+     return False
+
+
+ def main():
+     # init args and config
+     model_args = GPTArgs(
+         max_conditioning_length=132300,  # 6 secs
+         min_conditioning_length=66150,  # 3 secs
+         debug_loading_failures=False,
+         max_wav_length=255995,  # ~11.6 seconds
+         max_text_length=250,
+         mel_norm_file=MEL_NORM_FILE,
+         dvae_checkpoint=DVAE_CHECKPOINT,
+         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+         tokenizer_file=TOKENIZER_FILE,
+         gpt_num_audio_tokens=1026,
+         gpt_start_audio_token=1024,
+         gpt_stop_audio_token=1025,
+         gpt_use_masking_gt_prompt_approach=True,
+         gpt_use_perceiver_resampler=True,
+     )
+     # define audio config
+     audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+     # training parameters config
+     config = GPTTrainerConfig(
+         output_path=OUT_PATH,
+         model_args=model_args,
+         run_name=RUN_NAME,
+         project_name=PROJECT_NAME,
+         run_description="""
+             GPT XTTS training
+             """,
+         dashboard_logger=DASHBOARD_LOGGER,
+         logger_uri=LOGGER_URI,
+         audio=audio_config,
+         batch_size=BATCH_SIZE,
+         batch_group_size=48,
+         eval_batch_size=BATCH_SIZE,
+         num_loader_workers=8,
+         eval_split_max_size=256,
+         print_step=50,
+         plot_step=100,
+         log_model_step=1000,
+         save_step=10000,
+         save_n_checkpoints=1,
+         save_checkpoints=True,
+         # target_loss="loss",
+         print_eval=True,
+         # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+         optimizer="AdamW",
+         optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+         optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+         lr=5e-06,  # learning rate
+         lr_scheduler="MultiStepLR",
+         # it was adjusted accordingly for the new step scheme
+         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+         test_sentences=[
+             {
+                 "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुख्यमंत्री",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "ओलंपिक",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मेरे देशवासियों जब तक हम चलने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+         ],
+         eval_split_size=0.05
+     )
+
+     # init the model from config
+     model = GPTTrainer.init_from_config(config)
+
+     # load training samples
+     train_samples, eval_samples = load_tts_samples(
+         DATASETS_CONFIG_LIST,
+         eval_split=True,
+         eval_split_max_size=config.eval_split_max_size,
+         eval_split_size=config.eval_split_size,
+     )
+
+     # init the trainer and 🚀
+     trainer = Trainer(
+         TrainerArgs(
+             restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+             skip_train_epoch=False,
+             start_with_eval=START_WITH_EVAL,
+             grad_accum_steps=GRAD_ACUMM_STEPS,
+         ),
+         config,
+         output_path=OUT_PATH,
+         model=model,
+         train_samples=train_samples,
+         eval_samples=eval_samples,
+     )
+     trainer.fit()
+
+
+ if __name__ == "__main__":
+     main()
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000/train_gpt_xtts.py ADDED
@@ -0,0 +1,253 @@
+ import os
+
+ from trainer import Trainer, TrainerArgs
+
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+ from TTS.utils.manage import ModelManager
+
+ # Logging parameters
+ RUN_NAME = "NewModiSpeech"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+ # Training Parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training please set it to False
+ START_WITH_EVAL = True  # if True it will start with evaluation
+ BATCH_SIZE = 10  # set here the batch size
+ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
+ # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
+
+ # Define here the dataset that you want to use for the fine-tuning.
+ config_dataset = BaseDatasetConfig(
+     formatter="ljspeech",
+     dataset_name="ljspeech",
+     path="/home/ubuntu/voicetts/modi/hindi",
+     meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
+     language="hi",
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the path to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+     print(" > Downloading DVAE files!")
+     ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+ # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+
+ # download XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+     print(" > Downloading XTTS v2.0 files!")
+     ModelManager._download_model_files(
+         [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+     )
+
+
+ # Training sentences generations
+ SPEAKER_REFERENCE = [
+     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
+     "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
+ ]
+ LANGUAGE = config_dataset.language
+
+ BEST_LOSS = None
+ MAX_PATIENCE = 1  # early stopping patience
+ CURRENT_PATIENCE = 0  # current patience
+ def early_stopping_fn(eval_results):
+     """
+     This function is called after each evaluation step.
+     If you want to implement early stopping, you can do it here.
+
+     If the model has not improved for MAX_PATIENCE evaluations, it stops the training.
+     """
+     global BEST_LOSS
+     global CURRENT_PATIENCE
+     print(" > Early stopping function called!")
+     print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+     print('Best Loss', BEST_LOSS)
+     print('Current Best Loss', eval_results.best_loss)
+     current_best_loss = eval_results.best_loss['eval_loss']
+     if BEST_LOSS is None:
+         BEST_LOSS = current_best_loss
+     else:
+         if CURRENT_PATIENCE <= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         elif current_best_loss < BEST_LOSS:
+             BEST_LOSS = current_best_loss
+             CURRENT_PATIENCE = 0
+         elif CURRENT_PATIENCE < MAX_PATIENCE:
+             CURRENT_PATIENCE += 1
+         if CURRENT_PATIENCE >= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         print('Updated Best Loss', BEST_LOSS)
+     return False
+
+
+ def main():
+     # init args and config
+     model_args = GPTArgs(
+         max_conditioning_length=132300,  # 6 secs
+         min_conditioning_length=66150,  # 3 secs
+         debug_loading_failures=False,
+         max_wav_length=255995,  # ~11.6 seconds
+         max_text_length=250,
+         mel_norm_file=MEL_NORM_FILE,
+         dvae_checkpoint=DVAE_CHECKPOINT,
+         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+         tokenizer_file=TOKENIZER_FILE,
+         gpt_num_audio_tokens=1026,
+         gpt_start_audio_token=1024,
+         gpt_stop_audio_token=1025,
+         gpt_use_masking_gt_prompt_approach=True,
+         gpt_use_perceiver_resampler=True,
+     )
+     # define audio config
+     audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+     # training parameters config
+     config = GPTTrainerConfig(
+         output_path=OUT_PATH,
+         model_args=model_args,
+         run_name=RUN_NAME,
+         project_name=PROJECT_NAME,
+         run_description="""
+             GPT XTTS training
+             """,
+         dashboard_logger=DASHBOARD_LOGGER,
+         logger_uri=LOGGER_URI,
+         audio=audio_config,
+         batch_size=BATCH_SIZE,
+         batch_group_size=48,
+         eval_batch_size=BATCH_SIZE,
+         num_loader_workers=8,
+         eval_split_max_size=256,
+         print_step=50,
+         plot_step=100,
+         log_model_step=1000,
+         save_step=10000,
+         save_n_checkpoints=1,
+         save_checkpoints=True,
+         # target_loss="loss",
+         print_eval=False,
+         # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+         optimizer="AdamW",
+         optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+         optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+         lr=5e-06,  # learning rate
+         lr_scheduler="MultiStepLR",
+         # it was adjusted accordingly for the new step scheme
+         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+         test_sentences=[
+             {
+                 "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मुख्यमंत्री",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "ओलंपिक",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मेरे देशवासियों जब तक हम चलने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+         ],
+         eval_split_size=0.03
+     )
+
+     # init the model from config
+     model = GPTTrainer.init_from_config(config)
+
+     # load training samples
+     train_samples, eval_samples = load_tts_samples(
+         DATASETS_CONFIG_LIST,
+         eval_split=True,
+         eval_split_max_size=config.eval_split_max_size,
+         eval_split_size=config.eval_split_size,
+     )
+
+     # init the trainer and 🚀
+     trainer = Trainer(
+         TrainerArgs(
+             restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+             skip_train_epoch=False,
+             start_with_eval=START_WITH_EVAL,
+             grad_accum_steps=GRAD_ACUMM_STEPS,
+         ),
+         config,
+         output_path=OUT_PATH,
+         model=model,
+         train_samples=train_samples,
+         eval_samples=eval_samples,
+     )
+     trainer.fit()
+
+
+ if __name__ == "__main__":
+     main()
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/train_gpt_xtts.py ADDED
@@ -0,0 +1,204 @@
+ import os
+
+ from trainer import Trainer, TrainerArgs
+
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+ from TTS.utils.manage import ModelManager
+
+ # Logging parameters
+ RUN_NAME = "NewModiSpeech"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+ # Training Parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training please set it to False
+ START_WITH_EVAL = True  # if True it will start with evaluation
+ BATCH_SIZE = 5  # set here the batch size
+ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
+ # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.
+
+ # Define here the dataset that you want to use for the fine-tuning.
+ config_dataset = BaseDatasetConfig(
+     formatter="ljspeech",
+     dataset_name="ljspeech",
+     path="/home/ubuntu/voicetts/finalsingle",
+     meta_file_train="/home/ubuntu/voicetts/finalsingle/metadata.txt",
+     language="hi",
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-28-2024_06+57AM-0000000/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the path to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+     print(" > Downloading DVAE files!")
+     ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+ # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+
+ # download XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+     print(" > Downloading XTTS v2.0 files!")
+     ModelManager._download_model_files(
+         [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+     )
+
+
+ # Training sentences generations
+ SPEAKER_REFERENCE = [
+     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
+     "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+ ]
+ LANGUAGE = config_dataset.language
+
+
+ def main():
+     # init args and config
+     model_args = GPTArgs(
+         max_conditioning_length=132300,  # 6 secs
+         min_conditioning_length=66150,  # 3 secs
+         debug_loading_failures=False,
+         max_wav_length=255995,  # ~11.6 seconds
+         max_text_length=200,
+         mel_norm_file=MEL_NORM_FILE,
+         dvae_checkpoint=DVAE_CHECKPOINT,
+         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+         tokenizer_file=TOKENIZER_FILE,
+         gpt_num_audio_tokens=1026,
+         gpt_start_audio_token=1024,
+         gpt_stop_audio_token=1025,
+         gpt_use_masking_gt_prompt_approach=True,
+         gpt_use_perceiver_resampler=True,
+     )
+     # define audio config
+     audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+     # training parameters config
+     config = GPTTrainerConfig(
+         epochs=50,
+         output_path=OUT_PATH,
+         model_args=model_args,
+         run_name=RUN_NAME,
+         project_name=PROJECT_NAME,
+         run_description="""
+             GPT XTTS training
+             """,
+         dashboard_logger=DASHBOARD_LOGGER,
+         logger_uri=LOGGER_URI,
+         audio=audio_config,
+         batch_size=BATCH_SIZE,
+         batch_group_size=48,
+         eval_batch_size=BATCH_SIZE,
+         num_loader_workers=8,
+         eval_split_max_size=256,
+         print_step=50,
+         plot_step=100,
+         log_model_step=1000,
+         save_step=10000,
+         save_n_checkpoints=1,
+         save_checkpoints=True,
+         # target_loss="loss",
+         print_eval=False,
+         # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
+         optimizer="AdamW",
+         optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+         optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+         lr=5e-06,  # learning rate
+         lr_scheduler="MultiStepLR",
+         # it was adjusted accordingly for the new step scheme
+         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+         test_sentences=[
+             {
+                 "text": "प्रतिबद्धता",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "भव्य",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मयना",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "फाल्गुनिबेन",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ.",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "राम राम फाल्गुनिबेन , मैं नरेंद्र मोदी बात कर रहा हूँ.",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+         ],
+     )
+
+     # init the model from config
+     model = GPTTrainer.init_from_config(config)
+
+     # load training samples
+     train_samples, eval_samples = load_tts_samples(
+         DATASETS_CONFIG_LIST,
+         eval_split=True,
+         eval_split_max_size=config.eval_split_max_size,
+         eval_split_size=config.eval_split_size,
+     )
+
+     # init the trainer and 🚀
+     trainer = Trainer(
+         TrainerArgs(
+             restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer restore_path parameter
+             skip_train_epoch=False,
+             start_with_eval=START_WITH_EVAL,
+             grad_accum_steps=GRAD_ACUMM_STEPS,
+         ),
+         config,
+         output_path=OUT_PATH,
+         model=model,
+         train_samples=train_samples,
+         eval_samples=eval_samples,
+     )
+     trainer.fit()
+
+
+ if __name__ == "__main__":
+     main()
recipes/ljspeech/xtts_v2/train_gpt_xtts.py CHANGED
@@ -27,8 +27,8 @@ GRAD_ACUMM_STEPS = 84 # set here the grad accumulation steps
  config_dataset = BaseDatasetConfig(
      formatter="ljspeech",
      dataset_name="ljspeech",
-     path="/home/ubuntu/voicetts/final",
-     meta_file_train="/home/ubuntu/voicetts/final/metadata.csv",
+     path="/home/ubuntu/voicetts/modi/hindi",
+     meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
      language="hi",
  )
 
@@ -36,7 +36,7 @@ config_dataset = BaseDatasetConfig(
  DATASETS_CONFIG_LIST = [config_dataset]
 
  # Define the path where XTTS v2.0.1 files will be downloaded
- CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
  os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
 
 
@@ -73,10 +73,65 @@ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
  # Training sentences generations
  SPEAKER_REFERENCE = [
      # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
-     "/home/ubuntu/voicetts/ds_path/2.wav"
+     "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+     "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+     # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
  ]
  LANGUAGE = config_dataset.language
 
+ BEST_LOSS = None
+ MAX_PATIENCE = 1  # early stopping patience
+ CURRENT_PATIENCE = 0  # current patience
+ def early_stopping_fn(eval_results):
+     """
+     This function is called after each evaluation step.
+     If you want to implement early stopping, you can do it here.
+
+     If the model has not improved for MAX_PATIENCE evaluations, it stops the training.
+     """
+     global BEST_LOSS
+     global CURRENT_PATIENCE
+     print(" > Early stopping function called!")
+     print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+     print('Best Loss', BEST_LOSS)
+     print('Current Best Loss', eval_results.best_loss)
+     current_best_loss = eval_results.best_loss['eval_loss']
+     if BEST_LOSS is None:
+         BEST_LOSS = current_best_loss
+     else:
+         if CURRENT_PATIENCE <= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         elif current_best_loss < BEST_LOSS:
+             BEST_LOSS = current_best_loss
+             CURRENT_PATIENCE = 0
+         elif CURRENT_PATIENCE < MAX_PATIENCE:
+             CURRENT_PATIENCE += 1
+         if CURRENT_PATIENCE >= MAX_PATIENCE:
+             print(" > Early stopping!")
+             return True
+         print('Updated Best Loss', BEST_LOSS)
+     return False
+
 
  def main():
      # init args and config
@@ -85,7 +140,7 @@ def main():
          min_conditioning_length=66150,  # 3 secs
          debug_loading_failures=False,
          max_wav_length=255995,  # ~11.6 seconds
-         max_text_length=300,
+         max_text_length=250,
          mel_norm_file=MEL_NORM_FILE,
          dvae_checkpoint=DVAE_CHECKPOINT,
          xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
@@ -133,26 +188,37 @@ def main():
          lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
          test_sentences=[
              {
-                 "text": "राम राम फाल्गुनिबेन , मैं नरेंद्र मोदी बात कर रहा हूँ. पिछले १० साल में भारत विश्वगुरु बनने की दिशा में आगे बढ़ रहा है . आप अपने लोकसभा के प्रत्याशी हसमुख पटेलजी को वोट देकर भारतकी अद्भुत यात्रामें अपना सहयोग दें. धन्यवाद ",
+                 "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
                  "speaker_wav": SPEAKER_REFERENCE,
                  "language": LANGUAGE,
              },
              {
-                 "text": "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ. पिछले १० साल में भारत विश्वगुरु बनने की दिशा में आगे बढ़ रहा है . आप अपने लोकसभा के प्रत्याशी हसमुख पटेलजी को वोट देकर भारतकी अद्भुत यात्रामें अपना सहयोग दें. धन्यवाद ",
+                 "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
                  "speaker_wav": SPEAKER_REFERENCE,
                  "language": LANGUAGE,
              },
             {
-                 "text": "मयना नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ. पिछले १० साल में भारत विश्वगुरु बनने की दिशा में आगे बढ़ रहा है . आप अपने लोकसभा के प्रत्याशी हसमुख पटेलजी को वोट देकर भारतकी अद्भुत यात्रामें अपना सहयोग दें. धन्यवाद ",
+                 "text": "मुख्यमंत्री",
                  "speaker_wav": SPEAKER_REFERENCE,
                  "language": LANGUAGE,
              },
              {
-                 "text": "Hello, Samir.I'm Narendra Modi. In the previous ten years, our government has launched numerous measures that have benefited entrepreneurs. In my third term, India will be the world's third largest economy. Vote for the BJP to see India become a superpower. Thank you!",
+                 "text": "ओलंपिक",
                  "speaker_wav": SPEAKER_REFERENCE,
-                 "language": "en",
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "मेरे देशवासियों जब तक हम चलने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
              },
          ],
+         eval_split_size=0.03
      )
 
      # init the model from config
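
For reference, a minimal inference sketch (not part of this commit) of how a checkpoint produced by one of the runs above is typically loaded with the Coqui TTS XTTS API. The run directory, best_model.pth, config.json and vocab.json paths are hypothetical placeholders derived from the configs above, and whether the trainer-written config.json can be loaded directly into XttsConfig should be checked against the upstream fine-tuning example; the synthesize call itself follows the documented XTTS inference API.

import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Hypothetical paths: point these at the actual run output folder and the vocab.json downloaded by the recipe.
RUN_DIR = "run/training/NewModiSpeech-April-04-2024_09+33AM-0000000"
VOCAB_FILE = "run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/vocab.json"

# load the fine-tuned model configuration and weights
config = XttsConfig()
config.load_json(f"{RUN_DIR}/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=f"{RUN_DIR}/best_model.pth", vocab_path=VOCAB_FILE, eval=True)
model.cuda()

# synthesize one of the Hindi test sentences with the same speaker reference clip used in SPEAKER_REFERENCE
out = model.synthesize(
    "मेरे देशवासियों जब तक हम चलने",
    config,
    speaker_wav="/home/ubuntu/voicetts/ds_path/modispeech.wav",
    language="hi",
)
# out["wav"] holds the generated waveform at the 24 kHz output sample rate configured above
torchaudio.save("modi_sample.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)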