Commit 0eca443 (parent: 7b8b31f)

Update paths and add new training data for XTTS with Modi voice
Changed files:
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_06+55AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_07+01AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000/train_gpt_xtts.py (+253, -0)
- recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/train_gpt_xtts.py (+204, -0)
- recipes/ljspeech/xtts_v2/train_gpt_xtts.py (+76, -10)
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_06+55AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager

# Logging parameters
RUN_NAME = "NewModiSpeech"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Set here the path that the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")

# Training Parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
START_WITH_EVAL = True  # if True it will star with evaluation
BATCH_SIZE = 5  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS need to be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE but then set GRAD_ACUMM_STEPS accordingly.

# Define here the dataset that you want to use for the fine-tuning on.
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/home/ubuntu/voicetts/modi/hindi",
    meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
    language="hi",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Define the path where XTTS v2.0.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )


# Training sentences generations
SPEAKER_REFERENCE = [
    # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
    "/home/ubuntu/voicetts/ds_path/modispeech.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
    "/home/ubuntu/voicetts/final/wavs/clip_8.wav",

    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
]
LANGUAGE = config_dataset.language

BEST_LOSS = None
MAX_PATIENCE = 1  # early stopping patience
CURRENT_PATIENCE = 0  # current patience
def early_stopping_fn(eval_results):
    """
    This function is called after each evaluation step.
    If you want to implement early stopping, you can do it here.

    If model has not imporoved for 25 epoches, it will stop the training.
    """
    global BEST_LOSS
    global CURRENT_PATIENCE
    print(" > Early stopping function called!")
    print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
    print('Best Loss', BEST_LOSS)
    print('Current Best Loss', eval_results.best_loss)
    current_best_loss = eval_results.best_loss['eval_loss']
    if BEST_LOSS is None:
        BEST_LOSS = current_best_loss
    else:
        if CURRENT_PATIENCE <= MAX_PATIENCE:
            print(" > Early stopping!")
            return True
        elif current_best_loss < BEST_LOSS:
            BEST_LOSS = current_best_loss
            CURRENT_PATIENCE = 0
        elif CURRENT_PATIENCE < MAX_PATIENCE:
            CURRENT_PATIENCE += 1
            if CURRENT_PATIENCE >= MAX_PATIENCE:
                print(" > Early stopping!")
                return True
    print('Updated Best Loss', BEST_LOSS)
    return False


def main():
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=300,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=10000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=True,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[
            {
                "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मुख्यमंत्री",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "ओलंपिक",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "मेरे देशवासियों जब तक हम चलने",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
        eval_split_size=0.05
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
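One note on the early-stopping helper added in this snapshot: as written, the first branch inside the else block (CURRENT_PATIENCE <= MAX_PATIENCE) is checked before the loss comparison, so once BEST_LOSS has been set the function returns True on the very next evaluation whether or not the loss improved, and the docstring's "25 epochs" does not match MAX_PATIENCE = 1. The function is also never passed to the Trainer in this script, so it currently has no effect. A minimal sketch of the presumably intended "stop after MAX_PATIENCE evaluations without improvement" behaviour, keeping the same eval_results.best_loss["eval_loss"] access used above (how to register it as a trainer callback is not shown in this commit and would need to be checked against the trainer's API):

BEST_LOSS = None
MAX_PATIENCE = 1
CURRENT_PATIENCE = 0


def early_stopping_fn(eval_results):
    """Return True once eval loss has failed to improve for MAX_PATIENCE evaluations."""
    global BEST_LOSS, CURRENT_PATIENCE
    current_loss = eval_results.best_loss["eval_loss"]
    if BEST_LOSS is None or current_loss < BEST_LOSS:
        # First evaluation, or an improvement: record it and reset the patience counter.
        BEST_LOSS = current_loss
        CURRENT_PATIENCE = 0
        return False
    # No improvement: spend one unit of patience and stop when it runs out.
    CURRENT_PATIENCE += 1
    if CURRENT_PATIENCE >= MAX_PATIENCE:
        print(" > Early stopping!")
        return True
    return False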
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_07+01AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
Identical to the 06:55AM snapshot above, except that GPTArgs sets max_text_length=250 instead of 300.
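The only change in this 07:01AM snapshot is the tighter max_text_length. Samples whose transcript exceeds the limit are dropped (or fail to load) in the XTTS GPT trainer's data loading, so it can be worth counting how many rows of metadata.txt a given value would affect before launching a run. A small standalone check, assuming the pipe-separated LJSpeech-style rows this recipe's ljspeech formatter reads and that the limit applies to the character length of the text column:

# Count metadata rows whose transcript would exceed GPTArgs.max_text_length.
# Path and limit are taken from this snapshot; the column handling assumes the
# usual LJSpeech layout: <utt_id>|<raw text>|<normalized text>.
MAX_TEXT_LENGTH = 250
META_FILE = "/home/ubuntu/voicetts/modi/hindi/metadata.txt"

too_long = 0
total = 0
with open(META_FILE, encoding="utf-8") as meta:
    for row in meta:
        text = row.rstrip("\n").split("|")[-1]  # last column (normalized text)
        total += 1
        if len(text) > MAX_TEXT_LENGTH:
            too_long += 1
print(f"{too_long}/{total} transcripts longer than {MAX_TEXT_LENGTH} characters")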
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,253 @@
Identical to the 06:55AM snapshot above, except for four values: BATCH_SIZE = 10 (instead of 5), max_text_length=250 (instead of 300), print_eval=False (instead of True), and eval_split_size=0.03 (instead of 0.05).
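This 09:33AM run doubles BATCH_SIZE to 10 but leaves GRAD_ACUMM_STEPS at 84, so the effective batch per optimizer step grows from 5 × 84 = 420 to 10 × 84 = 840 samples. The recipe's own note only asks for BATCH_SIZE * GRAD_ACUMM_STEPS to be at least 252, so when a larger BATCH_SIZE fits in memory the accumulation factor can usually be lowered to keep the effective batch roughly constant. A small illustrative helper (the 252 target comes from the recipe comment; the helper itself is not part of the commit):

import math


def grad_accum_for(batch_size: int, target_effective_batch: int = 252) -> int:
    """Smallest accumulation factor with batch_size * steps >= target_effective_batch."""
    return max(1, math.ceil(target_effective_batch / batch_size))


for bs in (5, 10, 21, 42):
    steps = grad_accum_for(bs)
    print(f"batch_size={bs:>2}  grad_accum_steps={steps:>2}  effective_batch={bs * steps}")
# With batch_size=10, 26 accumulation steps already reach the recommended 252,
# whereas the committed value of 84 gives an effective batch of 840.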
recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/train_gpt_xtts.py
ADDED
@@ -0,0 +1,204 @@
An earlier (March-29) snapshot of the same recipe. It matches the 06:55AM snapshot above except for the following:
- config_dataset points at path="/home/ubuntu/voicetts/finalsingle" with meta_file_train="/home/ubuntu/voicetts/finalsingle/metadata.txt".
- CHECKPOINTS_OUT_PATH joins OUT_PATH with "NewModiSpeech-March-28-2024_06+57AM-0000000/".
- SPEAKER_REFERENCE lists only the seven active wav files; the commented-out modienglish clips are not present.
- There are no BEST_LOSS / MAX_PATIENCE / CURRENT_PATIENCE globals and no early_stopping_fn, which is why this file is 204 lines instead of 253.
- GPTArgs uses max_text_length=200.
- GPTTrainerConfig additionally sets epochs=50, uses print_eval=False, and does not set eval_split_size.
- test_sentences holds six short prompts: "प्रतिबद्धता", "भव्य", "मयना", "फाल्गुनिबेन", "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ." and "राम राम फाल्गुनिबेन , मैं नरेंद्र मोदी बात कर रहा हूँ.".
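All four snapshots read the dataset through the stock ljspeech formatter, which expects an LJSpeech-style layout: a pipe-separated metadata file (here metadata.txt) with one row per clip and the audio stored as wavs/<clip_id>.wav next to it. A minimal sketch of producing such a file from (clip id, transcript) pairs; the helper name and the example row are illustrative, not part of the commit:

from pathlib import Path


def write_ljspeech_metadata(root: str, rows) -> None:
    """Write <root>/metadata.txt; each row is (clip_id, transcript), with audio
    expected at <root>/wavs/<clip_id>.wav."""
    root_path = Path(root)
    (root_path / "wavs").mkdir(parents=True, exist_ok=True)
    with open(root_path / "metadata.txt", "w", encoding="utf-8") as meta:
        for clip_id, text in rows:
            # LJSpeech rows carry raw and normalized text; both columns are filled here.
            meta.write(f"{clip_id}|{text}|{text}\n")


write_ljspeech_metadata(
    "/home/ubuntu/voicetts/finalsingle",  # dataset path used by this snapshot
    [("clip_1", "भव्य नमस्कार , मैं नरेंद्र मोदी बात कर रहा हूँ.")],  # illustrative row
)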
recipes/ljspeech/xtts_v2/train_gpt_xtts.py
CHANGED
@@ -27,8 +27,8 @@ GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
 config_dataset = BaseDatasetConfig(
     formatter="ljspeech",
     dataset_name="ljspeech",
-    path="/home/ubuntu/voicetts/
-    meta_file_train="/home/ubuntu/voicetts/
+    path="/home/ubuntu/voicetts/modi/hindi",
+    meta_file_train="/home/ubuntu/voicetts/modi/hindi/metadata.txt",
     language="hi",
 )
 
@@ -36,7 +36,7 @@ config_dataset = BaseDatasetConfig(
 DATASETS_CONFIG_LIST = [config_dataset]
 
 # Define the path where XTTS v2.0.1 files will be downloaded
-CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "
+CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "NewModiSpeech-March-29-2024_10+35AM-0000000/")
 os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
 
 
@@ -73,10 +73,65 @@ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
 # Training sentences generations
 SPEAKER_REFERENCE = [
     # "./tests/data/ljspeech/wavs/LJ001-0002.wav"  # speaker reference to be used in training test sentences
-    "/home/ubuntu/voicetts/ds_path/
+    "/home/ubuntu/voicetts/ds_path/modispeech.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_1.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_4.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_5.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_6.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_7.wav",
+    "/home/ubuntu/voicetts/final/wavs/clip_8.wav",
+
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_2.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_3.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_4.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_5.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_6.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_7.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_8.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_9.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_10.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_11.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_12.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_13.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_14.wav",
+    # "/home/ubuntu/voicetts/modi/modienglish/wavs/clip_15.wav",
 ]
 LANGUAGE = config_dataset.language
 
+BEST_LOSS = None
+MAX_PATIENCE = 1  # early stopping patience
+CURRENT_PATIENCE = 0  # current patience
+def early_stopping_fn(eval_results):
+    """
+    This function is called after each evaluation step.
+    If you want to implement early stopping, you can do it here.
+
+    If model has not imporoved for 25 epoches, it will stop the training.
+    """
+    global BEST_LOSS
+    global CURRENT_PATIENCE
+    print(" > Early stopping function called!")
+    print(f" > Current patience: {CURRENT_PATIENCE}/{MAX_PATIENCE}")
+    print('Best Loss', BEST_LOSS)
+    print('Current Best Loss', eval_results.best_loss)
+    current_best_loss = eval_results.best_loss['eval_loss']
+    if BEST_LOSS is None:
+        BEST_LOSS = current_best_loss
+    else:
+        if CURRENT_PATIENCE <= MAX_PATIENCE:
+            print(" > Early stopping!")
+            return True
+        elif current_best_loss < BEST_LOSS:
+            BEST_LOSS = current_best_loss
+            CURRENT_PATIENCE = 0
+        elif CURRENT_PATIENCE < MAX_PATIENCE:
+            CURRENT_PATIENCE += 1
+            if CURRENT_PATIENCE >= MAX_PATIENCE:
+                print(" > Early stopping!")
+                return True
+    print('Updated Best Loss', BEST_LOSS)
+    return False
+
 
 def main():
     # init args and config
@@ -85,7 +140,7 @@ def main():
         min_conditioning_length=66150,  # 3 secs
         debug_loading_failures=False,
         max_wav_length=255995,  # ~11.6 seconds
-        max_text_length=
+        max_text_length=250,
         mel_norm_file=MEL_NORM_FILE,
         dvae_checkpoint=DVAE_CHECKPOINT,
         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
@@ -133,26 +188,37 @@ def main():
         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
         test_sentences=[
             {
-                "text": "
+                "text": "इसका फायदा भी उठा सकते हैं एक छोटे जी और आग्रह पूर्वक इसको करिए आप देखिए गरीब के साथ आपका कैसा जुड़ा होता है उस पर हम को कैसे सफलता मिलती",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
+            },
+            {
+                "text": "अपने आप को भूल चुके हम जैसे निराश्रित बन गए नहीं मेरे प्यारे भाइयों बहनों ऐसा नहीं हो सकता स्वामी विवेकानंद अक्सर एक बात हमेशा बताया कर शायद यह बात उन्होंने",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "मुझे उनका सुझाव अच्छा लगा मेरा अपना अनुभव है गुजरात में मुख्यमंत्री 2011 में एथेंस में",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "मुख्यमंत्री",
                 "speaker_wav": SPEAKER_REFERENCE,
                 "language": LANGUAGE,
             },
             {
-                "text": "
+                "text": "ओलंपिक",
                 "speaker_wav": SPEAKER_REFERENCE,
-                "language": 
+                "language": LANGUAGE,
+            },
+            {
+                "text": "मेरे देशवासियों जब तक हम चलने",
+                "speaker_wav": SPEAKER_REFERENCE,
+                "language": LANGUAGE,
             },
         ],
+        eval_split_size=0.03
     )
 
     # init the model from config
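After trainer.fit() finishes, the run folder created under OUT_PATH holds the fine-tuned GPT checkpoint together with the config it was trained with. A rough sketch of auditioning the voice with the XTTS inference classes follows; the run-directory and file names are assumptions (they depend on when the run was launched and what the trainer saved), and loading a GPT-trainer run this way may need the checkpoint and vocab paths adjusted:

import soundfile as sf

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Assumed locations; check the actual run folder before use.
RUN_DIR = "recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-April-04-2024_09+33AM-0000000"
VOCAB_FILE = "recipes/ljspeech/xtts_v2/run/training/NewModiSpeech-March-29-2024_10+35AM-0000000/vocab.json"

config = XttsConfig()
config.load_json(f"{RUN_DIR}/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=RUN_DIR, vocab_path=VOCAB_FILE, use_deepspeed=False)
model.cuda()

out = model.synthesize(
    "मेरे देशवासियों जब तक हम चलने",  # one of the committed test sentences
    config,
    speaker_wav="/home/ubuntu/voicetts/ds_path/modispeech.wav",  # committed reference clip
    language="hi",
)
sf.write("modi_sample.wav", out["wav"], 24000)  # output_sample_rate from the training config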