|
import datetime |
|
import glob |
|
import os |
|
import random |
|
import re |
|
|
|
import numpy as np |
|
from scipy import signal |
|
|
|
from TTS.encoder.models.lstm import LSTMSpeakerEncoder |
|
from TTS.encoder.models.resnet import ResNetSpeakerEncoder |
|
from TTS.utils.io import save_fsspec |
|
|
|
|
|
class AugmentWAV(object): |
|
def __init__(self, ap, augmentation_config): |
|
self.ap = ap |
|
self.use_additive_noise = False |
|
|
|
if "additive" in augmentation_config.keys(): |
|
self.additive_noise_config = augmentation_config["additive"] |
|
additive_path = self.additive_noise_config["sounds_path"] |
|
if additive_path: |
|
self.use_additive_noise = True |
|
|
|
self.additive_noise_types = [] |
|
for key in self.additive_noise_config.keys(): |
|
if isinstance(self.additive_noise_config[key], dict): |
|
self.additive_noise_types.append(key) |
|
|
|
additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True) |
|
|
|
self.noise_list = {} |
|
|
|
for wav_file in additive_files: |
|
noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0] |
|
|
|
if noise_dir not in self.additive_noise_types: |
|
continue |
|
if not noise_dir in self.noise_list: |
|
self.noise_list[noise_dir] = [] |
|
self.noise_list[noise_dir].append(wav_file) |
|
|
|
print( |
|
f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" |
|
) |
|
|
|
self.use_rir = False |
|
|
|
if "rir" in augmentation_config.keys(): |
|
self.rir_config = augmentation_config["rir"] |
|
if self.rir_config["rir_path"]: |
|
self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) |
|
self.use_rir = True |
|
|
|
print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") |
|
|
|
self.create_augmentation_global_list() |
|
|
|
def create_augmentation_global_list(self): |
|
if self.use_additive_noise: |
|
self.global_noise_list = self.additive_noise_types |
|
else: |
|
self.global_noise_list = [] |
|
if self.use_rir: |
|
self.global_noise_list.append("RIR_AUG") |
|
|
|
def additive_noise(self, noise_type, audio): |
|
clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) |
|
|
|
noise_list = random.sample( |
|
self.noise_list[noise_type], |
|
random.randint( |
|
self.additive_noise_config[noise_type]["min_num_noises"], |
|
self.additive_noise_config[noise_type]["max_num_noises"], |
|
), |
|
) |
|
|
|
audio_len = audio.shape[0] |
|
noises_wav = None |
|
for noise in noise_list: |
|
noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len] |
|
|
|
if noiseaudio.shape[0] < audio_len: |
|
continue |
|
|
|
noise_snr = random.uniform( |
|
self.additive_noise_config[noise_type]["min_snr_in_db"], |
|
self.additive_noise_config[noise_type]["max_num_noises"], |
|
) |
|
noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4) |
|
noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio |
|
|
|
if noises_wav is None: |
|
noises_wav = noise_wav |
|
else: |
|
noises_wav += noise_wav |
|
|
|
|
|
if noises_wav is None: |
|
return self.additive_noise(noise_type, audio) |
|
|
|
return audio + noises_wav |
|
|
|
def reverberate(self, audio): |
|
audio_len = audio.shape[0] |
|
|
|
rir_file = random.choice(self.rir_files) |
|
rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) |
|
rir = rir / np.sqrt(np.sum(rir**2)) |
|
return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] |
|
|
|
def apply_one(self, audio): |
|
noise_type = random.choice(self.global_noise_list) |
|
if noise_type == "RIR_AUG": |
|
return self.reverberate(audio) |
|
|
|
return self.additive_noise(noise_type, audio) |
|
|
|
|
|
def to_camel(text): |
|
text = text.capitalize() |
|
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) |
|
|
|
|
|
def setup_encoder_model(config: "Coqpit"): |
|
if config.model_params["model_name"].lower() == "lstm": |
|
model = LSTMSpeakerEncoder( |
|
config.model_params["input_dim"], |
|
config.model_params["proj_dim"], |
|
config.model_params["lstm_dim"], |
|
config.model_params["num_lstm_layers"], |
|
use_torch_spec=config.model_params.get("use_torch_spec", False), |
|
audio_config=config.audio, |
|
) |
|
elif config.model_params["model_name"].lower() == "resnet": |
|
model = ResNetSpeakerEncoder( |
|
input_dim=config.model_params["input_dim"], |
|
proj_dim=config.model_params["proj_dim"], |
|
log_input=config.model_params.get("log_input", False), |
|
use_torch_spec=config.model_params.get("use_torch_spec", False), |
|
audio_config=config.audio, |
|
) |
|
return model |
|
|
|
|
|
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): |
|
checkpoint_path = "checkpoint_{}.pth".format(current_step) |
|
checkpoint_path = os.path.join(out_path, checkpoint_path) |
|
print(" | | > Checkpoint saving : {}".format(checkpoint_path)) |
|
|
|
new_state_dict = model.state_dict() |
|
state = { |
|
"model": new_state_dict, |
|
"optimizer": optimizer.state_dict() if optimizer is not None else None, |
|
"criterion": criterion.state_dict(), |
|
"step": current_step, |
|
"epoch": epoch, |
|
"loss": model_loss, |
|
"date": datetime.date.today().strftime("%B %d, %Y"), |
|
} |
|
save_fsspec(state, checkpoint_path) |
|
|
|
|
|
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): |
|
if model_loss < best_loss: |
|
new_state_dict = model.state_dict() |
|
state = { |
|
"model": new_state_dict, |
|
"optimizer": optimizer.state_dict(), |
|
"criterion": criterion.state_dict(), |
|
"step": current_step, |
|
"epoch": epoch, |
|
"loss": model_loss, |
|
"date": datetime.date.today().strftime("%B %d, %Y"), |
|
} |
|
best_loss = model_loss |
|
bestmodel_path = "best_model.pth" |
|
bestmodel_path = os.path.join(out_path, bestmodel_path) |
|
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) |
|
save_fsspec(state, bestmodel_path) |
|
return best_loss |
|
|