import re

import librosa
import numpy as np
import torch
from torch import FloatTensor, LongTensor, inference_mode, no_grad

import utils
from utils.sentence import sentence_split_and_markup
from vits import commons
from vits.mel_processing import spectrogram_torch
from vits.models import SynthesizerTrn
from vits.text import text_to_sequence


class VITS:
    """Thin wrapper around a VITS SynthesizerTrn checkpoint.

    Supports plain text-to-speech ("vits"), HuBERT-unit-driven synthesis from
    reference audio ("hubert"), and w2v2 emotion-embedding models ("w2v2").
    """

    def __init__(self, model, config, additional_model=None, model_type=None,
                 device=torch.device("cpu"), **kwargs):
        self.model_type = model_type
        self.hps_ms = utils.get_hparams_from_file(config)
        self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0)
        self.n_symbols = len(getattr(self.hps_ms, 'symbols', []))
        self.speakers = getattr(self.hps_ms, 'speakers', ['0'])
        if not isinstance(self.speakers, list):
            # Speaker maps may come as a {name: id} mapping; sort by id so the
            # list index matches the speaker id.
            self.speakers = [item[0] for item in sorted(self.speakers.items(), key=lambda x: x[1])]
        self.use_f0 = getattr(self.hps_ms.data, 'use_f0', False)
        # Older configs keep these flags under `data`, newer ones under `model`.
        self.emotion_embedding = getattr(self.hps_ms.data, 'emotion_embedding',
                                         getattr(self.hps_ms.model, 'emotion_embedding', False))
        self.bert_embedding = getattr(self.hps_ms.data, 'bert_embedding',
                                      getattr(self.hps_ms.model, 'bert_embedding', False))
        self.hps_ms.model.emotion_embedding = self.emotion_embedding
        self.hps_ms.model.bert_embedding = self.bert_embedding

        self.net_g_ms = SynthesizerTrn(
            self.n_symbols,
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            n_speakers=self.n_speakers,
            **self.hps_ms.model)
        _ = self.net_g_ms.eval()
        self.device = device

        self.load_model(model, additional_model)

    def load_model(self, model, additional_model=None):
        """Load generator weights and keep the auxiliary model the type requires."""
        utils.load_checkpoint(model, self.net_g_ms)
        self.net_g_ms.to(self.device)
        if self.model_type == "hubert":
            self.hubert = additional_model
        elif self.model_type == "w2v2":
            self.emotion_reference = additional_model

    def get_cleaned_text(self, text, hps, cleaned=False):
        """Convert text to a symbol-id tensor.

        Returns (text_norm, char_embeds) for BERT-embedding models, otherwise
        just text_norm.
        """
        if cleaned:
            # Text is already cleaned; map symbols directly.
            text_norm = text_to_sequence(text, hps.symbols, [])
        else:
            if self.bert_embedding:
                text_norm, char_embeds = text_to_sequence(text, hps.symbols, hps.data.text_cleaners,
                                                          bert_embedding=self.bert_embedding)
                text_norm = LongTensor(text_norm)
                return text_norm, char_embeds
            else:
                text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = LongTensor(text_norm)
        return text_norm

    def get_cleaner(self):
        return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]

    def get_speakers(self, escape=False):
        return self.speakers

    def infer(self, params):
        """Run the generator on one prepared parameter dict; return a float numpy waveform."""
        with no_grad():
            x_tst = params.get("stn_tst").unsqueeze(0).to(self.device)
            x_tst_lengths = LongTensor([params.get("stn_tst").size(0)]).to(self.device)
            x_tst_prosody = torch.FloatTensor(params.get("char_embeds")).unsqueeze(0).to(
                self.device) if self.bert_embedding else None
            sid = params.get("sid").to(self.device) if not self.bert_embedding else None
            emotion = params.get("emotion").to(self.device) if self.emotion_embedding else None

            audio = self.net_g_ms.infer(x=x_tst,
                                        x_lengths=x_tst_lengths,
                                        sid=sid,
                                        noise_scale=params.get("noise_scale"),
                                        noise_scale_w=params.get("noise_scale_w"),
                                        length_scale=params.get("length_scale"),
                                        emotion_embedding=emotion,
                                        bert=x_tst_prosody)[0][0, 0].data.float().cpu().numpy()

        torch.cuda.empty_cache()

        return audio

    def get_infer_param(self, length_scale, noise_scale, noise_scale_w, text=None, speaker_id=None, audio_path=None,
                        emotion=None, cleaned=False, f0_scale=1):
        """Build the parameter dict consumed by infer() for one sentence or one audio clip."""
        emo = None
        char_embeds = None
        if self.model_type != "hubert":
            if self.bert_embedding:
                stn_tst, char_embeds = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
                sid = None
            else:
                stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
                sid = LongTensor([speaker_id])

            if self.model_type == "w2v2":
                # Look up the precomputed emotion reference vector by index.
                emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)

        elif self.model_type == "hubert":
            if self.use_f0:
                audio, sampling_rate = librosa.load(audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
                audio16000 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
            else:
                audio16000, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)

            with inference_mode():
                # The HuBERT unit encoder expects 16 kHz audio shaped (batch, channels, samples).
                units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
            if self.use_f0:
                f0 = librosa.pyin(audio,
                                  sr=sampling_rate,
                                  fmin=librosa.note_to_hz('C0'),
                                  fmax=librosa.note_to_hz('C7'),
                                  frame_length=1780)[0]
                # Resample the f0 track to one value per unit frame, then scale it.
                target_length = len(units[:, 0])
                f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
                                             np.arange(0, len(f0)), f0)) * f0_scale
                units[:, 0] = f0 / 10

            stn_tst = FloatTensor(units)
            sid = LongTensor([speaker_id])
        params = {"length_scale": length_scale, "noise_scale": noise_scale,
                  "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
                  "sid": sid, "emotion": emo, "char_embeds": char_embeds}

        return params

    def get_tasks(self, voice):
        """Split a voice request into per-sentence parameter dicts for infer(); see the sketch after this method."""
        text = voice.get("text", None)
        speaker_id = voice.get("id", 0)
        length = voice.get("length", 1)
        noise = voice.get("noise", 0.667)
        noisew = voice.get("noisew", 0.8)
        max_len = voice.get("max", 50)
        lang = voice.get("lang", "auto")
        speaker_lang = voice.get("speaker_lang", None)
        audio_path = voice.get("audio_path", None)
        emotion = voice.get("emotion", 0)

        # Collapse runs of whitespace before splitting into sentences.
        if text is not None:
            text = re.sub(r'\s+', ' ', text).strip()

        tasks = []
        if self.model_type == "vits":
            sentence_list = sentence_split_and_markup(text, max_len, lang, speaker_lang)
            for sentence in sentence_list:
                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
                                              noise_scale=noise, noise_scale_w=noisew)
                tasks.append(params)

        elif self.model_type == "hubert":
            params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
                                          noise_scale_w=noisew, audio_path=audio_path)
            tasks.append(params)

        elif self.model_type == "w2v2":
            sentence_list = sentence_split_and_markup(text, max_len, lang, speaker_lang)
            for sentence in sentence_list:
                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
                                              noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
                tasks.append(params)

        return tasks
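
    # A sketch of the "voice" request dict consumed by get_tasks(), get_audio(),
    # and get_stream_audio(); keys and defaults mirror the .get(...) calls
    # above, the values are only illustrative:
    #
    #     {
    #         "text": "Hello world.",  # input text (hubert uses "audio_path" instead)
    #         "id": 0,                 # speaker id
    #         "length": 1.0,           # length_scale, >1 slows speech down
    #         "noise": 0.667,          # noise_scale
    #         "noisew": 0.8,           # noise_scale_w
    #         "max": 50,               # max sentence length before splitting
    #         "lang": "auto",          # language hint for cleaning/splitting
    #         "emotion": 0,            # w2v2 models: emotion reference index
    #     }
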
    def get_audio(self, voice, auto_break=False):
        """Synthesize all tasks for a request and return one concatenated waveform."""
        tasks = self.get_tasks(voice)

        # 0.75 s of silence appended after each chunk when auto_break is set;
        # use the model's sampling rate so the pause length is correct, and
        # float32 to match the dtype infer() returns.
        brk = np.zeros(int(0.75 * self.hps_ms.data.sampling_rate), dtype=np.float32)

        audios = []
        for task in tasks:
            if auto_break:
                chunk = np.concatenate((self.infer(task), brk), axis=0)
            else:
                chunk = self.infer(task)
            audios.append(chunk)

        audio = np.concatenate(audios, axis=0)
        return audio

    def get_stream_audio(self, voice, auto_break=False):
        """Like get_audio(), but yield each chunk as soon as it is synthesized."""
        tasks = self.get_tasks(voice)

        brk = np.zeros(int(0.75 * self.hps_ms.data.sampling_rate), dtype=np.float32)

        for task in tasks:
            if auto_break:
                chunk = np.concatenate((self.infer(task), brk), axis=0)
            else:
                chunk = self.infer(task)

            yield chunk

    def voice_conversion(self, voice):
        """Convert a recording from one known speaker id to another."""
        audio_path = voice.get("audio_path")
        original_id = voice.get("original_id")
        target_id = voice.get("target_id")

        audio = utils.load_audio_to_torch(
            audio_path, self.hps_ms.data.sampling_rate)

        y = audio.unsqueeze(0)

        # Linear spectrogram of the source audio, using the training settings.
        spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
                                 self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length,
                                 self.hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])

        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = self.net_g_ms.voice_conversion(spec.to(self.device),
                                                   spec_lengths.to(self.device),
                                                   sid_src=sid_src.to(self.device),
                                                   sid_tgt=sid_tgt.to(self.device))[0][0, 0].data.cpu().float().numpy()

        torch.cuda.empty_cache()

        return audio
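

# A minimal usage sketch, assuming a plain multi-speaker VITS checkpoint and
# its matching config; the file names below are placeholders, not shipped files.
if __name__ == "__main__":
    tts = VITS(model="G_latest.pth", config="config.json", model_type="vits")
    voice = {"text": "Hello world.", "id": 0, "lang": "auto"}
    wav = tts.get_audio(voice, auto_break=True)
    # `wav` is a float32 numpy waveform at tts.hps_ms.data.sampling_rate; write
    # it with e.g. soundfile.write("out.wav", wav, tts.hps_ms.data.sampling_rate).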