maolili committed on
Commit c3eaaf0 · 1 Parent(s): 6626ab7

Delete voice.py

Files changed (1)
  1. voice.py +0 -532
voice.py DELETED
@@ -1,532 +0,0 @@
- import os
- import librosa
- import commons
- import re
- import numpy as np
- import torch
- import xml.etree.ElementTree as ET
- import config
- import logging
- import soundfile as sf
- from torch import no_grad, LongTensor, inference_mode, FloatTensor
- from io import BytesIO
- from graiax import silkcoder
- from utils.nlp import sentence_split
- from mel_processing import spectrogram_torch
- from text import text_to_sequence
- from models import SynthesizerTrn
- from utils import utils
-
- # torch.set_num_threads(1)  # set the number of torch threads to 1
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
- class vits:
-     def __init__(self, model, config, model_=None, model_type=None):
-         self.model_type = model_type
-         self.hps_ms = utils.get_hparams_from_file(config)
-         self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0)
-         self.n_symbols = len(getattr(self.hps_ms, 'symbols', []))
-         self.speakers = getattr(self.hps_ms, 'speakers', ['0'])
-         self.use_f0 = getattr(self.hps_ms.data, 'use_f0', False)
-         self.emotion_embedding = getattr(self.hps_ms.data, 'emotion_embedding',
-                                          getattr(self.hps_ms.model, 'emotion_embedding', False))
-         self.bert_embedding = getattr(self.hps_ms.data, 'bert_embedding',
-                                       getattr(self.hps_ms.model, 'bert_embedding', False))
-         self.hps_ms.model.emotion_embedding = self.emotion_embedding
-         self.hps_ms.model.bert_embedding = self.bert_embedding
-
-         self.net_g_ms = SynthesizerTrn(
-             self.n_symbols,
-             self.hps_ms.data.filter_length // 2 + 1,
-             self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
-             n_speakers=self.n_speakers,
-             **self.hps_ms.model)
-         _ = self.net_g_ms.eval()
-
-         # load model
-         self.load_model(model, model_)
-
-     def load_model(self, model, model_=None):
-         utils.load_checkpoint(model, self.net_g_ms)
-         self.net_g_ms.to(device)
-         if self.model_type == "hubert":
-             self.hubert = model_
-         elif self.model_type == "w2v2":
-             self.emotion_reference = model_
-
-     def get_cleaned_text(self, text, hps, cleaned=False):
-         if cleaned:
-             text_norm = text_to_sequence(text, hps.symbols, [])
-         else:
-             if self.bert_embedding:
-                 text_norm, char_embed = text_to_sequence(text, hps.symbols, hps.data.text_cleaners,
-                                                          bert_embedding=self.bert_embedding)
-                 text_norm = LongTensor(text_norm)
-                 return text_norm, char_embed
-             else:
-                 text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
-         if hps.data.add_blank:
-             text_norm = commons.intersperse(text_norm, 0)
-         text_norm = LongTensor(text_norm)
-         return text_norm
-
-     def get_cleaner(self):
-         return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
-
-     def get_speakers(self, escape=False):
-         return self.speakers
-
-     def infer(self, params):
-         with no_grad():
-             x_tst = params.get("stn_tst").unsqueeze(0).to(device)
-             x_tst_lengths = LongTensor([params.get("stn_tst").size(0)]).to(device)
-             x_tst_prosody = torch.FloatTensor(params.get("char_embeds")).unsqueeze(0).to(
-                 device) if self.bert_embedding else None
-             sid = params.get("sid").to(device) if not self.bert_embedding else None
-             emotion = params.get("emotion").to(device) if self.emotion_embedding else None
-
-             audio = self.net_g_ms.infer(x=x_tst,
-                                         x_lengths=x_tst_lengths,
-                                         sid=sid,
-                                         noise_scale=params.get("noise_scale"),
-                                         noise_scale_w=params.get("noise_scale_w"),
-                                         length_scale=params.get("length_scale"),
-                                         emotion_embedding=emotion,
-                                         bert=x_tst_prosody)[0][0, 0].data.float().cpu().numpy()
-
-         torch.cuda.empty_cache()
-
-         return audio
-
-     def get_infer_param(self, length_scale, noise_scale, noise_scale_w, text=None, speaker_id=None, audio_path=None,
-                         emotion=None, cleaned=False, f0_scale=1):
-         emo = None
-         char_embeds = None
-         if self.model_type != "hubert":
-             if self.bert_embedding:
-                 stn_tst, char_embeds = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
-                 sid = None
-             else:
-                 stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
-                 sid = LongTensor([speaker_id])
-
-             if self.model_type == "w2v2":
-                 # if emotion_reference.endswith('.npy'):
-                 #     emotion = np.load(emotion_reference)
-                 #     emotion = FloatTensor(emotion).unsqueeze(0)
-                 # else:
-                 #     audio16000, sampling_rate = librosa.load(
-                 #         emotion_reference, sr=16000, mono=True)
-                 #     emotion = self.w2v2(audio16000, sampling_rate)[
-                 #         'hidden_states']
-                 #     emotion_reference = re.sub(
-                 #         r'\..*$', '', emotion_reference)
-                 #     np.save(emotion_reference, emotion.squeeze(0))
-                 #     emotion = FloatTensor(emotion)
-                 emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
-
-         elif self.model_type == "hubert":
-             if self.use_f0:
-                 audio, sampling_rate = librosa.load(audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
-                 audio16000 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-             else:
-                 audio16000, sampling_rate = librosa.load(audio_path, sr=16000, mono=True)
-
-             with inference_mode():
-                 units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
-                 if self.use_f0:
-                     f0 = librosa.pyin(audio,
-                                       sr=sampling_rate,
-                                       fmin=librosa.note_to_hz('C0'),
-                                       fmax=librosa.note_to_hz('C7'),
-                                       frame_length=1780)[0]
-                     target_length = len(units[:, 0])
-                     f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
-                                                  np.arange(0, len(f0)), f0)) * f0_scale
-                     units[:, 0] = f0 / 10
-
-             stn_tst = FloatTensor(units)
-             sid = LongTensor([speaker_id])
-
-         params = {"length_scale": length_scale, "noise_scale": noise_scale,
-                   "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
-                   "sid": sid, "emotion": emo, "char_embeds": char_embeds}
-
-         return params
-
-     def get_tasks(self, voice):
-         text = voice.get("text", None)
-         speaker_id = voice.get("id", 0)
-         length = voice.get("length", 1)
-         noise = voice.get("noise", 0.667)
-         noisew = voice.get("noisew", 0.8)
-         max = voice.get("max", 50)
-         lang = voice.get("lang", "auto")
-         speaker_lang = voice.get("speaker_lang", None)
-         audio_path = voice.get("audio_path", None)
-         emotion = voice.get("emotion", 0)
-
-         # strip all redundant whitespace
-         if text is not None: text = re.sub(r'\s+', ' ', text).strip()
-
-         tasks = []
-         if self.model_type == "vits":
-             sentence_list = sentence_split(text, max, lang, speaker_lang)
-             for sentence in sentence_list:
-                 params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
-                                               noise_scale=noise, noise_scale_w=noisew)
-                 tasks.append(params)
-
-         elif self.model_type == "hubert":
-             params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
-                                           noise_scale_w=noisew, audio_path=audio_path)
-             tasks.append(params)
-
-         elif self.model_type == "w2v2":
-             sentence_list = sentence_split(text, max, lang, speaker_lang)
-             for sentence in sentence_list:
-                 params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
-                                               noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
-                 tasks.append(params)
-
-         return tasks
-
-     def get_audio(self, voice, auto_break=False):
-         tasks = self.get_tasks(voice)
-         # insert a 0.75s pause so that separately synthesized segments do not join abruptly when concatenated
-         brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
-         audios = []
-         for task in tasks:
-             if auto_break:
-                 chunk = np.concatenate((self.infer(task), brk), axis=0)
-             else:
-                 chunk = self.infer(task)
-             audios.append(chunk)
-
-         audio = np.concatenate(audios, axis=0)
-         return audio
-
-     def get_stream_audio(self, voice, auto_break=False):
-         tasks = self.get_tasks(voice)
-
-         brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
-         for task in tasks:
-             if auto_break:
-                 chunk = np.concatenate((self.infer(task), brk), axis=0)
-             else:
-                 chunk = self.infer(task)
-
-             yield chunk
-
-     def voice_conversion(self, voice):
-         audio_path = voice.get("audio_path")
-         original_id = voice.get("original_id")
-         target_id = voice.get("target_id")
-
-         audio = utils.load_audio_to_torch(
-             audio_path, self.hps_ms.data.sampling_rate)
-
-         y = audio.unsqueeze(0)
-
-         spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
-                                  self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length,
-                                  self.hps_ms.data.win_length,
-                                  center=False)
-         spec_lengths = LongTensor([spec.size(-1)])
-         sid_src = LongTensor([original_id])
-
-         with no_grad():
-             sid_tgt = LongTensor([target_id])
-             audio = self.net_g_ms.voice_conversion(spec.to(device),
-                                                    spec_lengths.to(device),
-                                                    sid_src=sid_src.to(device),
-                                                    sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
-
-         torch.cuda.empty_cache()
-
-         return audio
-
-
- class TTS:
-     def __init__(self, voice_obj, voice_speakers):
-         self._voice_obj = voice_obj
-         self._voice_speakers = voice_speakers
-         self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
-         self._speakers_count = sum([len(self._voice_speakers[i]) for i in self._voice_speakers])
-         self._vits_speakers_count = len(self._voice_speakers["VITS"])
-         self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
-         self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
-         self.dem = None
-
-         # Initialization information
-         self.logger = logging.getLogger("vits-simple-api")
-         self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
-         self.logger.info(f'device:{device} device.type:{device.type}')
-
-         if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) is not None:
-             try:
-                 import audonnx
-                 root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
-                 model_file = config.DIMENSIONAL_EMOTION_MODEL
-                 self.dem = audonnx.load(root=root, model_file=model_file)
-             except Exception as e:
-                 self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")
-
-         if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
-         if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
-         if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
-         self.logger.info(f"{self._speakers_count} speakers in total")
-         if self._speakers_count == 0:
-             self.logger.warning("No model was loaded")
-
-     @property
-     def voice_speakers(self):
-         return self._voice_speakers
-
-     @property
-     def speakers_count(self):
-         return self._speakers_count
-
-     @property
-     def vits_speakers_count(self):
-         return self._vits_speakers_count
-
-     @property
-     def hubert_speakers_count(self):
-         return self._hubert_speakers_count
-
-     @property
-     def w2v2_speakers_count(self):
-         return self._w2v2_speakers_count
-
-     def encode(self, sampling_rate, audio, format):
-         with BytesIO() as f:
-             if format.upper() == 'OGG':
-                 sf.write(f, audio, sampling_rate, format="ogg")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'SILK':
-                 sf.write(f, audio, sampling_rate, format="wav")
-                 return BytesIO(silkcoder.encode(f))
-             elif format.upper() == 'MP3':
-                 sf.write(f, audio, sampling_rate, format="mp3")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'WAV':
-                 sf.write(f, audio, sampling_rate, format="wav")
-                 return BytesIO(f.getvalue())
-             elif format.upper() == 'FLAC':
-                 sf.write(f, audio, sampling_rate, format="flac")
-                 return BytesIO(f.getvalue())
-             else:
-                 raise ValueError(f"Unsupported format:{format}")
-
-     def convert_time_string(self, time_string):
-         time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
-         time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
-
-         if time_unit.upper() == 'MS':
-             return time_value / 1000
-         elif time_unit.upper() == 'S':
-             return time_value
-         elif time_unit.upper() == 'MIN':
-             return time_value * 60
-         elif time_unit.upper() == 'H':
-             return time_value * 3600
-         elif time_unit.upper() == 'D':
-             return time_value * 24 * 3600  # surely nobody actually writes D, right?
-         else:
-             raise ValueError("Unsupported time unit: {}".format(time_unit))
-
-     def generate_audio_chunks(self, audio):
-         chunk_size = 4096
-         while True:
-             chunk = audio.read(chunk_size)
-             if not chunk:
-                 break
-             yield chunk
-
-     def parse_ssml(self, ssml):
-         root = ET.fromstring(ssml)
-         format = root.attrib.get("format", "wav")
-         voice_tasks = []
-         brk_count = 0
-         strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
-
-         for element in root.iter():
-             if element.tag == "voice":
-                 id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
-                 lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
-                 length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
-                 noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
-                 noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
-                 max = int(element.attrib.get("max", root.attrib.get("max", "0")))
-                 # defaults to vits if not specified
-                 model = element.attrib.get("model", root.attrib.get("model", "vits"))
-                 # only w2v2-vits/emotion-vits support emotion
-                 emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
-
-                 voice_element = ET.tostring(element, encoding='unicode')
-
-                 pattern_voice = r'<voice.*?>(.*?)</voice>'
-                 pattern_break = r'<break\s*?(.*?)\s*?/>'
-
-                 matches_voice = re.findall(pattern_voice, voice_element)[0]
-                 matches_break = re.split(pattern_break, matches_voice)
-                 for match in matches_break:
-                     strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
-                     time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
-                     # break tag with a strength attribute
-                     if strength:
-                         brk = strength_dict[strength.group(1)]
-                         voice_tasks.append({"break": brk})
-                         brk_count += 1
-                     # break tag with a time attribute
-                     elif time:
-                         brk = self.convert_time_string(time.group(1))
-                         voice_tasks.append({"break": brk})
-                         brk_count += 1
-                     # an empty match means a bare break tag, so use the default 0.75s pause
-                     elif match == "":
-                         voice_tasks.append({"break": 0.75})
-                         brk_count += 1
-                     # anything left inside the voice tag besides break is text
-                     else:
-                         voice_tasks.append({"id": id,
-                                             "text": match,
-                                             "lang": lang,
-                                             "length": length,
-                                             "noise": noise,
-                                             "noisew": noisew,
-                                             "max": max,
-                                             "model": model,
-                                             "emotion": emotion
-                                             })
-
-                 # 0.75s pause at the end of each segment
-                 voice_tasks.append({"break": 0.75})
-             elif element.tag == "break":
-                 # brk_count > 0 means the break tags inside voice elements were already handled
-                 if brk_count > 0:
-                     brk_count -= 1
-                     continue
-                 brk = strength_dict.get(element.attrib.get("strength"),
-                                         self.convert_time_string(element.attrib.get("time", "750ms")))
-                 voice_tasks.append({"break": brk})
-
-         for i in voice_tasks:
-             self.logger.debug(i)
-
-         return voice_tasks, format
-
-     def create_ssml_infer_task(self, ssml, fname):
-         voice_tasks, format = self.parse_ssml(ssml)
-
-         audios = []
-         for voice in voice_tasks:
-             if voice.get("break"):
-                 audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
-             else:
-                 model = voice.get("model").upper()
-                 if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
-                     raise ValueError(f"Unsupported model: {voice.get('model')}")
-                 voice_obj = self._voice_obj[model][voice.get("id")][1]
-                 voice["id"] = self._voice_obj[model][voice.get("id")][0]
-                 audio = voice_obj.get_audio(voice)
-                 audios.append(audio)
-
-         audio = np.concatenate(audios, axis=0)
-         encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio, format
-
-     def vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice, auto_break=True)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def stream_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         generator = voice_obj.get_stream_audio(voice, auto_break=True)
-         audio = BytesIO()
-         for chunk in generator:
-             encoded_audio = self.encode(sampling_rate, chunk, format)
-             for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
-                 yield encoded_audio_chunk
-             if config.SAVE_AUDIO:
-                 audio.write(encoded_audio.getvalue())
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(audio.getvalue(), path)
-
-     def hubert_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def w2v2_vits_infer(self, voice, fname):
-         format = voice.get("format", "wav")
-         voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
-         voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-         audio = voice_obj.get_audio(voice, auto_break=True)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def vits_voice_conversion(self, voice, fname):
-         original_id = voice.get("original_id")
-         target_id = voice.get("target_id")
-         format = voice.get("format")
-
-         original_id_obj = int(self._voice_obj["VITS"][original_id][2])
-         target_id_obj = int(self._voice_obj["VITS"][target_id][2])
-
-         if original_id_obj != target_id_obj:
-             raise ValueError("speakers are in different VITS models")
-
-         voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
-         voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
-
-         voice_obj = self._voice_obj["VITS"][original_id][1]
-         sampling_rate = voice_obj.hps_ms.data.sampling_rate
-
-         audio = voice_obj.voice_conversion(voice)
-         encoded_audio = self.encode(sampling_rate, audio, format)
-         if config.SAVE_AUDIO:
-             path = f"{config.CACHE_PATH}/{fname}"
-             utils.save_audio(encoded_audio.getvalue(), path)
-         return encoded_audio
-
-     def get_dimensional_emotion_npy(self, audio):
-         if self.dem is None:
-             raise ValueError("Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
-         audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
-         emotion = self.dem(audio16000, sampling_rate)['hidden_states']
-         emotion_npy = BytesIO()
-         np.save(emotion_npy, emotion.squeeze(0))
-         emotion_npy.seek(0)
-
-         return emotion_npy
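
For reference, a minimal sketch of how the classes in the deleted voice.py were wired together, based only on the code above. The checkpoint/config paths, the speaker-mapping tuples, and the output file name are hypothetical placeholders (not part of this commit), and the voice dict mirrors the keys that get_tasks and vits_infer read.

# Hypothetical usage sketch of the deleted voice.py API; paths and IDs are placeholders.
from voice import vits, TTS

# Load a single VITS model; "model.pth" and "config.json" are assumed example paths.
obj = vits(model="model.pth", config="config.json", model_type="vits")

# TTS expects {model_type: [(real_speaker_id, vits_obj, model_index), ...]}
# plus a matching speaker table, as implied by vits_infer/vits_voice_conversion above.
voice_obj = {"VITS": [(0, obj, 0)], "HUBERT-VITS": [], "W2V2-VITS": []}
voice_speakers = {"VITS": obj.get_speakers(), "HUBERT-VITS": [], "W2V2-VITS": []}

tts = TTS(voice_obj, voice_speakers)
encoded = tts.vits_infer({"text": "Hello", "id": 0, "format": "wav", "lang": "auto"}, "out.wav")
with open("out.wav", "wb") as f:
    f.write(encoded.getvalue())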