CaiRou-Huang committed (verified)
Commit 448c16f · 1 Parent(s): 93c3567

Upload 5 files

common/constants.py ADDED
@@ -0,0 +1,20 @@
+ import enum
+
+ DEFAULT_STYLE: str = "Neutral"
+ DEFAULT_STYLE_WEIGHT: float = 5.0
+
+
+ class Languages(str, enum.Enum):
+     JP = "JP"
+     EN = "EN"
+     ZH = "ZH"
+
+
+ DEFAULT_SDP_RATIO: float = 0.2
+ DEFAULT_NOISE: float = 0.6
+ DEFAULT_NOISEW: float = 0.8
+ DEFAULT_LENGTH: float = 1.0
+ DEFAULT_LINE_SPLIT: bool = True
+ DEFAULT_SPLIT_INTERVAL: float = 0.5
+ DEFAULT_ASSIST_TEXT_WEIGHT: float = 0.7
+ DEFAULT_ASSIST_TEXT_WEIGHT: float = 1.0  # NOTE: redefines the line above, so 1.0 is the effective value
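For context, a minimal sketch of how these defaults and the Languages enum are typically consumed; the build_request helper below is hypothetical and not part of this commit:

# Hypothetical usage sketch (build_request is an assumed caller, not part of this commit).
from common.constants import DEFAULT_SDP_RATIO, DEFAULT_STYLE, DEFAULT_STYLE_WEIGHT, Languages

def build_request(text: str, language: Languages = Languages.JP) -> dict:
    # Collect synthesis parameters, falling back to the shared defaults.
    return {
        "text": text,
        "language": language.value,
        "sdp_ratio": DEFAULT_SDP_RATIO,
        "style": DEFAULT_STYLE,
        "style_weight": DEFAULT_STYLE_WEIGHT,
    }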
common/log.py ADDED
@@ -0,0 +1,16 @@
+ """
+ Logger wrapper
+ """
+ from loguru import logger
+
+ from .stdout_wrapper import SAFE_STDOUT
+
+ # Remove all default handlers
+ logger.remove()
+
+ # Custom format, attached to standard output
+ log_format = (
+     "<g>{time:MM-DD HH:mm:ss}</g> |<lvl>{level:^8}</lvl>| {file}:{line} | {message}"
+ )
+
+ logger.add(SAFE_STDOUT, format=log_format, backtrace=True, diagnose=True)
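The other modules in this commit import this pre-configured logger instead of setting up loguru themselves; a minimal usage sketch:

# Usage sketch (the messages are illustrative, not part of this commit).
from common.log import logger

logger.info("Loading model...")
logger.warning("style_vectors.npy not found, using default styles")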
common/stdout_wrapper.py ADDED
@@ -0,0 +1,34 @@
+ import sys
+ import tempfile
+
+
+ class StdoutWrapper:
+     def __init__(self):
+         self.temp_file = tempfile.NamedTemporaryFile(mode="w+", delete=False)
+         self.original_stdout = sys.stdout
+
+     def write(self, message: str):
+         self.temp_file.write(message)
+         self.temp_file.flush()
+         print(message, end="", file=self.original_stdout)
+
+     def flush(self):
+         self.temp_file.flush()
+
+     def read(self):
+         self.temp_file.seek(0)
+         return self.temp_file.read()
+
+     def close(self):
+         self.temp_file.close()
+
+     def fileno(self):
+         return self.temp_file.fileno()
+
+
+ try:
+     import google.colab
+
+     SAFE_STDOUT = StdoutWrapper()
+ except ImportError:
+     SAFE_STDOUT = sys.stdout
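This wrapper appears to exist because, on Google Colab, sys.stdout is not a regular file object, while code elsewhere in this commit passes SAFE_STDOUT straight to subprocess.run, which needs a real file descriptor via fileno(). A minimal sketch of that pattern under this assumption:

# Usage sketch (the command is illustrative, not part of this commit).
import subprocess
from common.stdout_wrapper import SAFE_STDOUT

# SAFE_STDOUT provides fileno(), so subprocess can write to it like a real file.
subprocess.run(["python", "--version"], stdout=SAFE_STDOUT)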
common/subprocess_utils.py ADDED
@@ -0,0 +1,32 @@
+ import subprocess
+ import sys
+
+ from .log import logger
+ from .stdout_wrapper import SAFE_STDOUT
+
+ python = sys.executable
+
+
+ def run_script_with_log(cmd: list[str], ignore_warning=False) -> tuple[bool, str]:
+     logger.info(f"Running: {' '.join(cmd)}")
+     result = subprocess.run(
+         [python] + cmd,
+         stdout=SAFE_STDOUT,  # type: ignore
+         stderr=subprocess.PIPE,
+         text=True,
+     )
+     if result.returncode != 0:
+         logger.error(f"Error: {' '.join(cmd)}\n{result.stderr}")
+         return False, result.stderr
+     elif result.stderr and not ignore_warning:
+         logger.warning(f"Warning: {' '.join(cmd)}\n{result.stderr}")
+         return True, result.stderr
+     logger.success(f"Success: {' '.join(cmd)}")
+     return True, ""
+
+
+ def second_elem_of(original_function):
+     def inner_function(*args, **kwargs):
+         return original_function(*args, **kwargs)[1]
+
+     return inner_function
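run_script_with_log re-runs a script with the current Python interpreter and returns (success, stderr text); second_elem_of strips the boolean so a caller that only wants the message can reuse the same function. A minimal sketch, where train.py and its arguments are hypothetical:

# Usage sketch (train.py and its arguments are assumed, not part of this commit).
from common.subprocess_utils import run_script_with_log, second_elem_of

success, message = run_script_with_log(["train.py", "--config", "config.json"])

# Wrapped variant that returns only the message, e.g. for a UI callback expecting one string.
run_and_get_message = second_elem_of(run_script_with_log)
message_only = run_and_get_message(["train.py", "--config", "config.json"])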
common/tts_model.py ADDED
@@ -0,0 +1,250 @@
+ import numpy as np
+ import gradio as gr
+ import torch
+ import os
+ import warnings
+ from gradio.processing_utils import convert_to_16_bit_wav
+ from typing import Dict, List, Optional, Union
+
+ import utils
+ from infer import get_net_g, infer
+ from models import SynthesizerTrn
+ from models_jp_extra import SynthesizerTrn as SynthesizerTrnJPExtra
+
+ from .log import logger
+ from .constants import (
+     DEFAULT_ASSIST_TEXT_WEIGHT,
+     DEFAULT_LENGTH,
+     DEFAULT_LINE_SPLIT,
+     DEFAULT_NOISE,
+     DEFAULT_NOISEW,
+     DEFAULT_SDP_RATIO,
+     DEFAULT_SPLIT_INTERVAL,
+     DEFAULT_STYLE,
+     DEFAULT_STYLE_WEIGHT,
+ )
+
+
+ class Model:
+     def __init__(
+         self, model_path: str, config_path: str, style_vec_path: str, device: str
+     ):
+         self.model_path: str = model_path
+         self.config_path: str = config_path
+         self.device: str = device
+         self.style_vec_path: str = style_vec_path
+         self.hps: utils.HParams = utils.get_hparams_from_file(self.config_path)
+         self.spk2id: Dict[str, int] = self.hps.data.spk2id
+         self.id2spk: Dict[int, str] = {v: k for k, v in self.spk2id.items()}
+
+         self.num_styles: int = self.hps.data.num_styles
+         if hasattr(self.hps.data, "style2id"):
+             self.style2id: Dict[str, int] = self.hps.data.style2id
+         else:
+             self.style2id: Dict[str, int] = {str(i): i for i in range(self.num_styles)}
+         if len(self.style2id) != self.num_styles:
+             raise ValueError(
+                 f"Number of styles ({self.num_styles}) does not match the number of style2id ({len(self.style2id)})"
+             )
+
+         self.style_vectors: np.ndarray = np.load(self.style_vec_path)
+         if self.style_vectors.shape[0] != self.num_styles:
+             raise ValueError(
+                 f"The number of styles ({self.num_styles}) does not match the number of style vectors ({self.style_vectors.shape[0]})"
+             )
+
+         self.net_g: Union[SynthesizerTrn, SynthesizerTrnJPExtra, None] = None
+
+     def load_net_g(self):
+         self.net_g = get_net_g(
+             model_path=self.model_path,
+             version=self.hps.version,
+             device=self.device,
+             hps=self.hps,
+         )
+
+     def get_style_vector(self, style_id: int, weight: float = 1.0) -> np.ndarray:
+         mean = self.style_vectors[0]
+         style_vec = self.style_vectors[style_id]
+         style_vec = mean + (style_vec - mean) * weight
+         return style_vec
+
+     def get_style_vector_from_audio(
+         self, audio_path: str, weight: float = 1.0
+     ) -> np.ndarray:
+         from style_gen import get_style_vector
+
+         xvec = get_style_vector(audio_path)
+         mean = self.style_vectors[0]
+         xvec = mean + (xvec - mean) * weight
+         return xvec
+
+     def infer(
+         self,
+         text: str,
+         language: str = "JP",
+         sid: int = 0,
+         reference_audio_path: Optional[str] = None,
+         sdp_ratio: float = DEFAULT_SDP_RATIO,
+         noise: float = DEFAULT_NOISE,
+         noisew: float = DEFAULT_NOISEW,
+         length: float = DEFAULT_LENGTH,
+         line_split: bool = DEFAULT_LINE_SPLIT,
+         split_interval: float = DEFAULT_SPLIT_INTERVAL,
+         assist_text: Optional[str] = None,
+         assist_text_weight: float = DEFAULT_ASSIST_TEXT_WEIGHT,
+         use_assist_text: bool = False,
+         style: str = DEFAULT_STYLE,
+         style_weight: float = DEFAULT_STYLE_WEIGHT,
+         given_tone: Optional[list[int]] = None,
+     ) -> tuple[int, np.ndarray]:
+         logger.info(f"Start generating audio data from text:\n{text}")
+         if language != "JP" and self.hps.version.endswith("JP-Extra"):
+             raise ValueError(
+                 "The model is trained with JP-Extra, but the language is not JP"
+             )
+         if reference_audio_path == "":
+             reference_audio_path = None
+         if assist_text == "" or not use_assist_text:
+             assist_text = None
+
+         if self.net_g is None:
+             self.load_net_g()
+         if reference_audio_path is None:
+             style_id = self.style2id[style]
+             style_vector = self.get_style_vector(style_id, style_weight)
+         else:
+             style_vector = self.get_style_vector_from_audio(
+                 reference_audio_path, style_weight
+             )
+         if not line_split:
+             with torch.no_grad():
+                 audio = infer(
+                     text=text,
+                     sdp_ratio=sdp_ratio,
+                     noise_scale=noise,
+                     noise_scale_w=noisew,
+                     length_scale=length,
+                     sid=sid,
+                     language=language,
+                     hps=self.hps,
+                     net_g=self.net_g,
+                     device=self.device,
+                     assist_text=assist_text,
+                     assist_text_weight=assist_text_weight,
+                     style_vec=style_vector,
+                     given_tone=given_tone,
+                 )
+         else:
+             texts = text.split("\n")
+             texts = [t for t in texts if t != ""]
+             audios = []
+             with torch.no_grad():
+                 for i, t in enumerate(texts):
+                     audios.append(
+                         infer(
+                             text=t,
+                             sdp_ratio=sdp_ratio,
+                             noise_scale=noise,
+                             noise_scale_w=noisew,
+                             length_scale=length,
+                             sid=sid,
+                             language=language,
+                             hps=self.hps,
+                             net_g=self.net_g,
+                             device=self.device,
+                             assist_text=assist_text,
+                             assist_text_weight=assist_text_weight,
+                             style_vec=style_vector,
+                         )
+                     )
+                     if i != len(texts) - 1:
+                         audios.append(np.zeros(int(44100 * split_interval)))  # NOTE: silence length assumes a hard-coded 44100 Hz, not self.hps.data.sampling_rate
+             audio = np.concatenate(audios)
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             audio = convert_to_16_bit_wav(audio)
+         logger.info("Audio data generated successfully")
+         return (self.hps.data.sampling_rate, audio)
+
+
+ class ModelHolder:
+     def __init__(self, root_dir: str, device: str):
+         self.root_dir: str = root_dir
+         self.device: str = device
+         self.model_files_dict: Dict[str, List[str]] = {}
+         self.current_model: Optional[Model] = None
+         self.model_names: List[str] = []
+         self.models: List[Model] = []
+         self.refresh()
+
+     def refresh(self):
+         self.model_files_dict = {}
+         self.model_names = []
+         self.current_model = None
+         model_dirs = [
+             d
+             for d in os.listdir(self.root_dir)
+             if os.path.isdir(os.path.join(self.root_dir, d))
+         ]
+         for model_name in model_dirs:
+             model_dir = os.path.join(self.root_dir, model_name)
+             model_files = [
+                 os.path.join(model_dir, f)
+                 for f in os.listdir(model_dir)
+                 if f.endswith(".pth") or f.endswith(".pt") or f.endswith(".safetensors")
+             ]
+             if len(model_files) == 0:
+                 logger.warning(
+                     f"No model files found in {self.root_dir}/{model_name}, so skip it"
+                 )
+                 continue
+             self.model_files_dict[model_name] = model_files
+             self.model_names.append(model_name)
+
+     def load_model_gr(
+         self, model_name: str, model_path: str
+     ) -> tuple[gr.Dropdown, gr.Button, gr.Dropdown]:
+         if model_name not in self.model_files_dict:
+             raise ValueError(f"Model `{model_name}` is not found")
+         if model_path not in self.model_files_dict[model_name]:
+             raise ValueError(f"Model file `{model_path}` is not found")
+         if (
+             self.current_model is not None
+             and self.current_model.model_path == model_path
+         ):
+             # Already loaded
+             speakers = list(self.current_model.spk2id.keys())
+             styles = list(self.current_model.style2id.keys())
+             return (
+                 gr.Dropdown(choices=styles, value=styles[0]),
+                 gr.Button(interactive=True, value="音声合成"),
+                 gr.Dropdown(choices=speakers, value=speakers[0]),
+             )
+         self.current_model = Model(
+             model_path=model_path,
+             config_path=os.path.join(self.root_dir, model_name, "config.json"),
+             style_vec_path=os.path.join(self.root_dir, model_name, "style_vectors.npy"),
+             device=self.device,
+         )
+         speakers = list(self.current_model.spk2id.keys())
+         styles = list(self.current_model.style2id.keys())
+         return (
+             gr.Dropdown(choices=styles, value=styles[0]),
+             gr.Button(interactive=True, value="音声合成"),
+             gr.Dropdown(choices=speakers, value=speakers[0]),
+         )
+
+     def update_model_files_gr(self, model_name: str) -> gr.Dropdown:
+         model_files = self.model_files_dict[model_name]
+         return gr.Dropdown(choices=model_files, value=model_files[0])
+
+     def update_model_names_gr(self) -> tuple[gr.Dropdown, gr.Dropdown, gr.Button]:
+         self.refresh()
+         initial_model_name = self.model_names[0]
+         initial_model_files = self.model_files_dict[initial_model_name]
+         return (
+             gr.Dropdown(choices=self.model_names, value=initial_model_name),
+             gr.Dropdown(choices=initial_model_files, value=initial_model_files[0]),
+             gr.Button(interactive=False),  # For tts_button
+         )
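Putting the pieces together, a minimal end-to-end sketch; the model_assets directory, device string, and input text are assumptions, not part of this commit:

# Usage sketch (paths, device, and text are assumed, not part of this commit).
from common.tts_model import ModelHolder

holder = ModelHolder(root_dir="model_assets", device="cpu")
model_name = holder.model_names[0]
model_path = holder.model_files_dict[model_name][0]

# load_model_gr also returns Gradio components for the webui; here only its side effect
# of setting holder.current_model is needed.
holder.load_model_gr(model_name, model_path)
sr, audio = holder.current_model.infer("こんにちは。これはテスト音声です。")

Note that get_style_vector blends the mean style vector (index 0) with the chosen style as mean + (style - mean) * weight, so style_weight scales how far the synthesized voice departs from the neutral style.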