sovits-isla / app.py
kwau's picture
Upload 42 files
94d1117
import ast
import base64
import datetime
import glob
import json
import logging
import multiprocessing
import os
import re
import requests
import shutil
import subprocess
import sys
import traceback
import zipfile
from itertools import chain
from pathlib import Path
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
import yaml
from auto_slicer import AutoSlicer
from compress_model import removeOptimizer
from inference.infer_tool_webui import Svc
from onnx_export import main as onnx_export
from sami import SAMIService
from tts_voices import SUPPORTED_LANGUAGES
from utils import mix_model
# Append the bundled ffmpeg "bin" folder (relative to the current working directory) to PATH.
os.environ["PATH"] += os.pathsep + os.path.join(os.getcwd(), "ffmpeg", "bin")
# Raise the threshold of noisy third-party loggers to WARNING.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
# Some directories
workdir = "logs/44k"  # training work dir; also the checkpoint source when the second dir is disabled
second_dir = "models"  # alternative ("second dir") location for models and their configs
diff_second_dir = "models/diffusion"  # diffusion models in second-dir mode
diff_workdir = "logs/44k/diffusion"  # diffusion training work dir
config_dir = "configs/"  # config source when the second dir is disabled
dataset_dir = "dataset/44k"  # preprocessed dataset
raw_path = "dataset_raw"  # raw dataset, one sub-folder per speaker
raw_wavs_path = "raw"
models_backup_path = 'models_backup'  # previous training outputs are moved here before a new run
root_dir = "checkpoints"
default_settings_file = "settings.yaml"  # persisted default settings of the web UI
current_mode = ""  # caption text describing the active directory mode
# Some global variables
debug = False  # when True, inference callbacks print full tracebacks
precheck_ok = False  # set by load_raw_dirs() once the raw dataset passed validation
model = None  # currently loaded Svc instance, or None
sovits_params = {}
diff_params = {}
# Some dicts for mapping
# Speech encoder name -> feature dimension it produces.
MODEL_TYPE = {
    "vec768l12": 768,
    "vec256l9": 256,
    "hubertsoft": 256,
    "whisper-ppg": 1024,
    "cnhubertlarge": 1024,
    "dphubert": 768,
    "wavlmbase+": 768,
    "whisper-ppg-large": 1280
}
# Speech encoder name -> path of the pretrained encoder weights it needs.
ENCODER_PRETRAIN = {
    "vec256l9": "pretrain/checkpoint_best_legacy_500.pt",
    "vec768l12": "pretrain/checkpoint_best_legacy_500.pt",
    "hubertsoft": "pretrain/hubert-soft-0d54a1f4.pt",
    "whisper-ppg": "pretrain/medium.pt",
    "cnhubertlarge": "pretrain/chinese-hubert-large-fairseq-ckpt.pt",
    "dphubert": "pretrain/DPHuBERT-sp0.75.pth",
    "wavlmbase+": "pretrain/WavLM-Base+.pt",
    "whisper-ppg-large": "pretrain/large-v2.pt"
}
class Config:
    """Minimal reader/writer for a single JSON or YAML file.

    ``type`` selects the serializer and must be ``"json"`` or ``"yaml"``.
    With any other value ``read`` returns ``None`` and ``save`` does nothing.
    """

    def __init__(self, path, type):
        self.path = path
        self.type = type

    def read(self):
        """Return the parsed file content, or ``None`` for an unknown type."""
        kind = self.type
        if kind == "json":
            with open(self.path, 'r') as handle:
                return json.load(handle)
        if kind == "yaml":
            with open(self.path, 'r') as handle:
                return yaml.safe_load(handle)
        return None

    def save(self, content):
        """Serialize ``content`` to the file, overwriting it."""
        kind = self.type
        if kind == "json":
            with open(self.path, 'w') as handle:
                json.dump(content, handle, indent=4)
        if kind == "yaml":
            with open(self.path, 'w') as handle:
                # Keep block style and the caller's key order.
                yaml.safe_dump(content, handle, default_flow_style=False, sort_keys=False)
class ReleasePacker:
    """Create and install "release pack" zip archives under ``release_packs``.

    ``speaker`` is a comma-separated list of speaker names and ``model`` the
    checkpoint file name (resolved against ``ckpt_read_dir``). ``file_list``
    holds ``(source_path, path_inside_archive)`` pairs.
    """

    def __init__(self, speaker, model):
        self.speaker = speaker
        self.model = model
        self.output_path = os.path.join("release_packs", f"{speaker}_release.zip")
        self.file_list = []

    def remove_temp(self, path):
        """Delete everything in ``path`` except ``.zip`` files."""
        for filename in os.listdir(path):
            file_path = os.path.join(path, filename)
            if os.path.isfile(file_path) and not filename.endswith(".zip"):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path, ignore_errors=True)

    def add_file(self, file_paths):
        """Append ``(source_path, path_inside_archive)`` pairs to the pack list."""
        self.file_list.extend(file_paths)

    def spk_to_dict(self):
        """Return ``{speaker_name: index}`` built from ``self.speaker``.

        Both the ASCII comma and the full-width comma (U+FF0C) are accepted
        as separators; spaces are removed.
        """
        # The original call replaced ',' with ',' (a no-op); normalizing the
        # full-width comma is what makes names typed with a CJK IME split.
        spk_string = self.speaker.replace('\uff0c', ',').replace(' ', '')
        names = spk_string.split(',')
        return {name: index for index, name in enumerate(names)}

    def generate_config(self, diff_model, config_origin):
        """Fill the release config templates from the model and write them back.

        Reads the checkpoint to derive speaker count and feature width, aligns
        the speaker dict with the embedding size, copies the speech encoder from
        ``config_origin`` when it is compatible with the model width, and writes
        ``release_packs/install.txt`` plus both templates.

        Raises:
            Exception: if the encoder in ``config_origin`` does not match the
                model's feature width.
        """
        _config_origin = Config(os.path.join(config_read_dir, config_origin), "json")
        _template = Config("release_packs/config_template.json", "json")
        _d_template = Config("release_packs/diffusion_template.yaml", "yaml")
        orig_config = _config_origin.read()
        config_template = _template.read()
        diff_config_template = _d_template.read()
        spk_dict = self.spk_to_dict()
        # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
        _net = torch.load(os.path.join(ckpt_read_dir, self.model), map_location='cpu')
        emb_dim, model_dim = _net['model'].get('emb_g.weight', torch.empty(0, 0)).size()
        vol_emb = _net['model'].get('emb_vol.weight')
        if vol_emb is not None:
            config_template["train"]["vol_aug"] = config_template["model"]["vol_embedding"] = True
        # Keep the spk_dict length same as emb_dim
        if emb_dim > len(spk_dict):
            for i in range(emb_dim - len(spk_dict)):
                spk_dict[f"spk{i}"] = len(spk_dict)
        if emb_dim < len(spk_dict):
            for _ in range(len(spk_dict) - emb_dim):
                spk_dict.popitem()
        self.speaker = ','.join(spk_dict.keys())
        config_template['model']['ssl_dim'] = config_template["model"]["filter_channels"] = config_template["model"]["gin_channels"] = model_dim
        config_template['model']['n_speakers'] = diff_config_template['model']['n_spk'] = emb_dim
        config_template['spk'] = diff_config_template['spk'] = spk_dict
        encoder = [k for k, v in MODEL_TYPE.items() if v == model_dim]
        if orig_config['model']['speech_encoder'] in encoder:
            config_template['model']['speech_encoder'] = orig_config['model']['speech_encoder']
        else:
            raise Exception("Config is not compatible with the model")
        if diff_model != "no_diff":
            _diff = torch.load(os.path.join(diff_read_dir, diff_model), map_location='cpu')
            _, diff_dim = _diff["model"].get("unit_embed.weight", torch.empty(0, 0)).size()
            # Unit-embedding width -> encoder written into the diffusion config.
            if diff_dim == 256:
                diff_config_template['data']['encoder'] = 'hubertsoft'
                diff_config_template['data']['encoder_out_channels'] = 256
            elif diff_dim == 768:
                diff_config_template['data']['encoder'] = 'vec768l12'
                diff_config_template['data']['encoder_out_channels'] = 768
            elif diff_dim == 1024:
                diff_config_template['data']['encoder'] = 'whisper-ppg'
                diff_config_template['data']['encoder_out_channels'] = 1024
        # install.txt format: "<repr of file_list>#<speaker string>"
        with open("release_packs/install.txt", 'w') as f:
            f.write(str(self.file_list) + '#' + str(self.speaker))
        _template.save(config_template)
        _d_template.save(diff_config_template)

    def unpack(self, zip_file):
        """Extract ``zip_file`` into ``release_packs``."""
        with zipfile.ZipFile(zip_file, 'r') as zipf:
            zipf.extractall("release_packs")

    def formatted_install(self, install_txt):
        """Install an unpacked release according to ``install_txt``.

        Moves every listed file from ``release_packs`` to its target path,
        cleans the temporary files and returns the speaker string.
        """
        with open(install_txt, 'r') as f:
            content = f.read()
        file_list, speaker = content.split('#')
        self.speaker = speaker
        file_list = ast.literal_eval(file_list)
        self.file_list = file_list
        # NOTE(review): target paths come from the archive itself and are used
        # unchecked as move destinations; only install packs from trusted sources.
        for _, target_path in self.file_list:
            if target_path != "install.txt" and target_path != "":
                shutil.move(os.path.join("release_packs", target_path), target_path)
        self.remove_temp("release_packs")
        return self.speaker

    def pack(self):
        """Write every existing file of ``file_list`` into the output zip."""
        with zipfile.ZipFile(self.output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file_path, target_path in self.file_list:
                if os.path.isfile(file_path):
                    zipf.write(file_path, arcname=target_path)
def debug_change():
    """Mirror the current value of the debug checkbox into the global flag."""
    global debug
    enabled = debug_button.value
    debug = enabled
def get_default_settings():
    """Load the default settings file into the module globals.

    Returns:
        tuple: ``(sovits_params, diff_params, second_dir_enable)``.
    """
    global sovits_params, diff_params, second_dir_enable
    settings = Config(default_settings_file, "yaml").read()
    sovits_params = settings['sovits_params']
    diff_params = settings['diff_params']
    webui_settings = settings['webui_settings']
    second_dir_enable = webui_settings['second_dir']
    return sovits_params, diff_params, second_dir_enable
def webui_change(read_second_dir):
    """Switch the second-directory mode and persist it in the settings file."""
    global second_dir_enable
    settings_file = Config(default_settings_file, "yaml")
    settings = settings_file.read()
    second_dir_enable = read_second_dir
    settings['webui_settings']['second_dir'] = read_second_dir
    settings_file.save(settings)
def get_current_mode():
    """Update and return the caption describing the active directory mode."""
    global current_mode
    if second_dir_enable:
        current_mode = "当前模式:独立目录模式,将从'./models/'读取模型文件"
    else:
        current_mode = "当前模式:工作目录模式,将从'./logs/44k'读取模型文件"
    return current_mode
def save_default_settings(log_interval,eval_interval,keep_ckpts,batch_size,learning_rate,amp_dtype,all_in_mem,num_workers,cache_all_data,cache_device,diff_amp_dtype,diff_batch_size,diff_lr,diff_interval_log,diff_interval_val,diff_force_save,diff_k_step_max):
    """Write the given training defaults into the settings file.

    Values are coerced to the stored type (int/float/str) where the table
    below names a converter; the others are stored unchanged.
    """
    settings_file = Config(default_settings_file, "yaml")
    default_settings = settings_file.read()
    # (section, key, converter or None, value) - applied strictly in this order.
    updates = (
        ('sovits_params', 'log_interval', int, log_interval),
        ('sovits_params', 'eval_interval', int, eval_interval),
        ('sovits_params', 'keep_ckpts', int, keep_ckpts),
        ('sovits_params', 'batch_size', int, batch_size),
        ('sovits_params', 'learning_rate', float, learning_rate),
        ('sovits_params', 'amp_dtype', str, amp_dtype),
        ('sovits_params', 'all_in_mem', None, all_in_mem),
        ('diff_params', 'num_workers', int, num_workers),
        ('diff_params', 'cache_all_data', None, cache_all_data),
        ('diff_params', 'cache_device', str, cache_device),
        ('diff_params', 'amp_dtype', str, diff_amp_dtype),
        ('diff_params', 'diff_batch_size', int, diff_batch_size),
        ('diff_params', 'diff_lr', float, diff_lr),
        ('diff_params', 'diff_interval_log', int, diff_interval_log),
        ('diff_params', 'diff_interval_val', int, diff_interval_val),
        ('diff_params', 'diff_force_save', int, diff_force_save),
        ('diff_params', 'diff_k_step_max', None, diff_k_step_max),
    )
    for section, key, convert, value in updates:
        default_settings[section][key] = value if convert is None else convert(value)
    settings_file.save(default_settings)
    return "成功保存默认配置"
def get_model_info(choice_ckpt):
    """Describe which speech encoder(s) a checkpoint is compatible with.

    The checkpoint is loaded on the CPU and the width of ``emb_g.weight`` is
    matched against ``MODEL_TYPE``.

    Args:
        choice_ckpt: checkpoint file name inside ``ckpt_read_dir``.

    Returns:
        str: an encoder name, a combined label when several encoders share the
        width, or a message when the checkpoint has no speaker embedding or
        its width matches no known encoder.
    """
    pthfile = os.path.join(ckpt_read_dir, choice_ckpt)
    # CPU load to avoid using GPU memory.
    # NOTE(review): torch.load unpickles the file; only open trusted checkpoints.
    net = torch.load(pthfile, map_location=torch.device('cpu'))
    spk_emb = net["model"].get("emb_g.weight")
    if spk_emb is None:
        return "所选模型缺少emb_g.weight,你可能选择了一个底模"
    _layer = spk_emb.size(1)
    # Map the embedding width back to the encoders producing that width.
    encoder = sorted(k for k, v in MODEL_TYPE.items() if v == _layer)
    if not encoder:
        # Previously this fell through to encoder[0] and raised IndexError.
        return f"未知的编码器维度: {_layer}"
    if encoder == ["hubertsoft", "vec256l9"]:
        return "vec256l9 / hubertsoft"
    if encoder == ["cnhubertlarge", "whisper-ppg"]:
        return "whisper-ppg / cnhubertlarge"
    if encoder == ["dphubert", "vec768l12", "wavlmbase+"]:
        return "vec768l12 / dphubert / wavlmbase+"
    return encoder[0]
def load_json_encoder(config_choice, choice_ckpt):
    """Return the speech encoder named by a model config, or a status message.

    Legacy configs without ``speech_encoder`` are upgraded in place: the
    encoder is derived from ``ssl_dim`` and the file is saved back.
    """
    if config_choice == "no_config":
        return "未启用自动加载,请手动选择配置文件"
    if choice_ckpt == "no_model":
        return "请先选择模型"
    config_file = Config(os.path.join(config_read_dir, config_choice), "json")
    config = config_file.read()
    try:
        # Compare the dimension stored in the config with the real dimension
        # of its encoder, so a mismatched pair is rejected before inference.
        model_section = config["model"]
        config_encoder = model_section.get("speech_encoder", "no_encoder")
        config_dim = model_section["ssl_dim"]
        # Automatic matching for old-style config files.
        if config_encoder == "no_encoder":
            config_encoder = "vec256l9" if config_dim == 256 else "vec768l12"
            config["model"]["speech_encoder"] = config_encoder
            config_file.save(config)
        expected_dim = MODEL_TYPE.get(config_encoder, "unknown")
        if config_dim != expected_dim:
            return "配置文件中的编码器与模型维度不匹配"
        return config_encoder
    except Exception as e:
        return f"出错了: {e}"
def auto_load(choice_ckpt):
    """Look up the config sharing the checkpoint's base name (second-dir mode).

    Returns:
        tuple: ``(model_info_message, config_file_or_"no_config", encoder_message)``.
    """
    model_output_msg = get_model_info(choice_ckpt)
    json_output_msg = config_choice = ""
    ckpt_stem = os.path.splitext(choice_ckpt)[0]
    if not second_dir_enable:
        return model_output_msg, "no_config", ""
    json_names = [name for name in os.listdir(second_dir) if name.endswith(".json")]
    for candidate in json_names:
        if os.path.splitext(candidate)[0] == ckpt_stem:
            config_choice = candidate
            json_output_msg = load_json_encoder(candidate, choice_ckpt)
    if json_output_msg == "":
        return model_output_msg, "no_config", ""
    return model_output_msg, config_choice, json_output_msg
def auto_load_diff(diff_model):
    """Return the YAML config sharing the diffusion model's base name.

    Only active in second-dir mode; otherwise, or when nothing matches,
    ``"no_diff_config"`` is returned.
    """
    if second_dir_enable is False:
        return "no_diff_config"
    yaml_names = [name for name in os.listdir(second_dir) if name.endswith(".yaml")]
    return next(
        (
            name
            for name in yaml_names
            if os.path.splitext(name)[0] == os.path.splitext(diff_model)[0]
        ),
        "no_diff_config",
    )
def load_model_func(ckpt_name,cluster_name,config_name,enhance,diff_model_name,diff_config_name,only_diffusion,use_spk_mix,using_device,method,speedup,cl_num):
    """Validate the selected files and load them into the global ``model``.

    Returns:
        tuple: ``(status message, speaker dropdown update, slice length,
        k-step slider update)``. On a validation failure the dropdown is
        emptied, the slice length is ``0`` and the slider update is ``None``.
    """
    global model

    def _failure(message):
        # Common shape of every early error return.
        return message, gr.Dropdown.update(choices=[], value=""), 0, None

    # Diffusion-only inference takes its speakers from the diffusion config;
    # without a diffusion model the code below hit a NameError on diff_spk.
    if only_diffusion and diff_model_name == "no_diff":
        return _failure("全扩散推理需要先选择扩散模型")

    config_path = os.path.join(config_read_dir, config_name) if not only_diffusion else "configs/config.json"
    diff_config_path = os.path.join(config_read_dir, diff_config_name) if diff_config_name != "no_diff_config" else "configs/diffusion.yaml"
    ckpt_path = os.path.join(ckpt_read_dir, ckpt_name)
    cluster_path = os.path.join(ckpt_read_dir, cluster_name)
    diff_model_path = os.path.join(diff_read_dir, diff_model_name)
    k_step_max = 1000
    if not only_diffusion:
        config = Config(config_path, "json").read()
    if diff_model_name != "no_diff":
        _diff = Config(diff_config_path, "yaml")
        _content = _diff.read()
        diff_spk = _content.get('spk', {})
        diff_spk_choice = spk_choice = next(iter(diff_spk), "未检测到音色")
        if not only_diffusion:
            if _content['data'].get('encoder_out_channels') != config["model"].get('ssl_dim'):
                return _failure("扩散模型维度与主模型不匹配,请确保两个模型使用的是同一个编码器")
        # Persist the sampler settings chosen in the UI into the diffusion config.
        _content["infer"]["speedup"] = int(speedup)
        _content["infer"]["method"] = str(method)
        k_step_max = _content["model"].get('k_step_max', 0) if _content["model"].get('k_step_max', 0) != 0 else 1000
        _diff.save(_content)
    if not only_diffusion:
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        net = torch.load(ckpt_path, map_location=torch.device('cpu'))
        # Read the model dimensions and compare them with the config. A base
        # model without emb_g.weight yields an empty tensor, i.e. sizes (0, 0).
        emb_dim, model_dim = net["model"].get("emb_g.weight", torch.empty(0, 0)).size()
        if emb_dim > config["model"]["n_speakers"]:
            return _failure("模型说话人数量与emb维度不匹配")
        if model_dim != config["model"]["ssl_dim"]:
            return _failure("配置文件与模型不匹配")
        encoder = config["model"]["speech_encoder"]
        spk_dict = config.get('spk', {})
        spk_choice = next(iter(spk_dict), "未检测到音色")
    else:
        spk_dict = diff_spk
        spk_choice = diff_spk_choice
    fr = cluster_name.endswith(".pkl")  # a .pkl file enables feature retrieval
    shallow_diffusion = diff_model_name != "no_diff"  # a loaded diffusion model enables shallow diffusion
    device = cuda[using_device] if "CUDA" in using_device else using_device
    model = Svc(ckpt_path,
                config_path,
                device=device if device != "Auto" else None,
                cluster_model_path=cluster_path,
                nsf_hifigan_enhance=enhance,
                diffusion_model_path=diff_model_path,
                diffusion_config_path=diff_config_path,
                shallow_diffusion=shallow_diffusion,
                only_diffusion=only_diffusion,
                spk_mix_enable=use_spk_mix,
                feature_retrieval=fr)
    spk_list = list(spk_dict.keys())
    if not only_diffusion:
        # Whisper encoders must be sliced at a forced 25 seconds.
        clip = 25 if encoder == "whisper-ppg" or encoder == "whisper-ppg-large" else cl_num
        device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
        sovits_msg = f"模型被成功加载到了{device_name}上\n"
    else:
        clip = cl_num
        sovits_msg = "启用全扩散推理,未加载So-VITS模型\n"
    index_or_kmeans = "特征索引" if fr else "聚类模型"
    clu_load = "未加载" if cluster_name == "no_clu" else cluster_name
    diff_load = "未加载" if diff_model_name == "no_diff" else f"{diff_model_name} | 采样器: {method} | 加速倍数:{int(speedup)} | 最大浅扩散步数:{k_step_max}"
    output_msg = f"{sovits_msg}{index_or_kmeans}:{clu_load}\n扩散模型:{diff_load}"
    return (
        output_msg,
        gr.Dropdown.update(choices=spk_list, value=spk_choice),
        clip,
        gr.Slider.update(value=100 if k_step_max>100 else k_step_max, minimum=speedup, maximum=k_step_max)
    )
def model_empty_cache():
    """Unload the current model (if any) and clear the speaker dropdown.

    Returns:
        tuple: ``(speaker dropdown update, status message)``.
    """
    global model
    if model is not None:
        model.unload_model()
        model = None
        torch.cuda.empty_cache()
        return sid.update(choices = [],value=""),"模型卸载完毕!"
    return sid.update(choices = [],value=""),"没有模型需要卸载!"
def get_file_options(directory, extension):
    """List the entries of ``directory`` whose name ends with ``extension``."""
    matches = []
    for entry in os.listdir(directory):
        if entry.endswith(extension):
            matches.append(entry)
    return matches
def load_options():
    """Collect the dropdown choices from the currently active read directories.

    Returns:
        tuple: ``(ckpt_list, config_list, cluster_list, diff_list, diff_config_list)``.
    """
    ckpt_list = []
    for name in get_file_options(ckpt_read_dir, ".pth"):
        # Skip files named "D_*".
        if not name.startswith("D_") or name == "G_0.pth":
            ckpt_list.append(name)
    config_list = get_file_options(config_read_dir, ".json")
    # Cluster (.pt) and feature-retrieval (.pkl) models share one dropdown.
    cluster_list = ["no_clu"]
    cluster_list = cluster_list + get_file_options(ckpt_read_dir, ".pt")
    cluster_list = cluster_list + get_file_options(ckpt_read_dir, ".pkl")
    diff_list = ["no_diff"] + get_file_options(diff_read_dir, ".pt")
    diff_config_list = ["no_diff_config"] + get_file_options(config_read_dir, ".yaml")
    return ckpt_list, config_list, cluster_list, diff_list, diff_config_list
def refresh_options():
    """Re-resolve the read directories for the active mode and refresh the UI.

    Returns:
        tuple: updates for the checkpoint, config, cluster, diffusion model and
        diffusion config dropdowns, followed by the mode caption.
    """
    global ckpt_read_dir, config_read_dir, diff_read_dir, current_mode
    if second_dir_enable:
        ckpt_read_dir = second_dir
        config_read_dir = second_dir
        diff_read_dir = diff_second_dir
    else:
        ckpt_read_dir = workdir
        config_read_dir = config_dir
        diff_read_dir = diff_workdir
    ckpts, configs, clusters, diffs, diff_configs = load_options()
    current_mode = get_current_mode()
    return (
        choice_ckpt.update(choices=ckpts),
        config_choice.update(choices=configs),
        cluster_choice.update(choices=clusters),
        diff_choice.update(choices=diffs),
        diff_config_choice.update(choices=diff_configs),
        mode_caption.update(value=f"""{current_mode},可在页面底端切换模式""")
    )
def source_change(use_microphone):
    """Switch the audio input component between microphone and file upload."""
    chosen_source = "microphone" if use_microphone else "upload"
    return vc_input3.update(source=chosen_source)
def vc_infer(output_format, sid, input_audio, sr, input_audio_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Convert one audio array with the loaded model and save it under ``results``.

    The input is normalized to mono float audio at 44100 Hz, written to
    ``temp.wav`` and passed to ``model.slice_inference``.

    Returns:
        str: path of the written result file.
    """
    # Integer PCM -> float32, scaled by the dtype's maximum value.
    if np.issubdtype(input_audio.dtype, np.integer):
        peak = np.iinfo(input_audio.dtype).max
        input_audio = (input_audio / peak).astype(np.float32)
    if len(input_audio.shape) > 1:
        input_audio = librosa.to_mono(input_audio.transpose(1, 0))
    if sr != 44100:
        input_audio = librosa.resample(input_audio, orig_sr=sr, target_sr=44100)
    sf.write("temp.wav", input_audio, 44100, format="wav")
    _audio = model.slice_inference(
        "temp.wav",
        sid,
        vc_transform,
        slice_db,
        cluster_ratio,
        auto_f0,
        noise_scale,
        pad_seconds,
        cl_num,
        lg_num,
        lgr_num,
        f0_predictor,
        enhancer_adaptive_key,
        cr_threshold,
        k_step,
        use_spk_mix,
        second_encoding,
        loudness_envelope_adjustment
    )
    model.clear_empty()
    if not os.path.exists("results"):
        os.makedirs("results")
    # Name parts describing the conversion settings.
    key = "auto" if auto_f0 else f"{int(vc_transform)}key"
    cluster = "_" if cluster_ratio == 0 else f"_{cluster_ratio}_"
    isdiffusion = "sovits_"
    if model.shallow_diffusion:
        isdiffusion = "sovdiff_"
    if model.only_diffusion:
        isdiffusion = "diff_"
    # Paths of files uploaded through Gradio carry a fixed, meaningless
    # "-0-100" suffix; strip it from the base name.
    stem = Path(input_audio_path).stem
    truncated_basename = stem[:-6] if stem[-6:] == "-0-100" else stem
    name_prefix = f'{truncated_basename}_{sid}_{key}{cluster}{isdiffusion}{f0_predictor}'
    output_file_path = os.path.join("results", f'{name_prefix}.{output_format}')
    # Append a running counter until the name is free.
    count = 1
    while os.path.exists(output_file_path):
        output_file_path = os.path.join("results", f'{name_prefix}_{count}.{output_format}')
        count += 1
    sf.write(output_file_path, _audio, model.target_sample, format=output_format)
    return output_file_path
def vc_fn(output_format, sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment, progress=gr.Progress(track_tqdm=True)):
    """Gradio callback: convert a single uploaded audio file.

    Returns:
        tuple: ``(status message, output file path or None)``.

    Raises:
        gr.Error: wrapping any exception raised during inference.
    """
    global model
    try:
        if input_audio is None:
            return "你还没有上传音频", None
        if model is None:
            return "你还没有加载模型", None
        # A cluster ratio only makes sense with a cluster / retrieval model loaded.
        no_cluster_model = getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False
        if no_cluster_model and cluster_ratio != 0:
            return "你还未加载聚类或特征检索模型,无法启用聚类/特征检索混合比例", None
        audio, sr = sf.read(input_audio)
        result_path = vc_infer(output_format, sid, audio, sr, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        os.remove("temp.wav")
        return "Success", result_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def vc_batch_fn(output_format, sid, input_audio_files, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment, progress=gr.Progress()):
    """Gradio callback: convert every uploaded file, saving results to ``results``.

    Returns:
        str: a status message. Every path returns one string, so the callback
        always supplies the same number of outputs.

    Raises:
        gr.Error: wrapping any exception raised during inference.
    """
    global model
    try:
        if input_audio_files is None or len(input_audio_files) == 0:
            return "你还没有上传音频"
        if model is None:
            return "你还没有加载模型"
        if getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False:
            if cluster_ratio != 0:
                # This exit used to return a (message, None) tuple, unlike the others.
                return "你还未加载聚类或特征检索模型,无法启用聚类/特征检索混合比例"
        for file_obj in progress.tqdm(input_audio_files, desc="Inferencing"):
            print(f"Start processing: {file_obj.name}")
            input_audio_path = file_obj.name
            audio, sr = sf.read(input_audio_path)
            vc_infer(output_format, sid, audio, sr, input_audio_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "批量推理完成,音频已经被保存到results文件夹"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def tts_fn(_text, _gender, _lang, _rate, _volume, output_format, sid, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold, k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment,progress=gr.Progress(track_tqdm=True)):
    """Gradio callback: synthesize ``_text`` with ``tts.py``, then convert it.

    The synthesized ``tts.wav`` is resampled to 44100 Hz (16-bit PCM) before
    being passed to ``vc_infer``.

    Returns:
        tuple: ``(status message, output file path or None)``.

    Raises:
        gr.Error: wrapping any exception raised along the way.
    """
    global model

    def _signed_percent(ratio):
        # Non-negative ratios get an explicit "+" sign.
        percent = int(ratio*100)
        return f"+{percent}%" if ratio >= 0 else f"{percent}%"

    try:
        if model is None:
            return "你还没有加载模型", None
        no_cluster_model = getattr(model, 'cluster_model', None) is None and model.feature_retrieval is False
        if no_cluster_model and cluster_ratio != 0:
            return "你还未加载聚类或特征检索模型,无法启用聚类/特征检索混合比例", None
        _rate = _signed_percent(_rate)
        _volume = _signed_percent(_volume)
        tts_command = [r".\workenv\python.exe", "tts.py", _text, _lang, _rate, _volume]
        if _lang == "Auto":
            # Only the "Auto" language passes a gender on to tts.py.
            _gender = "Male" if _gender == "男" else "Female"
            tts_command.append(_gender)
        subprocess.run(tts_command)
        target_sr = 44100
        y, sr = librosa.load("tts.wav")
        resampled_y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sf.write("tts.wav", resampled_y, target_sr, subtype = "PCM_16")
        input_audio = "tts.wav"
        audio, sr = sf.read(input_audio)
        result_path = vc_infer(output_format, sid, audio, sr, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "Success", result_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def load_raw_dirs():
    """Validate the raw dataset folder and list its speaker directories.

    Every directory and file name below ``raw_path`` must match the allowed
    character set, and every file inside a sub-folder must be a ``.wav``.
    ``precheck_ok`` is set to ``True`` only when validation passes and at
    least one speaker directory exists.

    Returns:
        list[str] | str: the speaker directory names on success, otherwise a
        message describing the problem.
    """
    global precheck_ok
    precheck_ok = False
    allowed_pattern = re.compile(r'^[a-zA-Z0-9_@#$%^&()_+\-=\s\.]*$')
    # Two independent lists. They used to alias one list object, so a bad
    # file name was reported as a bad folder name on the next walk step.
    illegal_files = []
    illegal_dataset = []
    for root, dirs, files in os.walk(raw_path):
        for dir_name in dirs:
            if not allowed_pattern.match(dir_name):
                illegal_dataset.append(dir_name)
        if illegal_dataset:
            return f"数据集文件夹名只能包含数字、字母、下划线,以下文件夹不符合要求,请改名后再试:\n{illegal_dataset}"
        if root != raw_path:  # only files inside sub-folders are checked
            for file_name in files:
                if not allowed_pattern.match(file_name) and file_name not in illegal_files:
                    illegal_files.append(file_name)
                if not file_name.lower().endswith('.wav') and file_name not in illegal_files:
                    illegal_files.append(file_name)
    if illegal_files:
        return f"数据集文件名只能包含数字、字母、下划线,且必须是.wav格式,以下文件不符合要求,请改名后再试:\n{illegal_files}"
    spk_dirs = [entry.name for entry in os.scandir(raw_path) if entry.is_dir()]
    if spk_dirs:
        precheck_ok = True
        return spk_dirs
    else:
        return "未找到数据集,请检查dataset_raw文件夹"
def dataset_preprocess(encoder, f0_predictor, use_diff, vol_aug, skip_loudnorm, num_processes, tiny_enable):
    """Run the three preprocessing scripts, streaming their output to the UI.

    Generator yielding ``(log text, speaker textbox update or None)``. Existing
    speaker folders in ``dataset_dir`` are deleted first. Lines that look like
    progress bars are shown only as the latest line instead of being
    accumulated. The final item carries the speaker mapping read from
    ``configs/config.json``.
    """
    if not precheck_ok:
        yield "数据集识别未通过,请先识别数据集并确保没有报错信息", None
        return
    diff_arg = "--use_diff" if use_diff else ""
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    skip_loudnorm_arg = "--skip_loudnorm" if skip_loudnorm else ""
    tiny_arg = "--tiny" if tiny_enable else ""
    preprocess_commands = [
        r".\workenv\python.exe resample.py %s" % (skip_loudnorm_arg),
        r".\workenv\python.exe preprocess_flist_config.py --speech_encoder %s %s %s" % (encoder, vol_aug_arg, tiny_arg),
        r".\workenv\python.exe preprocess_hubert_f0.py --num_processes %s --f0_predictor %s %s" % (num_processes ,f0_predictor, diff_arg)
    ]
    accumulated_output = ""
    # Clear the previously preprocessed dataset.
    for entry in os.listdir(dataset_dir):
        dataset_spk_dir = os.path.join(dataset_dir, str(entry))
        if os.path.isdir(dataset_spk_dir):
            shutil.rmtree(dataset_spk_dir)
            accumulated_output += f"Deleting previous dataset: {entry}\n"
    for command in preprocess_commands:
        try:
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True)
            accumulated_output += f"Command: {command}, Using Encoder: {encoder}, Using f0 Predictor: {f0_predictor}\n"
            yield accumulated_output, None
            progress_line = None
            for line in process.stdout:
                # Keep progress-bar lines from flooding the log.
                if r"it/s" in line or r"s/it" in line:
                    progress_line = line
                else:
                    accumulated_output += line
                if progress_line is None:
                    yield accumulated_output, None
                else:
                    yield accumulated_output + progress_line, None
            process.communicate()
        except subprocess.CalledProcessError as e:
            failure_output = e.output
            accumulated_output += f"Error: {failure_output}\n"
            yield accumulated_output, None
        if progress_line is not None:
            accumulated_output += progress_line
        accumulated_output += '-' * 50 + '\n'
        yield accumulated_output, None
    config_path = "configs/config.json"
    with open(config_path, 'r') as f:
        config = json.load(f)
    spk_name = config.get('spk', None)
    yield accumulated_output, gr.Textbox.update(value=spk_name)
def regenerate_config(encoder, vol_aug, tiny_enable):
    """Re-run ``preprocess_flist_config.py`` and return its captured output."""
    if precheck_ok is False:
        return "数据集识别未通过,请检查识别结果的报错信息"
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    tiny_arg = "--tiny" if tiny_enable else ""
    cmd = r".\workenv\python.exe preprocess_flist_config.py --speech_encoder %s %s %s" % (encoder, vol_aug_arg, tiny_arg)
    output = ""
    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True)
        # stderr is merged into stdout, so one stream carries the whole log.
        for line in process.stdout:
            output += line
        output += "Regenerate config file successfully."
    except subprocess.CalledProcessError as e:
        failure_output = e.output
        output += f"Error: {failure_output}\n"
    return output
def clear_output():
    """Return a textbox update that replaces the content with a placeholder."""
    placeholder = "Cleared!>_<"
    return gr.Textbox.update(value=placeholder)
def get_available_encoder():
    """List the encoders whose pretrained weights exist in ``pretrain``.

    Encoders are returned in ``ENCODER_PRETRAIN`` order.
    """
    present = ["pretrain/" + name for name in os.listdir("pretrain")]
    return [encoder for encoder, path in ENCODER_PRETRAIN.items() if path in present]
def config_fn(log_interval, eval_interval, keep_ckpts, batch_size, lr, amp_dtype, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save, diff_k_step_max):
    """Write the training parameters into the So-VITS and diffusion configs.

    Half precision is enabled only for ``"fp16"`` / ``"bf16"``; for any other
    ``amp_dtype`` it is disabled and ``half_type`` is stored as ``"fp16"``.
    """
    fp16_run = amp_dtype == "fp16" or amp_dtype == "bf16"
    if not fp16_run:
        amp_dtype = "fp16"
    config_origin = Config("configs/config.json", "json")
    diff_config = Config("configs/diffusion.yaml", "yaml")

    config_data = config_origin.read()
    # (key, converter or None, value) - applied strictly in this order.
    sovits_updates = (
        ('log_interval', int, log_interval),
        ('eval_interval', int, eval_interval),
        ('keep_ckpts', int, keep_ckpts),
        ('batch_size', int, batch_size),
        ('learning_rate', float, lr),
        ('fp16_run', None, fp16_run),
        ('half_type', str, amp_dtype),
        ('all_in_mem', None, all_in_mem),
    )
    for key, convert, value in sovits_updates:
        config_data['train'][key] = value if convert is None else convert(value)
    config_origin.save(config_data)

    diff_config_data = diff_config.read()
    diff_updates = (
        ('num_workers', int, diff_num_workers),
        ('cache_all_data', None, diff_cache_all_data),
        ('batch_size', int, diff_batch_size),
        ('lr', float, diff_lr),
        ('interval_log', int, diff_interval_log),
        ('interval_val', int, diff_interval_val),
        ('cache_device', str, diff_cache_device),
        ('amp_dtype', str, diff_amp_dtype),
        ('interval_force_save', int, diff_force_save),
    )
    for key, convert, value in diff_updates:
        diff_config_data['train'][key] = value if convert is None else convert(value)
    # The checkbox maps to a 100-step shallow diffusion; 0 stores "not limited".
    diff_config_data['model']['k_step_max'] = 100 if diff_k_step_max else 0
    diff_config.save(diff_config_data)
    return "配置文件写入完成"
def check_dataset(dataset_path):
    """Return a warning message if the dataset looks unusable, else ``None``.

    The dataset must be non-empty and contain at least one ``.npy`` or ``.pt``
    file somewhere below ``dataset_path``.
    """
    if not os.listdir(dataset_path):
        return "数据集不存在,请检查dataset文件夹"
    has_features = any(
        name.endswith('.npy') or name.endswith('.pt')
        for _root, _dirs, names in os.walk(dataset_path)
        for name in names
    )
    if has_features:
        return None
    return "数据集中未检测到f0和hubert文件,可能是预处理未完成"
def training(gpu_selection, encoder, tiny_enable):
    """Start a fresh So-VITS training run in a new terminal window.

    Previous contents of the work dir (except ``diffusion``) are moved to a
    timestamped backup folder, the pretrained ``D_0.pth`` / ``G_0.pth`` of the
    encoder are copied in when available, then ``train.py`` is launched.

    Returns:
        str: status or warning message.
    """
    if tiny_enable:
        encoder = "vec768l12_tiny"
    config_data = Config("configs/config.json", "json").read()
    vol_emb = config_data["model"]["vol_embedding"]
    dataset_warn = check_dataset(dataset_dir)
    if dataset_warn is not None:
        return dataset_warn
    # Encoder -> folder holding its pretrained D_0.pth / G_0.pth pair.
    pretrain_dirs = {
        "vec256l9": "pre_trained_model",
        "vec768l12": "pre_trained_model/768l12/vol_emb" if vol_emb else "pre_trained_model/768l12",
        "vec768l12_tiny": "pre_trained_model/tiny/vec768l12_vol_emb",
        "hubertsoft": "pre_trained_model/hubertsoft",
        "whisper-ppg": "pre_trained_model/whisper-ppg",
        "cnhubertlarge": "pre_trained_model/cnhubertlarge",
        "dphubert": "pre_trained_model/dphubert",
        "wavlmbase+": "pre_trained_model/wavlmbase+",
        "whisper-ppg-large": "pre_trained_model/whisper-ppg-large"
    }
    if encoder not in pretrain_dirs:
        return "未知编码器"
    encoder_model_path = pretrain_dirs[encoder]
    d_0_path = os.path.join(encoder_model_path, "D_0.pth")
    g_0_path = os.path.join(encoder_model_path, "G_0.pth")
    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    new_backup_folder = os.path.join(models_backup_path, str(timestamp))
    output_msg = ""
    # Back up whatever a previous run left behind, keeping the diffusion folder.
    if os.listdir(workdir) != ['diffusion']:
        os.makedirs(new_backup_folder, exist_ok=True)
        for entry in os.listdir(workdir):
            if entry != "diffusion":
                shutil.move(os.path.join(workdir, entry), os.path.join(new_backup_folder, entry))
    if os.path.isfile(g_0_path) and os.path.isfile(d_0_path):
        shutil.copy(d_0_path, os.path.join(workdir, "D_0.pth"))
        shutil.copy(g_0_path, os.path.join(workdir, "G_0.pth"))
        output_msg += f"成功装载预训练模型,编码器:{encoder}\n"
    else:
        output_msg += f"{encoder}的预训练模型不存在,未装载预训练模型\n"
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && .\workenv\python.exe train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    output_msg += "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
    return output_msg
def continue_training(gpu_selection, encoder):
    """Resume So-VITS training in a new terminal window.

    Requires a usable dataset, a selected encoder and at least one ``G_*.pth``
    checkpoint in the work dir; otherwise a message is returned instead.
    """
    dataset_warn = check_dataset(dataset_dir)
    if dataset_warn is not None:
        return dataset_warn
    if encoder == "":
        return "请先选择预处理对应的编码器"
    generator_ckpts = [
        name for name in os.listdir(workdir)
        if name.startswith('G_') and name.endswith('.pth')
    ]
    if len(generator_ckpts) == 0:
        return "你还没有已开始的训练"
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && .\workenv\python.exe train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def kmeans_training(kmeans_gpu):
    """Start cluster-model training in a new terminal window."""
    if not os.listdir(dataset_dir):
        return "数据集不存在,请检查dataset文件夹"
    if kmeans_gpu:
        cmd = r".\workenv\python.exe cluster/train_cluster.py --gpu"
    else:
        cmd = r".\workenv\python.exe cluster/train_cluster.py"
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,训练聚类模型不会输出日志,CPU训练一般需要5-10分钟左右"
def index_training():
    """Start feature-index training in a new terminal window."""
    dataset_entries = os.listdir(dataset_dir)
    if not dataset_entries:
        return "数据集不存在,请检查dataset文件夹"
    launch = ["cmd", "/c", "start", "cmd", "/k", r".\workenv\python.exe train_index.py -c configs/config.json"]
    subprocess.Popen(launch)
    return "已经在新的终端窗口开始训练"
def diff_training(encoder, k_step_max):
    """Start diffusion-model training from scratch in a new console window.

    All inputs are validated first. Only then are the current contents of
    ``diff_workdir`` moved to a timestamped backup folder and the pretrained
    base model copied in, so a rejected request never disturbs existing
    checkpoints.

    Args:
        encoder: Encoder name chosen in the UI.
        k_step_max: When truthy, the 100-step shallow-diffusion base model is
            used instead of the encoder-specific one.

    Returns:
        A status message for the UI.
    """
    if not os.listdir(dataset_dir):
        return "数据集不存在,请检查dataset文件夹"
    DIFF_PRETRAIN = {
        "768-kstepmax100": "pre_trained_model/diffusion/768l12/max100/model_0.pt",
        "vec768l12": "pre_trained_model/diffusion/768l12/model_0.pt",
        "hubertsoft": "pre_trained_model/diffusion/hubertsoft/model_0.pt",
        "whisper-ppg": "pre_trained_model/diffusion/whisper-ppg/model_0.pt",
    }
    if encoder not in DIFF_PRETRAIN:
        return "你所选的编码器暂时不支持训练扩散模型"
    if k_step_max:
        # NOTE(review): this selects the 768l12 base model for every supported
        # encoder, including hubertsoft and whisper-ppg -- confirm intended.
        encoder = "768-kstepmax100"
    diff_pretrained_model = DIFF_PRETRAIN[encoder]
    if not os.path.isfile(diff_pretrained_model):
        # Refuse before touching the work directory; otherwise the copy below
        # would fail after the old checkpoints had already been moved away.
        return f"未找到扩散预训练模型{diff_pretrained_model},未开始训练"

    existing_files = os.listdir(diff_workdir)
    if existing_files:
        timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
        new_backup_folder = os.path.join(models_backup_path, "diffusion", str(timestamp))
        os.makedirs(new_backup_folder, exist_ok=True)
        for file in existing_files:
            shutil.move(os.path.join(diff_workdir, file), os.path.join(new_backup_folder, file))

    shutil.copy(diff_pretrained_model, os.path.join(diff_workdir, "model_0.pt"))
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", r".\workenv\python.exe train_diff.py -c configs/diffusion.yaml"])
    output_message = "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
    if encoder == "768-kstepmax100":
        output_message += "\n正在进行100步深度的浅扩散训练,已加载底模"
    else:
        output_message += f"\n正在进行完整深度的扩散训练,编码器{encoder}"
    return output_message
def diff_continue_training(encoder):
    """Resume diffusion-model training in a new console window.

    Args:
        encoder: Encoder chosen in the UI; only checked for being non-empty.

    Returns:
        A status message for the UI.
    """
    if len(os.listdir(dataset_dir)) == 0:
        return "数据集不存在,请检查dataset文件夹"
    if encoder == "":
        return "请先选择预处理对应的编码器"
    # Any ".pt" file in the diffusion work directory counts as prior progress.
    checkpoints = []
    for entry in os.listdir(diff_workdir):
        if entry.endswith('.pt'):
            checkpoints.append(entry)
    if not checkpoints:
        return "你还没有已开始的训练"
    launch_cmd = r".\workenv\python.exe train_diff.py -c configs/diffusion.yaml"
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", launch_cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def upload_mix_append_file(files, sfiles):
    """Combine newly uploaded model files with the already selected ones.

    Args:
        files: Files coming from the upload button.
        sfiles: Files already listed in the file widget, or ``None``.

    Returns:
        A tuple of the combined path list and an update for the ratio textbox
        containing a JSON object that maps every path to 100.

    Raises:
        gr.Error: Wraps any exception raised while building the result.
    """
    try:
        uploaded = files if sfiles is None else chain(files, sfiles)
        file_paths = [item.name for item in uploaded]
        ratios = dict.fromkeys(file_paths, 100)
        ratio_text = json.dumps(ratios, indent=2)
        return file_paths, mix_model_output1.update(value=ratio_text)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def mix_submit_click(js, mode):
    """Mix several models according to the ratios typed in the UI.

    Args:
        js: JSON object text mapping model path to mix ratio.
        mode: Display name of the mixing mode ("凸组合" or "线性组合").

    Returns:
        A success message containing the path reported by ``mix_model``.

    Raises:
        gr.Error: On empty input, malformed JSON, an unknown mode, or any
            failure inside ``mix_model``.
    """
    try:
        # Explicit checks instead of ``assert``: assertions vanish under
        # ``python -O`` and produce an empty error message in the UI.
        if js is None or js.lstrip() == "":
            raise ValueError("混合比例不能为空,请先选择需要混合的模型文件")
        modes = {"凸组合": 0, "线性组合": 1}
        mode = modes[mode]
        data = list(json.loads(js).items())
        if not data:
            raise ValueError("混合比例不能为空,请先选择需要混合的模型文件")
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"成功,文件被保存在了{path}"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e) from e
def updata_mix_info(files):
    """Refresh the ratio textbox after the selected model files change.

    Args:
        files: Currently selected files, or ``None`` when the list is cleared.

    Returns:
        An update for the ratio textbox: empty text when ``files`` is
        ``None``, otherwise a JSON object mapping every file path to 100.

    Raises:
        gr.Error: Wraps any exception raised while building the result.
    """
    try:
        if files is None:
            ratio_text = ""
        else:
            ratios = {}
            for item in files:
                ratios[item.name] = 100
            ratio_text = json.dumps(ratios, indent=2)
        return mix_model_output1.update(value=ratio_text)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def pth_identify():
    """Check that every folder under ``root_dir`` holds one .pth and one .json.

    Returns:
        A message for the UI: an error description for the first problem
        found, or a summary listing the recognised model folders.
    """
    if not os.path.exists(root_dir):
        return f"未找到{root_dir}文件夹,请先创建一个{root_dir}文件夹并按第一步流程操作"
    model_dirs = []
    for entry in os.listdir(root_dir):
        if os.path.isdir(os.path.join(root_dir, entry)):
            model_dirs.append(entry)
    if len(model_dirs) == 0:
        return f"未在{root_dir}文件夹中找到模型文件夹,请确保每个模型和配置文件都被放置在单独的文件夹中"
    valid_model_dirs = []
    for path in model_dirs:
        pth_count = len(glob.glob(f"{root_dir}/{path}/*.pth"))
        json_count = len(glob.glob(f"{root_dir}/{path}/*.json"))
        # Stop at the first folder that does not contain exactly one of each.
        if not (pth_count == 1 and json_count == 1):
            return f"错误: 在{root_dir}/{path}中找到了{pth_count}个.pth文件和{json_count}个.json文件。应当确保每个文件夹内有且只有一个.pth文件和.json文件"
        valid_model_dirs.append(path)
    return f"成功识别了{len(valid_model_dirs)}个模型:{valid_model_dirs}"
def onnx_export_func():
    """Convert every model folder under ``root_dir`` to ONNX.

    A folder that lacks a .pth or .json file, or whose conversion fails, is
    reported in the returned text and does not stop the remaining folders.

    Returns:
        A multi-line message with one line per processed folder, or an
        explanatory message when ``root_dir`` does not exist.

    Raises:
        gr.Error: On unexpected failures outside the per-model conversion.
    """
    if not os.path.isdir(root_dir):
        # Same guidance that pth_identify gives for a missing folder.
        return f"未找到{root_dir}文件夹,请先创建一个{root_dir}文件夹并按第一步流程操作"
    output_msg = ""
    try:
        model_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
        for path in model_dirs:
            pth_files = glob.glob(f"{root_dir}/{path}/*.pth")
            json_files = glob.glob(f"{root_dir}/{path}/*.json")
            if not pth_files or not json_files:
                # Indexing [0] on an empty list used to abort the whole batch.
                output_msg += f"转换{path}时出现错误: 未找到.pth文件或.json文件\n"
                continue
            model_file = Path(pth_files[0]).name
            json_file = Path(json_files[0]).name
            try:
                onnx_export(path, json_file, model_file)
                output_msg += f"成功转换{path}\n"
            except Exception as e:
                output_msg += f"转换{path}时出现错误: {e}\n"
        return output_msg
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def load_raw_audio(audio_path):
    """List the .wav files (case-insensitive extension) in a directory.

    Args:
        audio_path: Directory to scan.

    Returns:
        ``(message, wav_files)``; ``wav_files`` is ``None`` when the path is
        not a directory or contains no .wav file.
    """
    if os.path.isdir(audio_path):
        wav_files = []
        for entry in os.listdir(audio_path):
            if entry.lower().endswith('.wav'):
                wav_files.append(entry)
        if len(wav_files) > 0:
            return "成功加载", wav_files
        return "未在目录中找到.wav音频文件", None
    return "请输入正确的目录", None
def slicer_fn(input_dir, output_dir, process_method, max_sec, min_sec):
    """Slice every .wav file in ``input_dir`` into clips under ``output_dir``.

    Args:
        input_dir: Directory holding the source .wav files.
        output_dir: Destination directory; created when missing and must
            differ from ``input_dir``.
        process_method: "丢弃" deletes clips shorter than ``min_sec``;
            "将过短音频整合为长音频" delegates to ``AutoSlicer.merge_short``.
        max_sec: Maximum clip length passed to the slicer.
        min_sec: Minimum clip length.

    Returns:
        A status or summary message for the UI.
    """
    if output_dir == "":
        return "请先选择输出的文件夹"
    if output_dir == input_dir:
        return "输出目录不能和输入目录相同"
    if not os.path.isdir(input_dir):
        # Report a bad input path instead of letting os.listdir raise.
        return "请输入正确的目录"
    slicer = AutoSlicer()
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.lower().endswith(".wav"):
            slicer.auto_slice(filename, input_dir, output_dir, max_sec)
    if process_method == "丢弃":
        for filename in os.listdir(output_dir):
            if filename.endswith(".wav"):
                filepath = os.path.join(output_dir, filename)
                audio, sr = librosa.load(filepath, sr=None, mono=False)
                if librosa.get_duration(y=audio, sr=sr) < min_sec:
                    os.remove(filepath)
    elif process_method == "将过短音频整合为长音频":
        slicer.merge_short(output_dir, max_sec, min_sec)
    file_count, max_duration, min_duration, orig_duration, final_duration = slicer.slice_count(input_dir, output_dir)
    hrs = int(final_duration / 3600)
    mins = int((final_duration % 3600) / 60)
    sec = format(float(final_duration % 60), '.2f')
    if orig_duration != 0:
        rate = format(100 * (final_duration / orig_duration), '.2f')
        rate_msg = f"为原始音频时长的{rate}%"
    else:
        rate_msg = "因未知问题,无法计算切片时长的占比"
    # The minutes unit "分" was missing, which fused minutes and seconds.
    return f"成功将音频切分为{file_count}条片段,其中最长{max_duration}秒,最短{min_duration}秒,切片后的音频总时长{hrs:02d}小时{mins:02d}分{sec}秒,{rate_msg}"
def model_compression(_model, is_fp16):
    """Write a compressed copy of a checkpoint next to the original.

    Args:
        _model: Checkpoint file name inside ``ckpt_read_dir``.
        is_fp16: Forwarded to ``removeOptimizer``.

    Returns:
        A status message for the UI.
    """
    if _model == "":
        return "请先选择要压缩的模型"
    stem, suffix = os.path.splitext(_model)
    source_path = os.path.join(ckpt_read_dir, _model)
    output_path = os.path.join(ckpt_read_dir, f"{stem}_compressed{suffix}")
    removeOptimizer("configs/config.json", source_path, is_fp16, output_path)
    return f"模型已成功被保存在了{output_path}"
def pack_autoload(model_to_pack):
    """Fill in the config name and speaker list for the model being packed.

    Args:
        model_to_pack: Model file name selected in the packing tab.

    Returns:
        ``(config_name, speakers)`` where ``speakers`` is a comma-joined
        string of the keys under ``"spk"`` in the config, or
        ``(message, None)`` when ``auto_load`` reports ``"no_config"``.
    """
    _, config_name, _ = auto_load(model_to_pack)
    if config_name == "no_config":
        return "未找到对应的配置文件,请手动选择", None
    config_path = os.path.join(config_read_dir, config_name)
    speakers = Config(config_path, "json").read()["spk"]
    return config_name, ",".join(speakers.keys())
def release_packing(model_to_pack, model_config, speaker, diff_to_pack, cluster_to_pack):
    """Bundle a model and its optional companions into a release zip.

    Args:
        model_to_pack: Model file name inside ``ckpt_read_dir``.
        model_config: Config file name inside ``config_read_dir``.
        speaker: Speaker name(s) passed to ``ReleasePacker``.
        diff_to_pack: Diffusion model file name, or "no_diff".
        cluster_to_pack: Cluster/index model file name, or "no_cluster".

    Returns:
        A status message for the UI.
    """
    model_path = diff_path = cluster_path = ""
    basename, _ = os.path.splitext(model_to_pack)
    diff_basename, _ = os.path.splitext(diff_to_pack)
    if "" in (model_to_pack, model_config, speaker):
        return "存在必选项为空,请检查后重试"
    has_diff = diff_to_pack != "no_diff"
    has_cluster = cluster_to_pack != "no_cluster"
    released_pack = ReleasePacker(speaker, model_to_pack)
    released_pack.remove_temp("release_packs")
    model_path = os.path.join(ckpt_read_dir, model_to_pack)
    config_path = os.path.join(config_read_dir, model_config)
    # Checkpoints above 300 000 000 bytes are first passed through
    # removeOptimizer and the resulting copy is packed instead.
    if os.stat(model_path).st_size > 300000000:
        slim_path = os.path.join("release_packs", model_to_pack)
        removeOptimizer(config_path, model_path, False, slim_path)
        model_path = slim_path
    if has_diff:
        diff_path = os.path.join(diff_read_dir, diff_to_pack)
    if has_cluster:
        cluster_path = os.path.join(ckpt_read_dir, cluster_to_pack)
    shutil.copyfile("configs_template/config_template.json", "release_packs/config_template.json")
    shutil.copyfile("configs_template/diffusion_template.yaml", "release_packs/diffusion_template.yaml")
    # Optional entries stay in the list as ("", "") placeholders.
    placeholder = ("", "")
    diff_entry = (diff_path, f"models/diffusion/{diff_to_pack}") if has_diff else placeholder
    cluster_entry = (cluster_path, f"models/{cluster_to_pack}") if has_cluster else placeholder
    diff_config_entry = (f"release_packs/{diff_basename}.yaml", f"models/{diff_basename}.yaml") if has_diff else placeholder
    files_to_pack = [
        (model_path, f"models/{model_to_pack}"),
        diff_entry,
        cluster_entry,
        (f"release_packs/{basename}.json", f"models/{basename}.json"),
        diff_config_entry,
        ("release_packs/install.txt", "install.txt"),
    ]
    released_pack.add_file(files_to_pack)
    released_pack.generate_config(diff_to_pack, model_config)
    os.rename("release_packs/config_template.json", f"release_packs/{basename}.json")
    os.rename("release_packs/diffusion_template.yaml", f"release_packs/{diff_basename}.yaml")
    released_pack.pack()
    # Everything except the resulting zip archives is removed afterwards.
    for leftover in os.listdir("release_packs"):
        if not leftover.endswith(".zip"):
            os.remove(os.path.join("release_packs", leftover))
    return "打包成功, 请在release_packs目录下查看"
def release_install(model_zip_path):
    """Install a model from a release zip created by the packing tool.

    Args:
        model_zip_path: Path of the zip archive to install.

    Returns:
        A status message for the UI. When no .txt file is found among the
        unpacked files the archive is rejected.
    """
    model_zip = ReleasePacker("", "")
    model_zip.unpack(model_zip_path)
    # The first .txt entry in release_packs is used as the install manifest.
    install_txt = next(
        (
            os.path.join("release_packs", entry)
            for entry in os.listdir("release_packs")
            if entry.endswith(".txt")
        ),
        None,
    )
    if install_txt is None:
        model_zip.remove_temp("release_packs")
        return "非格式化安装包,无法安装"
    _spk = model_zip.formatted_install(install_txt)
    model_zip.remove_temp("release_packs")
    return f"安装成功,可用说话人{_spk},请启用独立目录模式加载模型"
def sami_inference(ac_key, s_key, app_key, audio_path, model, use_proxy, port):
    """Separate vocals with the SAMI service and save them to ``output.wav``.

    Args:
        ac_key: Access key.
        s_key: Secret key.
        app_key: Application key.
        audio_path: Path of the audio file to upload.
        model: Name of the separation model, sent in the request payload.
        use_proxy: When truthy, ``HTTP_PROXY`` points at a local proxy for
            the duration of the call.
        port: Local proxy port; only used when ``use_proxy`` is truthy.

    Returns:
        ``("output.wav", "Success")`` when this call wrote audio data,
        otherwise ``(None, message)``.

    Raises:
        gr.Error: When the token is missing, the response is not JSON, or
            the service answers with a non-200 status.
    """
    if ac_key == "" or s_key == "" or app_key == "":
        return None, "密钥和APP_KEY不能为空"
    output_path = "output.wav"
    wrote_output = False
    previous_proxy = os.environ.get('HTTP_PROXY')
    if use_proxy:
        os.environ['HTTP_PROXY'] = f"http://127.0.0.1:{int(port)}/"
    try:
        sami_service = SAMIService()
        sami_service.set_ak(ac_key)
        sami_service.set_sk(s_key)
        auth_req = {"appkey": app_key, "token_version": 'volc-auth-v1', "expiration": 3600}
        auth_resp = sami_service.common_json_handler("GetToken", auth_req)
        try:
            auth_token = auth_resp["token"]
        except (KeyError, TypeError) as e:
            if debug:
                traceback.print_exc()
            raise gr.Error(e)
        payload = json.dumps({"model": model})
        with open(audio_path, "rb") as f:
            data = base64.b64encode(f.read()).decode('utf-8')
        req = {
            "appkey": app_key,
            "token": auth_token,
            "namespace": "MusicSourceSeparate",
            "payload": payload,
            "data": data
        }
        resp = requests.post("https://sami.bytedance.com/api/v1/invoke", json=req)
        try:
            sami_resp = resp.json()
        except Exception as e:
            if debug:
                traceback.print_exc()
            raise gr.Error(e)
        if resp.status_code != 200:
            # Report the failure to the UI. sys.exit() here raised SystemExit
            # inside the request handler instead of showing an error.
            print(sami_resp)
            raise gr.Error(f"SAMI请求失败,HTTP状态码{resp.status_code}:{sami_resp}")
        # .get() keeps a missing field from turning a log line into a crash.
        print("response task_id=%s status_code=%s status_text=%s" % (
            sami_resp.get("task_id"), sami_resp.get("status_code"), sami_resp.get("status_text")), end=" ")
        if "payload" in sami_resp and len(sami_resp["payload"]) > 0:
            print("payload=%s" % sami_resp["payload"], end=" ")
        if "data" in sami_resp and len(sami_resp["data"]) > 0:
            # Save audio data into file
            data = base64.b64decode(sami_resp["data"])
            print("data=[%d]bytes" % len(data))
            with open(output_path, "wb") as f:
                f.write(data)
            wrote_output = True
    finally:
        # Always undo the proxy override, even on errors, and put back any
        # value that was set before this call.
        if use_proxy:
            if previous_proxy is None:
                os.environ.pop('HTTP_PROXY', None)
            else:
                os.environ['HTTP_PROXY'] = previous_proxy
    # Only report success for audio written by this call; a file left over
    # from an earlier run must not be returned as the new result.
    if wrote_output:
        return output_path, "Success"
    return None, "出错了"
# Read the saved default settings and derive which directories models,
# configs and diffusion models are read from.
sovits_params, diff_params, second_dir_enable = get_default_settings()
ckpt_read_dir = second_dir if second_dir_enable else workdir
config_read_dir = second_dir if second_dir_enable else config_dir
diff_read_dir = diff_second_dir if second_dir_enable else diff_workdir
current_mode = get_current_mode()
# Create the working directories if they don't exist.
dirs_to_check = [
    workdir,
    second_dir,
    diff_workdir,
    diff_second_dir,
    dataset_dir,
]
# The loop variable is deliberately not called "dir": at module scope that
# would shadow the builtin for the rest of the file.
for required_dir in dirs_to_check:
    os.makedirs(required_dir, exist_ok=True)
# Read the checkpoint/config/cluster/diffusion option lists for the dropdowns.
ckpt_list, config_list, cluster_list, diff_list, diff_config_list = load_options()
# Read the list of available encoders.
encoder_list = get_available_encoder()
# Read GPU info: collect the CUDA devices whose name matches one of the
# known markers and build the strings shown in the training tab.
ngpu = torch.cuda.device_count()
gpu_infos = []
usable_gpu_ids = []
# Markers matched against the device name exactly as reported
# (covers e.g. A10, A100, V100, A40, P40, M40, K80).
_GPU_MARKERS_CASE_SENSITIVE = ("10", "16", "20", "30", "40", "70", "80", "90", "M4", "P4", "T4")
# Markers matched against the upper-cased device name.
_GPU_MARKERS_UPPERCASE = ("RTX", "A50", "TITAN")
if torch.cuda.is_available() and ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if "MX" in gpu_name:
            continue
        gpu_name_upper = gpu_name.upper()
        if (any(marker in gpu_name for marker in _GPU_MARKERS_CASE_SENSITIVE)
                or any(marker in gpu_name_upper for marker in _GPU_MARKERS_UPPERCASE)):
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            usable_gpu_ids.append(str(i))
# At least one usable NVIDIA card was found.
if_gpu_ok = len(gpu_infos) > 0
gpu_info = "\n".join(gpu_infos) if if_gpu_ok else "很遗憾您这没有能用的显卡来支持您训练"
# Join the full device indices. Taking the first character of each info
# string truncated indices of 10 and above (e.g. "10" became "1").
gpus = "-".join(usable_gpu_ids)
# Read CUDA info for inference: map a display label to a torch device string
# for every CUDA device and derive a suggested training batch size.
cuda = {}
# NOTE: despite its name this holds the LARGEST total memory seen so far.
min_vram = 0
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        device_props = torch.cuda.get_device_properties(i)
        current_vram = device_props.total_memory
        min_vram = current_vram if current_vram > min_vram else min_vram
        device_name = device_props.name
        cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"
# 9.31322575e-10 is approximately 1 / 2**30, i.e. bytes -> GiB.
total_vram = round(min_vram * 9.31322575e-10) if min_vram != 0 else 0
if 0 < total_vram <= 12:
    # Keep at least 1 so cards with 2 GiB or less never get a batch size of
    # 0 or below.
    auto_batch = max(1, total_vram - 2)
else:
    auto_batch = total_vram
print(f"Current vram: {total_vram} GiB, recommended batch size: {auto_batch}")
# Offer bf16 only when a usable GPU was detected and torch reports bf16
# support for it.
amp_options = ["fp32", "fp16"]
if if_gpu_ok and torch.cuda.is_bf16_supported():
    amp_options = amp_options + ["bf16"]
# F0 predictor choices shared by the inference and training tabs.
f0_options = ["crepe", "pm", "dio", "harvest", "rmvpe", "fcpe"]
app = gr.Blocks()
with app:
gr.Markdown(value="""
### So-VITS-SVC 4.1-Stable WebUI 推理&训练 v2.3.14
制作协力:bilibili@麦哲云
仅供个人娱乐和非商业用途,禁止用于血腥、暴力、性相关、政治相关内容
[使用文档和常见报错解答](https://www.yuque.com/umoubuton/ueupp5)
整合包作者:bilibili@羽毛布団 | 技术交流群:742817595 | 交流二群:168254971 | 交流三群:416656175 | 交流四群:903516607
""")
with gr.Tabs():
with gr.TabItem("推理") as inference_tab:
mode_caption = gr.Markdown(value=f"""
{current_mode},可在页面底端切换模式
""")
with gr.Row():
choice_ckpt = gr.Dropdown(label="模型选择", choices=ckpt_list, value="no_model")
model_branch = gr.Textbox(label="模型编码器", placeholder="请先选择模型", interactive=False)
with gr.Row():
config_choice = gr.Dropdown(label="配置文件", choices=config_list, value="no_config")
config_info = gr.Textbox(label="配置文件编码器", placeholder="请选择配置文件")
gr.Markdown(value="""**请检查模型和配置文件的编码器是否匹配**""")
with gr.Row():
diff_choice = gr.Dropdown(label="(可选)选择扩散模型", choices=diff_list, value="no_diff", interactive=True)
diff_config_choice = gr.Dropdown(label="扩散模型配置文件", choices=diff_config_list, value="no_diff_config", interactive=True)
cluster_choice = gr.Dropdown(label="(可选)选择聚类模型/特征检索模型", choices=cluster_list, value="no_clu")
refresh = gr.Button("刷新选项")
with gr.Row():
enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False)
only_diffusion = gr.Checkbox(label="是否使用全扩散推理,开启后将不使用So-VITS模型,仅使用扩散模型进行完整扩散推理,不建议使用", value=False)
with gr.Row():
diffusion_method = gr.Dropdown(label="扩散模型采样器", choices=["dpm-solver++","dpm-solver","pndm","ddim","unipc"], value="dpm-solver++")
diffusion_speedup = gr.Number(label="扩散加速倍数,默认为10倍", value=10)
using_device = gr.Dropdown(label="推理设备,默认为自动选择", choices=["Auto",*cuda.keys(),"cpu"], value="Auto")
with gr.Row():
loadckpt = gr.Button("加载模型", variant="primary")
unload = gr.Button("卸载模型", variant="primary")
with gr.Row():
model_message = gr.Textbox(label="Output Message")
sid = gr.Dropdown(label="So-VITS说话人", value="speaker0")
inference_tab.select(refresh_options,[],[choice_ckpt,config_choice,cluster_choice,diff_choice,diff_config_choice])
choice_ckpt.change(auto_load, [choice_ckpt], [model_branch, config_choice, config_info])
config_choice.change(load_json_encoder, [config_choice, choice_ckpt], [config_info])
diff_choice.change(auto_load_diff, [diff_choice], [diff_config_choice])
refresh.click(refresh_options,[],[choice_ckpt,config_choice,cluster_choice,diff_choice,diff_config_choice,mode_caption])
gr.Markdown(value="""
请稍等片刻,模型加载大约需要10秒。后续操作不需要重新加载模型
""")
with gr.Tabs():
with gr.TabItem("单个音频上传"):
vc_input3 = gr.Audio(label="单个音频上传", type="filepath", source="upload")
use_microphone = gr.Checkbox(label="使用麦克风输入")
with gr.TabItem("批量音频上传"):
vc_batch_files = gr.Files(label="批量音频上传", file_types=["audio"], file_count="multiple")
with gr.TabItem("文字转语音"):
gr.Markdown("""
文字转语音(TTS)说明:使用edge_tts服务生成音频,并转换为So-VITS模型音色。
""")
text_input = gr.Textbox(label = "在此输入需要转译的文字(建议打开自动f0预测)",)
with gr.Row():
tts_gender = gr.Radio(label = "说话人性别", choices = ["男","女"], value = "男")
tts_lang = gr.Dropdown(label = "选择语言,Auto为根据输入文字自动识别", choices=SUPPORTED_LANGUAGES, value = "Auto")
with gr.Row():
tts_rate = gr.Slider(label = "TTS语音变速(倍速相对值)", minimum = -1, maximum = 3, value = 0, step = 0.1)
tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = -1, maximum = 1.5, value = 0, step = 0.1)
with gr.Row():
auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会跑调)", value=False)
f0_predictor = gr.Radio(label="f0预测器选择(如遇哑音可以更换f0预测器解决,crepe为原F0使用均值滤波器)", choices=f0_options, value="pm")
cr_threshold = gr.Number(label="F0过滤阈值,只有使用crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
with gr.Row():
vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,默认为0不启用聚类或特征检索,能提升音色相似度,但会导致咬字下降", value=0)
k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000)
with gr.Row():
output_format = gr.Radio(label="音频输出格式", choices=["wav", "flac", "mp3"], value = "wav")
enhancer_adaptive_key = gr.Number(label="使NSF-HIFIGAN增强器适应更高的音域(单位为半音数)|默认为0", value=0)
slice_db = gr.Number(label="切片阈值", value=-50)
cl_num = gr.Number(label="音频自动切片,0为按默认方式切片,单位为秒/s,爆显存可以设置此处强制切片", value=0)
with gr.Accordion("高级设置(一般不需要动)", open=False):
noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=1)
lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
second_encoding = gr.Checkbox(label = "二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,效果时好时差,默认关闭", value=False)
loudness_envelope_adjustment = gr.Number(label="输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络", value = 0)
use_spk_mix = gr.Checkbox(label="动态声线融合,需要手动编辑角色混合轨道,没做完暂时不要开启", value=False, interactive=False)
with gr.Row():
vc_submit = gr.Button("音频转换", variant="primary")
vc_batch_submit = gr.Button("批量转换", variant="primary")
vc_tts_submit = gr.Button("文本转语音", variant="primary")
#interrupt_button = gr.Button("中止转换", variant="danger")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio")
loadckpt.click(load_model_func,[choice_ckpt,cluster_choice,config_choice,enhance,diff_choice,diff_config_choice,only_diffusion,use_spk_mix,using_device,diffusion_method,diffusion_speedup,cl_num],[model_message, sid, cl_num, k_step])
unload.click(model_empty_cache, [], [sid, model_message])
use_microphone.change(source_change, [use_microphone], [vc_input3])
vc_submit.click(vc_fn, [output_format, sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
vc_batch_submit.click(vc_batch_fn, [output_format, sid, vc_batch_files, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1])
vc_tts_submit.click(tts_fn, [text_input, tts_gender, tts_lang, tts_rate, tts_volume, output_format, sid, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
#interrupt_button.click(fn=None, inputs=None, outputs=None, cancels=[vc_event])
with gr.TabItem("训练"):
gr.Markdown(value="""请将数据集文件夹放置在dataset_raw文件夹下,确认放置正确后点击下方获取数据集名称""")
raw_dirs_list=gr.Textbox(label="Raw dataset directory(s):")
get_raw_dirs=gr.Button("识别数据集", variant="primary")
gr.Markdown(value="""确认数据集正确识别后请选择训练使用的特征编码器和f0预测器,**如果要训练扩散模型,请选择Vec768l12或hubertsoft或whisper-ppg,并确保So-VITS和扩散模型使用同一个编码器**""")
with gr.Row():
gr.Markdown(value="""**vec256l9**: ContentVec(256Layer9),旧版本叫v1,So-VITS-SVC 4.0的基础版本,**不推荐使用**
**vec768l12**: 特征输入更换为ContentVec的第12层Transformer输出,模型理论上会更加还原训练集音色
**hubertsoft**: So-VITS-SVC 3.0使用的编码器,咬字更为准确,但可能存在多说话人音色泄露问题
**whisper-ppg**: 来自OpenAI,咬字最为准确,但和Hubertsoft一样存在多说话人音色泄露,且显存占用和训练时间有明显增加。
解锁更多编码器选项,请见[这里](https://www.yuque.com/umoubuton/ueupp5/kmui02dszo5zrqkz)
""")
gr.Markdown(value="""**crepe**: 抗噪能力最强,但预处理速度慢(不过如果你的显卡很强的话速度会很快)
**pm**: 预处理速度快,但抗噪能力较弱
**dio**: 先前版本预处理默认使用的f0预测器,比较拉胯不推荐使用
**harvest**: 有一定抗噪能力,预处理显存占用友好,速度比较慢
**rmvpe**: 最精准的预测器,crepe的完全上位替代
**fcpe**: SVC开发组自研F0预测器,有最快的速度和不输crepe的精度
""")
with gr.Row():
branch_selection = gr.Dropdown(label="选择训练使用的编码器", choices=encoder_list, value="vec768l12", interactive=True)
f0_predictor_selection = gr.Dropdown(label="选择训练使用的f0预测器", choices=f0_options, value="rmvpe", interactive=True)
with gr.Row():
use_diff = gr.Checkbox(label="是否使用浅扩散模型,如要训练浅扩散请勾选此项,将会在预处理时生成浅扩散必备的特征文件(确定不训练可以不勾,能节省一点空间)", value=True)
vol_aug=gr.Checkbox(label="是否启用响度嵌入和音量增强,启用后可以根据输入源控制输出响度,但对数据集质量的要求更高。**仅支持vec768l12编码器**", value=False)
tiny_enable = gr.Checkbox(label="是否启用TINY训练,TINY为实时专用模型,显存占用更低,推理速度更快,但质量有所削减。仅支持vec768,且必须打开响度嵌入", value=False)
with gr.Row():
skip_loudnorm = gr.Checkbox(label="是否跳过响度匹配,如果你已经用音频处理软件做过响度匹配,请勾选此处")
num_processes = gr.Slider(label="预处理使用的CPU线程数,可以大幅加快预处理速度,但线程数过大容易爆显存,建议12G显存设置为2", minimum=1, maximum=multiprocessing.cpu_count(), value=1, step=1)
with gr.Row():
raw_preprocess=gr.Button("数据预处理", variant="primary")
regenerate_config_btn=gr.Button("重新生成配置文件", variant="primary")
preprocess_output=gr.Textbox(label="预处理输出信息,完成后请检查一下是否有报错信息,如无则可以进行下一步", max_lines=999)
clear_preprocess_output=gr.Button("清空输出信息")
with gr.Group():
gr.Markdown(value="""填写训练设置和超参数""")
with gr.Row():
gr.Textbox(label="当前使用显卡信息", value=gpu_info)
gpu_selection=gr.Textbox(label="多卡用户请指定希望训练使用的显卡ID(0,1,2...)", value=gpus, interactive=True)
with gr.Row():
log_interval=gr.Textbox(label="每隔多少步(steps)生成一次评估日志", value=sovits_params['log_interval'])
eval_interval=gr.Textbox(label="每隔多少步(steps)验证并保存一次模型", value=sovits_params['eval_interval'])
keep_ckpts=gr.Textbox(label="仅保留最新的X个模型,超出该数字的旧模型会被删除。设置为0则永不删除", value=sovits_params['keep_ckpts'])
with gr.Row():
batch_size=gr.Textbox(label="批量大小,每步取多少条数据进行训练,大batch有助于训练但显著增加显存占用。6G显存建议设定为4", value=auto_batch)
lr=gr.Textbox(label="学习率,一般不用动,批量大小较大时可以适当增大学习率,但强烈不建议超过0.0002,有炸炉风险", value=sovits_params['learning_rate'])
amp_dtype = gr.Radio(label="训练数据类型,fp16可能会有更快的训练速度和更低的显存占用,但容易炸炉", choices=amp_options, value=sovits_params['amp_dtype'])
all_in_mem=gr.Checkbox(label="是否加载所有数据集到内存中,硬盘IO过于低下、同时内存容量远大于数据集体积时可以启用,能显著加快训练速度", value=sovits_params['all_in_mem'])
with gr.Row():
gr.Markdown("请检查右侧的说话人列表是否和你要训练的目标说话人一致,确认无误后点击写入配置文件,然后就可以开始训练了")
speakers=gr.Textbox(label="说话人列表")
with gr.Accordion(label = "扩散模型配置(训练扩散模型需要写入此处)", open=True):
with gr.Row():
diff_num_workers = gr.Number(label="num_workers, 如果你的电脑配置较高,可以将这里设置为0加快训练速度", value=diff_params['num_workers'])
diff_k_step_max = gr.Checkbox(label="只训练100步深度的浅扩散模型。能加快训练速度并提高模型质量,代价是无法执行超过100步的浅扩散推理", value=diff_params['diff_k_step_max'])
diff_cache_all_data = gr.Checkbox(label="是否缓存数据,启用后可以加快训练速度,关闭后可以节省显存或内存,但会减慢训练速度", value=diff_params['cache_all_data'])
diff_cache_device = gr.Radio(label="若启用缓存数据,使用显存(cuda)还是内存(cpu)缓存,如果显卡显存充足,选择cuda以加快训练速度", choices=["cuda","cpu"], value=diff_params['cache_device'])
diff_amp_dtype = gr.Radio(label="训练数据类型,fp16可能会有更快的训练速度,前提是你的显卡支持", choices=["fp32","fp16"], value=diff_params['amp_dtype'])
with gr.Row():
diff_batch_size = gr.Number(label="批量大小(batch_size),根据显卡显存设置,小显存适当降低该项,6G显存可以设定为48,但该数值不要超过数据集总数量的1/4", value=diff_params['diff_batch_size'])
diff_lr = gr.Number(label="学习率(一般不需要动)", value=diff_params['diff_lr'])
diff_interval_log = gr.Number(label="每隔多少步(steps)生成一次评估日志", value = diff_params['diff_interval_log'])
diff_interval_val = gr.Number(label="每隔多少步(steps)验证并保存一次模型,如果你的批量大小较大,可以适当减少这里的数字,但不建议设置为1000以下", value=diff_params['diff_interval_val'])
diff_force_save = gr.Number(label="每隔多少步强制保留模型,只有该步数的倍数保存的模型会被保留,其余会被删除。设置为与验证步数相同的值则每个模型都会被保留", value=diff_params['diff_force_save'])
with gr.Row():
save_params=gr.Button("将当前设置保存为默认设置", variant="primary")
write_config=gr.Button("写入配置文件", variant="primary")
write_config_output=gr.Textbox(label="输出信息")
gr.Markdown(value="""**点击从头开始训练**将会自动将已有的训练进度保存到models_backup文件夹,并自动装载预训练模型。
**继续上一次的训练进度**将从上一个保存模型的进度继续训练。继续训练进度无需重新预处理和写入配置文件。
关于扩散、聚类和特征检索的详细说明请看[此处](https://www.yuque.com/umoubuton/ueupp5/kmui02dszo5zrqkz)。
""")
with gr.Row():
with gr.Column():
start_training=gr.Button("从头开始训练", variant="primary")
training_output=gr.Textbox(label="训练输出信息")
with gr.Column():
continue_training_btn=gr.Button("继续上一次的训练进度", variant="primary")
continue_training_output=gr.Textbox(label="训练输出信息")
with gr.Row():
with gr.Column():
diff_training_btn=gr.Button("从头训练扩散模型", variant="primary")
diff_training_output=gr.Textbox(label="训练输出信息")
with gr.Column():
diff_continue_training_btn=gr.Button("继续训练扩散模型", variant="primary")
diff_continue_training_output=gr.Textbox(label="训练输出信息")
with gr.Accordion(label = "聚类、特征检索训练", open=False):
with gr.Row():
with gr.Column():
kmeans_button=gr.Button("训练聚类模型", variant="primary")
kmeans_gpu = gr.Checkbox(label="使用GPU训练", value=True)
kmeans_output=gr.Textbox(label="训练输出信息")
with gr.Column():
index_button=gr.Button("训练特征检索模型", variant="primary")
index_output=gr.Textbox(label="训练输出信息")
with gr.TabItem("小工具/实验室特性"):
gr.Markdown(value="""
### So-vits-svc 4.1 小工具/实验室特性
提供了一些有趣或实用的小工具,可以自行探索
""")
with gr.Tabs():
with gr.TabItem("静态声线融合"):
gr.Markdown(value="""
<font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线
注意:
1.该功能仅支持单说话人的模型
2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音
3.保证所有待混合模型的config.json中的model字段是相同的
4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用
5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传
6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果
7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth
8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会
</font>
""")
mix_model_path = gr.Files(label="选择需要混合模型文件")
mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple")
mix_model_output1 = gr.Textbox(
label="混合比例调整,单位/%",
interactive = True
)
mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式",value="凸组合",interactive = True)
mix_submit = gr.Button("声线融合启动", variant="primary")
mix_model_output2 = gr.Textbox(
label="Output Message"
)
with gr.TabItem("onnx转换"):
gr.Markdown(value="""
提供了将.pth模型(批量)转换为.onnx模型的功能
源项目本身自带转换的功能,但不支持批量,操作也不够简单,这个工具可以支持在WebUI中以可视化的操作方式批量转换.onnx模型
有人可能会问,转.onnx模型有什么作用呢?相信我,如果你问出了这个问题,说明这个工具你应该用不上
### Step 1:
在整合包根目录下新建一个"checkpoints"文件夹,将pth模型和对应的json配置文件按目录分别放置到checkpoints文件夹下
看起来应该像这样:
checkpoints
├───xxxx
│ ├───xxxx.pth
│ └───xxxx.json
├───xxxx
│ ├───xxxx.pth
│ └───xxxx.json
└───……
""")
pth_dir_msg = gr.Textbox(label="识别待转换模型", placeholder="请将模型和配置文件按上述说明放置在正确位置")
pth_dir_identify_btn = gr.Button("识别", variant="primary")
gr.Markdown(value="""
### Step 2:
识别正确后点击下方开始转换,转换一个模型可能需要一分钟甚至更久
""")
pth2onnx_btn = gr.Button("开始转换", variant="primary")
pth2onnx_msg = gr.Textbox(label="输出信息")
with gr.TabItem("智能音频切片"):
gr.Markdown(value="""
该工具可以实现对音频的切片,无需调整参数即可完成符合要求的数据集制作。
数据集要求的音频切片约在2-15秒内,用传统的Slicer-GUI切片工具需要精准调参和二次切片才能符合要求,该工具省去了上述繁琐的操作,只要上传原始音频即可一键制作数据集。
""")
with gr.Row():
# Backslashes in the example path are escaped: "\干" and "\s" are invalid
# escape sequences in a normal string literal. The displayed text is unchanged.
raw_audio_path = gr.Textbox(label="原始音频文件夹", placeholder="包含所有待切片音频的文件夹,示例: D:\\干声\\speakers")
load_raw_audio_btn = gr.Button("加载原始音频", variant = "primary")
load_raw_audio_output = gr.Textbox(label = "输出信息")
raw_audio_dataset = gr.Textbox(label = "音频列表", value = "")
slicer_output_dir = gr.Textbox(label = "输出目录", placeholder = "选择输出目录(不要和输入音频是同一个文件夹)")
with gr.Row():
process_method = gr.Radio(label = "对过短音频的处理方式", choices = ["丢弃","将过短音频整合为长音频"], value = "丢弃")
max_sec = gr.Number(label = "切片的最长秒数", value = 15)
min_sec = gr.Number(label = "切片的最短秒数", value = 2)
slicer_btn = gr.Button("开始切片", variant = "primary")
slicer_output_msg = gr.Textbox(label = "输出信息")
mix_model_path.change(updata_mix_info,[mix_model_path],[mix_model_output1])
mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
pth_dir_identify_btn.click(pth_identify, [], [pth_dir_msg])
pth2onnx_btn.click(onnx_export_func, [], [pth2onnx_msg])
load_raw_audio_btn.click(load_raw_audio, [raw_audio_path], [load_raw_audio_output, raw_audio_dataset])
slicer_btn.click(slicer_fn, [raw_audio_path, slicer_output_dir, process_method, max_sec, min_sec], [slicer_output_msg])
with gr.TabItem("模型压缩工具"):
gr.Markdown(value="""
该工具可以实现对模型的体积压缩,在**不影响模型推理功能**的情况下,将原本约600M的So-VITS模型压缩至约200M, 大大减少了硬盘的压力。
**注意:压缩后的模型将无法继续训练,请在确认封炉后再压缩。**
将模型文件放置在logs/44k下,然后选择需要压缩的模型
""")
model_to_compress = gr.Dropdown(label="模型选择", choices=ckpt_list, value="no_model")
fp16_compress = gr.Checkbox(label="使用 fp16 压缩", value=False)
compress_model_btn = gr.Button("压缩模型", variant="primary")
compress_model_output = gr.Textbox(label="输出信息", value="")
compress_model_btn.click(model_compression, [model_to_compress, fp16_compress], [compress_model_output])
with gr.TabItem("模型发布打包/安装"):
gr.Markdown(value="""
如果你想将你的模型分享给他人,请使用该工具对模型进行打包。
该工具可以自动生成正确的配置文件,确保你在打包过程中不出现任何遗漏和错误,接收到使用该工具打包的模型后,也可以用该工具进行自动安装。
""")
with gr.Tabs():
with gr.TabItem("安装"):
with gr.Row():
# Backslashes in the example path are escaped: "\D" and "\m" are invalid
# escape sequences in a normal string literal. The displayed text is unchanged.
model_to_install = gr.Textbox(label = "模型压缩包路径", placeholder="示例:D:\\Downloads\\model_packing.zip")
install_model_btn = gr.Button("安装", variant="primary")
install_output = gr.Textbox(label="输出信息", value="")
with gr.TabItem("打包"):
with gr.Row():
model_to_pack = gr.Dropdown(label="选择要打包的模型", choices=ckpt_list, value="")
model_config = gr.Dropdown(label="选择要打包的模型配置文件", choices=config_list, value="", interactive=True)
speaker_name = gr.Textbox(label="模型说话人名称", placeholder="该模型的说话人名称,仅限数字字母下划线,如模型中有多说话人,请用逗号分割,例如:spk1,spk2,spk3", value = "")
with gr.Row():
diff_to_pack = gr.Dropdown(label="(可选)选择要打包的扩散模型", choices=diff_list, value="no_diff")
cluster_to_pack = gr.Dropdown(label="(可选)选择要打包的聚类或特征检索模型", choices=cluster_list, value="no_cluster")
packing_btn = gr.Button("开始打包", variant="primary")
packing_output_msg = gr.Textbox(label = "输出信息")
model_to_pack.change(pack_autoload, [model_to_pack], [model_config, speaker_name])
packing_btn.click(release_packing, [model_to_pack, model_config, speaker_name, diff_to_pack, cluster_to_pack], [packing_output_msg])
install_model_btn.click(release_install, [model_to_install], [install_output])
with gr.TabItem("歌曲人声分离"):
gr.Markdown(value="""
使用火山引擎 SAMI 技术分离人声,需要联网并自行创建应用 API 后使用。
""")
with gr.Row():
input_audio = gr.Audio(label="上传原始音频", type="filepath", source="upload")
sami_model = gr.Dropdown(label="选择分离模型", choices=["2track_vocal","2track_acc","bs_4track_vocal","bs_4track_acc"], value="bs_4track_vocal")
with gr.Row():
sami_access = gr.Textbox(label="Access Key", placeholder="")
sami_secret = gr.Textbox(label="Secret Key", placeholder="")
sami_appkey = gr.Textbox(label="App Key", placeholder="")
with gr.Row():
use_proxy = gr.Checkbox(label="使用代理", value=False)
proxy_port = gr.Number(label="代理端口", value=7890)
sami_submit = gr.Button("开始分离", variant="primary")
sami_output = gr.Audio(label="输出结果", type="filepath")
sami_output_msg = gr.Textbox(label="输出信息")
sami_submit.click(sami_inference, [sami_access, sami_secret, sami_appkey, input_audio, sami_model, use_proxy, proxy_port], [sami_output, sami_output_msg])
get_raw_dirs.click(load_raw_dirs,[],[raw_dirs_list])
raw_preprocess.click(dataset_preprocess,[branch_selection, f0_predictor_selection, use_diff, vol_aug, skip_loudnorm, num_processes, tiny_enable],[preprocess_output, speakers])
regenerate_config_btn.click(regenerate_config,[branch_selection, vol_aug, tiny_enable],[preprocess_output])
clear_preprocess_output.click(clear_output,[],[preprocess_output])
save_params.click(save_default_settings, [log_interval,eval_interval,keep_ckpts,batch_size,lr,amp_dtype,all_in_mem,diff_num_workers,diff_cache_all_data,diff_cache_device,diff_amp_dtype,diff_batch_size,diff_lr,diff_interval_log,diff_interval_val,diff_force_save,diff_k_step_max], [write_config_output])
write_config.click(config_fn,[log_interval, eval_interval, keep_ckpts, batch_size, lr, amp_dtype, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save, diff_k_step_max],[write_config_output])
start_training.click(training,[gpu_selection, branch_selection, tiny_enable],[training_output])
diff_training_btn.click(diff_training,[branch_selection, diff_k_step_max],[diff_training_output])
continue_training_btn.click(continue_training,[gpu_selection, branch_selection],[continue_training_output])
diff_continue_training_btn.click(diff_continue_training,[branch_selection],[diff_continue_training_output])
kmeans_button.click(kmeans_training,[kmeans_gpu],[kmeans_output])
index_button.click(index_training, [], [index_output])
with gr.Tabs():
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> WebUI设置</font>
""")
with gr.Row():
debug_button = gr.Checkbox(label="Debug模式,反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
read_second_dir = gr.Checkbox(label = "独立目录模式,开启后将从独立目录(./models)读取模型和配置文件,变更后需要刷新选项才能生效", value=second_dir_enable)
debug_button.change(debug_change,[],[])
read_second_dir.change(webui_change,[read_second_dir],[])
app.queue(concurrency_count=1022, max_size=2044).launch(server_name="127.0.0.1",inbrowser=True,quiet=True)