Spaces:
Runtime error
Runtime error
File size: 66,112 Bytes
46cfe25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 |
import multiprocessing
import os
import re
import torch
import glob
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from inference.infer_tool import Svc
import logging
import json
import yaml
import time
import subprocess
import shutil
import utils
import datetime
import traceback
from utils import mix_model
from onnxexport.model_onnx import SynthesizerTrn
from itertools import chain
from compress_model import removeOptimizer
from auto_slicer import AutoSlicer
# Quieten chatty third-party loggers so only warnings and above are shown.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Directories and paths used throughout this module.
workdir = "logs/44k"
diff_workdir = "logs/44k/diffusion"
config_dir = "configs/"
raw_path = "dataset_raw"
raw_wavs_path = "raw"
models_backup_path = 'models_backup'
root_dir = "checkpoints"

# Mutable module state.
debug = False
sovits_params = {}
diff_params = {}
loaded = None
# The inference callbacks test ``model is None`` before use, so the name must
# exist even when no model has been loaded yet.
model = None
def debug_change():
    """Copy the ``value`` attribute of ``debug_button`` into the global ``debug`` flag."""
    global debug
    button_state = debug_button.value
    debug = button_state
def get_default_settings():
    """Load the default training parameters from ``settings.yaml``.

    The ``sovits_params`` and ``diff_params`` sections of the file are stored
    in the module-level variables of the same names and returned as a pair.
    """
    global sovits_params, diff_params
    with open("settings.yaml", 'r') as settings_file:
        stored = yaml.safe_load(settings_file)
    sovits_params = stored['sovits_params']
    diff_params = stored['diff_params']
    return sovits_params, diff_params
def save_default_settings(log_interval,eval_interval,keep_ckpts,batch_size,learning_rate,fp16_run,all_in_mem,num_workers,cache_all_data,cache_device,amp_dtype,diff_batch_size,diff_lr,diff_interval_log,diff_interval_val,diff_force_save):
    """Persist the given training parameters as the new defaults in ``settings.yaml``.

    The file is read, its ``sovits_params`` and ``diff_params`` sections are
    updated with the (type-coerced) arguments, and the result is written back.
    Returns a confirmation message.
    """
    yaml_path = "settings.yaml"
    with open(yaml_path, 'r') as source:
        settings = yaml.safe_load(source)

    # So-VITS section.
    sovits_section = settings['sovits_params']
    sovits_section['log_interval'] = int(log_interval)
    sovits_section['eval_interval'] = int(eval_interval)
    sovits_section['keep_ckpts'] = int(keep_ckpts)
    sovits_section['batch_size'] = int(batch_size)
    sovits_section['learning_rate'] = float(learning_rate)
    sovits_section['fp16_run'] = fp16_run
    sovits_section['all_in_mem'] = all_in_mem

    # Diffusion section.
    diff_section = settings['diff_params']
    diff_section['num_workers'] = int(num_workers)
    diff_section['cache_all_data'] = cache_all_data
    diff_section['cache_device'] = str(cache_device)
    diff_section['amp_dtype'] = str(amp_dtype)
    diff_section['diff_batch_size'] = int(diff_batch_size)
    diff_section['diff_lr'] = float(diff_lr)
    diff_section['diff_interval_log'] = int(diff_interval_log)
    diff_section['diff_interval_val'] = int(diff_interval_val)
    diff_section['diff_force_save'] = int(diff_force_save)

    with open(yaml_path, 'w') as target:
        yaml.safe_dump(settings, target, default_flow_style=False, sort_keys=False)
    return "成功保存默认配置"
def get_model_info(choice_ckpt):
    """Describe which encoder a checkpoint in ``workdir`` appears to belong to.

    The checkpoint is loaded on the CPU and the second dimension of its
    ``emb_g.weight`` tensor is mapped to an encoder label. A message is
    returned instead when that tensor is missing or its size is not recognised.
    """
    checkpoint = torch.load(os.path.join(workdir, choice_ckpt), map_location=torch.device('cpu'))  # load on CPU
    speaker_embedding = checkpoint["model"].get("emb_g.weight")
    if speaker_embedding is None:
        return "所选模型缺少emb_g.weight,你可能选择了一个底模"
    _, width = speaker_embedding.size()
    if width == 768:
        return "Vec768-Layer12"
    if width == 256:
        return "Vec256-Layer9 / HubertSoft"
    if width == 1024:
        return "Whisper-PPG"
    return "不受支持的模型"
def load_json_encoder(config_choice):
    """Return the ``speech_encoder`` declared in a JSON config under ``config_dir``.

    Args:
        config_choice: File name of the config, relative to ``config_dir``.

    Returns:
        ``config["model"]["speech_encoder"]`` as a string. If reading that
        value fails, a user-facing message is returned instead: a hint about
        outdated configs when the error mentions ``speech_encoder``, otherwise
        the error text itself.
    """
    # Join the two components properly instead of concatenating them, so the
    # path does not depend on ``config_dir`` ending with a separator.
    config_file = os.path.join(config_dir, config_choice)
    with open(config_file, 'r') as f:
        config = json.load(f)
    try:
        return str(config["model"]["speech_encoder"])
    except Exception as e:
        if "speech_encoder" in str(e):
            return "你的配置文件似乎是未作兼容的旧版,请根据文档指示对你的配置文件进行修改"
        return f"出错了: {e}"
def load_model_func(ckpt_name,cluster_name,config_name,enhance,diff_model_name,diff_config_name,only_diffusion,encoder,using_device):
    """Create the global ``Svc`` model from the selected files.

    Returns a 3-tuple: a status message, a ``gr.Dropdown.update`` listing the
    keys of the config's ``"spk"`` mapping, and the forced clip length
    (25 when ``encoder`` is ``"Whisper-PPG"``, otherwise 0).
    """
    global model
    config_path = os.path.join(config_dir, config_name)
    if diff_config_name == "no_diff_config":
        diff_config_path = "configs/diffusion.yaml"
    else:
        diff_config_path = os.path.join(config_dir, diff_config_name)

    with open(config_path, 'r') as config_file:
        speakers = json.load(config_file)["spk"]
    default_speaker = next(iter(speakers)) if speakers else "未检测到音色"

    # Enable feature retrieval when the cluster file has a .pkl suffix.
    feature_retrieval = os.path.splitext(cluster_name)[1] == ".pkl"
    uses_diffusion = diff_model_name != "no_diff"
    if using_device == "Auto":
        device = None
    else:
        device = using_device

    model = Svc(
        os.path.join(workdir, ckpt_name),
        config_path,
        device,
        os.path.join(workdir, cluster_name),
        enhance,
        os.path.join(diff_workdir, diff_model_name),
        diff_config_path,
        uses_diffusion,
        only_diffusion,
        False,  # use_spk_mix
        feature_retrieval,
    )

    speaker_names = list(speakers.keys())
    # Whisper requires forced slicing at 25 seconds.
    clip = 25 if encoder == "Whisper-PPG" else 0
    if "cuda" in str(model.dev):
        device_name = torch.cuda.get_device_properties(model.dev).name
    else:
        device_name = str(model.dev)
    index_or_kmeans = "特征索引" if feature_retrieval else "聚类模型"
    clu_load = cluster_name if cluster_name != "no_clu" else "未加载"
    diff_load = diff_model_name if uses_diffusion else "未加载"
    output_msg = f"模型被成功加载到了{device_name}上\n{index_or_kmeans}:{clu_load}\n扩散模型:{diff_load}"
    return output_msg, gr.Dropdown.update(choices=speaker_names, value=default_speaker), clip
def Newload_model_func(ckpt_name,cluster_name,config_name2,enhance2,diff_model_name2,diff_config_name2,only_diffusion2,encoder2,using_device2):
    """Create the global ``Svc`` model, reading most options from ``.value`` attributes.

    Args:
        ckpt_name: Checkpoint file name inside ``workdir``.
        cluster_name: Cluster / feature-index file name inside ``workdir``;
            a ``.pkl`` suffix switches on feature retrieval.
        config_name2, enhance2, diff_model_name2, diff_config_name2,
        only_diffusion2, using_device2: Objects whose ``.value`` attribute
            holds the corresponding setting.
        encoder2: Unused; kept so the call signature stays unchanged.

    Side effects:
        Sets the module-level ``model`` to the new ``Svc`` instance and
        ``loaded`` to ``cluster_name``. Nothing is returned.
    """
    global model, loaded
    config_name = config_name2.value
    enhance = enhance2.value
    diff_model_name = diff_model_name2.value
    diff_config_name = diff_config_name2.value
    only_diffusion = only_diffusion2.value
    using_device = using_device2.value

    config_path = os.path.join(config_dir, config_name)
    if diff_config_name != "no_diff_config":
        diff_config_path = os.path.join(config_dir, diff_config_name)
    else:
        diff_config_path = "configs/diffusion.yaml"
    ckpt_path = os.path.join(workdir, ckpt_name)
    _, _suffix = os.path.splitext(cluster_name)
    fr = _suffix == ".pkl"  # enable feature retrieval for .pkl files
    cluster_path = os.path.join(workdir, cluster_name)
    diff_model_path = os.path.join(diff_workdir, diff_model_name)
    shallow_diffusion = diff_model_name != "no_diff"
    use_spk_mix = False
    device = None if using_device == "Auto" else using_device

    # The status message, speaker list and clip length that the sibling
    # loader builds are not returned from this function, so they are not
    # computed here.
    model = Svc(ckpt_path,
                config_path,
                device,
                cluster_path,
                enhance,
                diff_model_path,
                diff_config_path,
                shallow_diffusion,
                only_diffusion,
                use_spk_mix,
                fr)
    loaded = cluster_name
def get_file_options(directory, extension):
    """Return the names in ``directory`` that end with ``extension``."""
    matches = []
    for entry_name in os.listdir(directory):
        if entry_name.endswith(extension):
            matches.append(entry_name)
    return matches
def load_options():
    """Collect the selectable files for the inference UI.

    Returns five lists: checkpoints (``.pth`` in ``workdir`` not starting with
    ``D_``), JSON configs, cluster/feature-index files, diffusion models and
    YAML configs. The cluster and diffusion lists start with the
    ``"no_clu"`` / ``"no_diff"`` placeholder.
    """
    ckpt_list = []
    for name in get_file_options(workdir, ".pth"):
        if name.startswith("D_"):
            continue
        ckpt_list.append(name)
    config_list = get_file_options(config_dir, ".json")
    # Clustering models (.pt) followed by feature-retrieval indexes (.pkl).
    cluster_list = ["no_clu"]
    cluster_list = cluster_list + get_file_options(workdir, ".pt") + get_file_options(workdir, ".pkl")
    diff_list = ["no_diff"] + get_file_options(diff_workdir, ".pt")
    diff_config_list = get_file_options(config_dir, ".yaml")
    return ckpt_list, config_list, cluster_list, diff_list, diff_config_list
def refresh_options():
    """Re-scan the option lists and return one ``update(choices=...)`` per selector.

    The updates are returned as a tuple in the order: checkpoint, config,
    cluster, diffusion model, diffusion config.
    """
    ckpts, configs, clusters, diffs, diff_configs = load_options()
    ckpt_update = choice_ckpt.update(choices=ckpts)
    config_update = config_choice.update(choices=configs)
    cluster_update = cluster_choice.update(choices=clusters)
    diff_update = diff_choice.update(choices=diffs)
    diff_config_update = diff_config_choice.update(choices=diff_configs)
    return (ckpt_update, config_update, cluster_update, diff_update, diff_config_update)
def vc_infer(sid, input_audio, input_audio_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Run sliced inference on one audio file and save the result under ``results/``.

    The conversion itself is done by ``model.slice_inference`` on
    ``input_audio_path``; the output is written as
    ``<input stem>_<sid>_<unix timestamp>.wav`` at ``model.target_sample``.
    Returns the path of the written file.
    """
    # Integer samples are scaled to float32 and multi-channel audio is mixed
    # down. Note that only the local array is affected: slice_inference below
    # receives the file path, not this array.
    if np.issubdtype(input_audio.dtype, np.integer):
        full_scale = np.iinfo(input_audio.dtype).max
        input_audio = (input_audio / full_scale).astype(np.float32)
    if len(input_audio.shape) > 1:
        input_audio = librosa.to_mono(input_audio.transpose(1, 0))

    converted = model.slice_inference(
        input_audio_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0,
        noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor,
        enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix,
        second_encoding, loudness_envelope_adjustment
    )
    model.clear_empty()

    timestamp = str(int(time.time()))
    if not os.path.exists("results"):
        os.makedirs("results")
    source_stem = os.path.splitext(os.path.basename(input_audio_path))[0]
    output_file_name = "_".join([source_stem, sid, timestamp]) + ".wav"
    output_file_path = os.path.join("results", output_file_name)
    sf.write(output_file_path, converted, model.target_sample, format="wav")
    return output_file_path
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Convert a single uploaded audio clip with the loaded model.

    Args:
        sid: Speaker name passed on to ``vc_infer``.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
            nothing was uploaded.
        Remaining arguments are forwarded unchanged to ``vc_infer``.

    Returns:
        ``("Success", output_path)`` on success, or ``(message, None)`` when
        the audio or the model is missing.

    Raises:
        gr.Error: Wrapping any exception raised during conversion.
    """
    global model
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload a model", None
        sampling_rate, audio = input_audio
        temp_path = "temp.wav"
        try:
            sf.write(temp_path, audio, sampling_rate, format="wav")
            output_file_path = vc_infer(sid, audio, temp_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        finally:
            # Remove the scratch file even when inference fails, so a failed
            # run does not leave it behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def vc_batch_fn(sid, input_audio_files, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Convert every uploaded file in turn via ``vc_infer``.

    Each element of ``input_audio_files`` must expose the file path as
    ``.name``. Returns a status message; any exception is re-raised as
    ``gr.Error``.
    """
    global model
    # Settings shared by every file, in the order vc_infer expects them.
    shared_settings = (vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
    try:
        if input_audio_files is None or len(input_audio_files) == 0:
            return "You need to upload at least one audio file"
        if model is None:
            return "You need to upload a model"
        for uploaded in input_audio_files:
            source_path = uploaded.name
            samples, _rate = sf.read(source_path)
            vc_infer(sid, samples, source_path, *shared_settings)
        return "批量推理完成,音频已经被保存到results文件夹"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def tts_fn(_text, _speaker, sid, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold, k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment):
    """Synthesise speech with ``tts.py`` and convert it with the loaded model.

    ``tts.py`` is run with ``_text`` and ``_speaker``; the ``tts.wav`` it is
    expected to produce is resampled to 44.1 kHz, rewritten as 16-bit PCM and
    passed to ``vc_infer`` together with the remaining arguments.

    Returns:
        ``("Success", output_path)`` on success, or ``(message, None)`` when
        no model is loaded.

    Raises:
        gr.Error: Wrapping any exception raised along the way.
    """
    global model
    try:
        # Check for a model before spending time on synthesis.
        if model is None:
            return "You need to upload a model", None
        subprocess.run([r"python", "tts.py", _text, _speaker])
        target_sr = 44100
        # Keep the file's native rate on load (sr=None) and keep it in its own
        # variable, so the resample below really converts to ``target_sr``
        # rather than from a rate to itself.
        y, orig_sr = librosa.load("tts.wav", sr=None)
        resampled_y = librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr)
        sf.write("tts.wav", resampled_y, target_sr, subtype="PCM_16")
        input_audio = "tts.wav"
        audio, sampling_rate = sf.read(input_audio)
        output_file_path = vc_infer(sid, audio, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def load_raw_dirs():
    """Validate the files under ``raw_path`` and list its speaker sub-folders.

    Only files inside sub-folders are inspected. A message string is returned
    if any file stem contains characters outside the allowed pattern, or
    (failing that) if any file is not a ``.wav``. Otherwise an update for
    ``raw_dirs_list`` is returned, holding the sub-folder names or a
    "not found" notice.
    """
    allowed_pattern = re.compile(r'^[a-zA-Z0-9_@#$%^&()_+\-=\s\.]*$')
    badly_named = []
    not_wav = []
    for root, _dirs, files in os.walk(raw_path):
        if root == raw_path:
            # Files directly in the top-level folder are ignored.
            continue
        for file in files:
            stem, _ext = os.path.splitext(file)
            if allowed_pattern.match(stem) is None:
                badly_named.append(file)
            if not file.lower().endswith('.wav'):
                not_wav.append(file)

    # File-name problems are reported first; the format check only matters
    # when every name is acceptable.
    if badly_named:
        return f"数据集文件名只能包含数字、字母、下划线,以下文件不符合要求,请改名后再试:{badly_named}"
    if not_wav:
        return f"以下文件为非wav格式文件,请删除后再试:{not_wav}"

    with os.scandir(raw_path) as entries:
        spk_dirs = [entry.name for entry in entries if entry.is_dir()]
    if not spk_dirs:
        return raw_dirs_list.update(value="未找到数据集,请检查dataset_raw文件夹")
    return raw_dirs_list.update(value=spk_dirs)
def dataset_preprocess(encoder, f0_predictor, use_diff, vol_aug, skip_loudnorm, num_processes):
    """Run the three preprocessing scripts and stream their output.

    This is a generator. Sub-folders of ``dataset/44k`` are deleted first,
    then ``resample.py``, ``preprocess_flist_config.py`` and
    ``preprocess_hubert_f0.py`` are run one after another through the shell.

    Yields:
        ``(log_text, None)`` while the commands run, and finally
        ``(log_text, gr.Textbox.update(value=spk))`` where ``spk`` is the
        ``"spk"`` entry of ``configs/config.json``.
    """
    diff_arg = "--use_diff" if use_diff else ""
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    skip_loudnorm_arg = "--skip_loudnorm" if skip_loudnorm else ""
    preprocess_commands = [
        r"python resample.py %s" % (skip_loudnorm_arg),
        r"python preprocess_flist_config.py --speech_encoder %s %s" % (encoder, vol_aug_arg),
        r"python preprocess_hubert_f0.py --num_processes %s --f0_predictor %s %s" % (num_processes ,f0_predictor, diff_arg)
    ]
    accumulated_output = ""
    # Clear out the previous dataset.
    for entry in os.listdir("dataset/44k"):
        dataset_dir = "dataset/44k/" + str(entry)
        if os.path.isdir(dataset_dir):
            shutil.rmtree(dataset_dir)
            accumulated_output += f"Deleting previous dataset: {entry}\n"
    for command in preprocess_commands:
        # Reset per command so a failed launch cannot read a stale or
        # undefined progress line below.
        progress_line = None
        try:
            result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True)
        except OSError as e:
            # Popen signals launch failures with OSError.
            accumulated_output += f"Error: {e}\n"
            yield accumulated_output, None
        else:
            accumulated_output += f"Command: {command}, Using Encoder: {encoder}, Using f0 Predictor: {f0_predictor}\n"
            yield accumulated_output, None
            for line in result.stdout:
                if r"it/s" in line or r"s/it" in line:
                    # Progress-bar lines: show only the latest one so the log
                    # is not flooded.
                    progress_line = line
                else:
                    accumulated_output += line
                if progress_line is None:
                    yield accumulated_output, None
                else:
                    yield accumulated_output + progress_line, None
            result.communicate()
            # Popen does not raise on a failing command, so report a
            # non-zero exit status explicitly.
            if result.returncode != 0:
                accumulated_output += f"Error: command exited with code {result.returncode}\n"
                yield accumulated_output, None
        if progress_line is not None:
            accumulated_output += progress_line
        accumulated_output += '-' * 50 + '\n'
        yield accumulated_output, None
    config_path = "configs/config.json"
    with open(config_path, 'r') as f:
        config = json.load(f)
    spk_name = config.get('spk', None)
    yield accumulated_output, gr.Textbox.update(value=spk_name)
def regenerate_config(encoder, vol_aug):
    """Re-run ``preprocess_flist_config.py`` and return its combined output.

    Args:
        encoder: Value passed as ``--speech_encoder``.
        vol_aug: When truthy, ``--vol_aug`` is appended to the command.

    Returns:
        The command's stdout/stderr text followed by a success line when the
        command exits with status 0, or by an ``Error:`` line otherwise.
    """
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    cmd = r"python preprocess_flist_config.py --speech_encoder %s %s" % (encoder, vol_aug_arg)
    output = ""
    try:
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True) as result:
            for line in result.stdout:
                output += line
            return_code = result.wait()
    except OSError as e:
        # Popen signals launch failures with OSError.
        return output + f"Error: {e}\n"
    # Popen does not raise on a failing command; only claim success when the
    # exit status says so.
    if return_code != 0:
        output += f"Error: command exited with code {return_code}\n"
    else:
        output += "Regenerate config file successfully."
    return output
def clear_output():
    """Return a textbox update that replaces the content with a "cleared" notice."""
    notice = "Cleared!>_<"
    return gr.Textbox.update(value=notice)
def read_config(config_path):
    """Parse the JSON file at ``config_path`` and return the decoded object."""
    with open(config_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def config_fn(log_interval, eval_interval, keep_ckpts, batch_size, lr, fp16_run, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save):
    """Write the chosen training parameters into both training config files.

    The ``train`` section of ``configs/config.json`` is updated and saved
    first, then the ``train`` section of ``configs/diffusion.yaml``.
    Returns a confirmation message.
    """
    config_origin = "configs/config.json"
    diff_config = "configs/diffusion.yaml"

    # So-VITS training config (JSON).
    config_data = read_config(config_origin)
    sovits_train = config_data['train']
    sovits_train['log_interval'] = int(log_interval)
    sovits_train['eval_interval'] = int(eval_interval)
    sovits_train['keep_ckpts'] = int(keep_ckpts)
    sovits_train['batch_size'] = int(batch_size)
    sovits_train['learning_rate'] = float(lr)
    sovits_train['fp16_run'] = fp16_run
    sovits_train['all_in_mem'] = all_in_mem
    with open(config_origin, 'w') as json_out:
        json.dump(config_data, json_out, indent=4)

    # Diffusion training config (YAML).
    with open(diff_config, 'r') as yaml_in:
        diff_config_data = yaml.safe_load(yaml_in)
    diff_train = diff_config_data['train']
    diff_train['num_workers'] = int(diff_num_workers)
    diff_train['cache_all_data'] = diff_cache_all_data
    diff_train['batch_size'] = int(diff_batch_size)
    diff_train['lr'] = float(diff_lr)
    diff_train['interval_log'] = int(diff_interval_log)
    diff_train['interval_val'] = int(diff_interval_val)
    diff_train['cache_device'] = str(diff_cache_device)
    diff_train['amp_dtype'] = str(diff_amp_dtype)
    diff_train['interval_force_save'] = int(diff_force_save)
    with open(diff_config, 'w') as yaml_out:
        yaml.safe_dump(diff_config_data, yaml_out, default_flow_style=False, sort_keys=False)
    return "配置文件写入完成"
def check_dataset(dataset_path):
    """Return a warning message if the dataset looks unusable, else ``None``.

    A warning is produced when ``dataset_path`` is empty, or when no file
    ending in ``.npy`` or ``.pt`` exists anywhere beneath it.
    """
    if len(os.listdir(dataset_path)) == 0:
        return "数据集不存在,请检查dataset文件夹"
    has_feature_files = any(
        name.endswith(('.npy', '.pt'))
        for _root, _dirs, names in os.walk(dataset_path)
        for name in names
    )
    if has_feature_files:
        return None
    return "数据集中未检测到f0和hubert文件,可能是预处理未完成"
def training(gpu_selection, encoder):
    """Start a fresh So-VITS training run in a new terminal window.

    Existing contents of ``workdir`` (except ``diffusion``) are moved into a
    timestamped folder under ``models_backup_path``, the pre-trained
    ``D_0.pth`` / ``G_0.pth`` for ``encoder`` are copied in, and
    ``train.py`` is launched through ``cmd /c start cmd /k``.
    Returns a status or warning message.
    """
    config_data = read_config("configs/config.json")
    vol_emb = config_data["model"]["vol_embedding"]
    dataset_warn = check_dataset("dataset/44k")
    if dataset_warn is not None:
        return dataset_warn

    # Folder holding the pre-trained D_0.pth / G_0.pth for each encoder.
    pretrained_dirs = {
        "vec256l9": "pre_trained_model",
        "vec768l12": "pre_trained_model/768l12/vol_emb" if vol_emb else "pre_trained_model/768l12",
        "hubertsoft": "pre_trained_model/hubertsoft",
        "whisper-ppg": "pre_trained_model/whisper-ppg",
        "cnhubertlarge": "pre_trained_model/cnhubertlarge",
        "dphubert": "pre_trained_model/dphubert",
        "whisper-ppg-large": "pre_trained_model/whisper-ppg-large",
    }
    if encoder not in pretrained_dirs:
        return "未知编码器"
    encoder_model_path = pretrained_dirs[encoder]
    d_0_path = os.path.join(encoder_model_path, "D_0.pth")
    g_0_path = os.path.join(encoder_model_path, "G_0.pth")

    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    new_backup_folder = os.path.join(models_backup_path, str(timestamp))
    if os.listdir(workdir) != ['diffusion']:
        os.makedirs(new_backup_folder, exist_ok=True)
        for existing in os.listdir(workdir):
            if existing == "diffusion":
                continue
            shutil.move(os.path.join(workdir, existing), os.path.join(new_backup_folder, existing))

    shutil.copy(d_0_path, os.path.join(workdir, "D_0.pth"))
    shutil.copy(g_0_path, os.path.join(workdir, "G_0.pth"))
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && python train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def continue_training(gpu_selection, encoder):
    """Resume So-VITS training in a new terminal window.

    Requires a usable dataset, a non-empty ``encoder`` and at least one
    ``G_*.pth`` file in ``workdir``; otherwise the matching warning message
    is returned instead of launching ``train.py``.
    """
    dataset_warn = check_dataset("dataset/44k")
    if dataset_warn is not None:
        return dataset_warn
    if encoder == "":
        return "请先选择预处理对应的编码器"
    has_generator_ckpt = False
    for name in os.listdir(workdir):
        if name.startswith('G_') and name.endswith('.pth'):
            has_generator_ckpt = True
            break
    if not has_generator_ckpt:
        return "你还没有已开始的训练"
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && python train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def kmeans_training(kmeans_gpu):
    """Launch ``cluster/train_cluster.py`` in a new terminal window.

    ``--gpu`` is appended when ``kmeans_gpu`` is truthy. Returns a warning
    instead if ``dataset/44k`` is empty.
    """
    if len(os.listdir(r"dataset/44k")) == 0:
        return "数据集不存在,请检查dataset文件夹"
    if kmeans_gpu:
        cmd = r"python cluster/train_cluster.py --gpu"
    else:
        cmd = r"python cluster/train_cluster.py"
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,训练聚类模型不会输出日志,CPU训练一般需要5-10分钟左右"
def index_training():
    """Launch ``train_index.py`` in a new terminal window.

    Returns a warning instead if ``dataset/44k`` is empty.
    """
    if len(os.listdir(r"dataset/44k")) == 0:
        return "数据集不存在,请检查dataset文件夹"
    launch = ["cmd", "/c", "start", "cmd", "/k", r"python train_index.py -c configs/config.json"]
    subprocess.Popen(launch)
    return "已经在新的终端窗口开始训练"
def diff_training(encoder):
    """Start a fresh diffusion-model training run in a new terminal window.

    Args:
        encoder: Encoder name; only ``"vec768l12"`` and ``"hubertsoft"`` have
            a pre-trained diffusion model to start from.

    Returns:
        A status message, or a warning when the dataset folder is empty or
        the encoder is unsupported / not selected.

    Side effects:
        For a supported encoder, existing files in ``diff_workdir`` are moved
        to a timestamped folder under ``models_backup_path/diffusion``, the
        pre-trained ``model_0.pt`` is copied in, and ``train_diff.py`` is
        launched through ``cmd /c start cmd /k``.
    """
    if not os.listdir(r"dataset/44k"):
        return "数据集不存在,请检查dataset文件夹"

    # Validate the encoder BEFORE touching the work directory, so a rejected
    # request does not move the user's existing checkpoints away.
    if encoder == "vec256l9" or encoder == "whisper-ppg":
        return "你所选的编码器暂时不支持训练扩散模型"
    pre_trained_models = {
        "vec768l12": "pre_trained_model/diffusion/768l12/model_0.pt",
        "hubertsoft": "pre_trained_model/diffusion/hubertsoft/model_0.pt",
    }
    pre_trained_model = pre_trained_models.get(encoder)
    if pre_trained_model is None:
        return "请先选择编码器"

    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    new_backup_folder = os.path.join(models_backup_path, "diffusion", str(timestamp))
    existing_files = os.listdir(diff_workdir)
    if len(existing_files) != 0:
        os.makedirs(new_backup_folder, exist_ok=True)
        for file in existing_files:
            shutil.move(os.path.join(diff_workdir, file), os.path.join(new_backup_folder, file))

    shutil.copy(pre_trained_model, os.path.join(diff_workdir, "model_0.pt"))
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", r"python train_diff.py -c configs/diffusion.yaml"])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def diff_continue_training(encoder):
    """Resume diffusion-model training in a new terminal window.

    Requires a non-empty ``dataset/44k``, a non-empty ``encoder`` and at
    least one ``.pt`` file in ``diff_workdir``; otherwise the matching
    warning message is returned instead of launching ``train_diff.py``.
    """
    if len(os.listdir(r"dataset/44k")) == 0:
        return "数据集不存在,请检查dataset文件夹"
    if encoder == "":
        return "请先选择预处理对应的编码器"
    saved_models = []
    for name in os.listdir(diff_workdir):
        if name.endswith('.pt'):
            saved_models.append(name)
    if not saved_models:
        return "你还没有已开始的训练"
    launch = ["cmd", "/c", "start", "cmd", "/k", r"python train_diff.py -c configs/diffusion.yaml"]
    subprocess.Popen(launch)
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def upload_mix_append_file(files,sfiles):
    """Combine newly uploaded files with previously selected ones for mixing.

    Returns the list of file paths (taken from each item's ``.name``) and an
    update for ``mix_model_output1`` holding a JSON object that maps every
    path to the weight 100. Exceptions are re-raised as ``gr.Error``.
    """
    try:
        if sfiles is None:
            combined = files
        else:
            combined = chain(files, sfiles)
        file_paths = [item.name for item in combined]
        weights = dict.fromkeys(file_paths, 100)
        weights_json = json.dumps(weights, indent=2)
        return file_paths, mix_model_output1.update(value=weights_json)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def mix_submit_click(js,mode):
    """Mix several models according to a JSON weight table.

    Args:
        js: JSON object text mapping model paths to mix rates.
        mode: UI label of the mixing mode; ``"凸组合"`` maps to 0 and
            ``"线性组合"`` to 1.

    Returns:
        A success message containing the path returned by ``mix_model``.

    Raises:
        gr.Error: Wrapping any failure, including an empty ``js``.
    """
    try:
        # Validate explicitly rather than with ``assert``: assertions are
        # removed under ``python -O`` and carry no message for the user.
        if js.lstrip() == "":
            raise ValueError("Mix configuration is empty")
        modes = {"凸组合":0, "线性组合":1}
        mode = modes[mode]
        data = json.loads(js)
        data = list(data.items())
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"成功,文件被保存在了{path}"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def updata_mix_info(files):
    """Refresh the mix-weight JSON shown in ``mix_model_output1``.

    With no files the field is cleared; otherwise every file's ``.name`` is
    mapped to the weight 100. Exceptions are re-raised as ``gr.Error``.
    """
    try:
        if files is None:
            return mix_model_output1.update(value="")
        weights = dict.fromkeys((item.name for item in files), 100)
        weights_json = json.dumps(weights, indent=2)
        return mix_model_output1.update(value=weights_json)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def pth_identify():
    """Check that every sub-folder of ``root_dir`` holds exactly one model and one config.

    Returns a message describing the first problem found (missing
    ``root_dir``, no sub-folders, or a folder without exactly one ``.pth``
    and one ``.json``), or a success message listing the recognised folders.
    """
    if not os.path.exists(root_dir):
        return f"未找到{root_dir}文件夹,请先创建一个{root_dir}文件夹并按第一步流程操作"
    model_dirs = []
    for entry in os.listdir(root_dir):
        if os.path.isdir(os.path.join(root_dir, entry)):
            model_dirs.append(entry)
    if len(model_dirs) == 0:
        return f"未在{root_dir}文件夹中找到模型文件夹,请确保每个模型和配置文件都被放置在单独的文件夹中"
    valid_model_dirs = []
    for path in model_dirs:
        pth_files = glob.glob(f"{root_dir}/{path}/*.pth")
        json_files = glob.glob(f"{root_dir}/{path}/*.json")
        exactly_one_each = len(pth_files) == 1 and len(json_files) == 1
        if not exactly_one_each:
            return f"错误: 在{root_dir}/{path}中找到了{len(pth_files)}个.pth文件和{len(json_files)}个.json文件。应当确保每个文件夹内有且只有一个.pth文件和.json文件"
        valid_model_dirs.append(path)
    return f"成功识别了{len(valid_model_dirs)}个模型:{valid_model_dirs}"
def onnx_export():
    """Export the model in every sub-folder of ``root_dir`` to ONNX.

    For each folder the first ``.pth`` and ``.json`` files are used; the
    ``.onnx`` file is written next to the checkpoint.

    Returns:
        A success message, or ``"转换错误:" + str(error)`` when anything
        fails (including an unsupported ``gin_channels`` value).
    """
    model_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
    # First dimension of the dummy hidden-unit input, keyed by the config's
    # ``gin_channels`` value.
    hidden_unit_dim0 = {"256": 1, "768": 192}
    try:
        for path in model_dirs:
            pth_files = glob.glob(f"{root_dir}/{path}/*.pth")
            json_files = glob.glob(f"{root_dir}/{path}/*.json")
            model_file = pth_files[0]
            json_file = json_files[0]
            with open(json_file, 'r') as config_file:
                config_data = json.load(config_file)
            channels = config_data["model"]["gin_channels"]
            para1 = hidden_unit_dim0.get(str(channels))
            if para1 is None:
                # Fail loudly instead of hitting an undefined name or
                # silently reusing the value from the previous model.
                raise ValueError(f"unsupported gin_channels {channels!r} in {json_file}")
            device = torch.device("cpu")
            hps = utils.get_hparams_from_file(json_file)
            SVCVITS = SynthesizerTrn(
                hps.data.filter_length // 2 + 1,
                hps.train.segment_size // hps.data.hop_length,
                **hps.model)
            _ = utils.load_checkpoint(model_file, SVCVITS, None)
            _ = SVCVITS.eval().to(device)
            for i in SVCVITS.parameters():
                i.requires_grad = False
            # Dummy inputs used only to trace the model during export.
            n_frame = 10
            test_hidden_unit = torch.rand(para1, n_frame, channels)
            test_pitch = torch.rand(1, n_frame)
            test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]  # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
            test_uv = torch.ones(1, n_frame, dtype=torch.float32)
            test_noise = torch.randn(1, 192, n_frame)
            test_sid = torch.LongTensor([0])
            input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
            output_names = ["audio", ]
            onnx_file = os.path.splitext(model_file)[0] + ".onnx"
            torch.onnx.export(SVCVITS,
                              (
                                  test_hidden_unit.to(device),
                                  test_pitch.to(device),
                                  test_mel2ph.to(device),
                                  test_uv.to(device),
                                  test_noise.to(device),
                                  test_sid.to(device)
                              ),
                              onnx_file,
                              dynamic_axes={
                                  "c": [0, 1],
                                  "f0": [1],
                                  "mel2ph": [1],
                                  "uv": [1],
                                  "noise": [2],
                              },
                              do_constant_folding=False,
                              opset_version=16,
                              verbose=False,
                              input_names=input_names,
                              output_names=output_names)
        return "转换成功,模型被保存在了checkpoints下的对应目录"
    except Exception as e:
        if debug:
            traceback.print_exc()
        return "转换错误:"+str(e)
def load_raw_audio(audio_path):
    """List the ``.wav`` files (case-insensitive) directly inside ``audio_path``.

    Returns ``(message, file_names)``; ``file_names`` is ``None`` when the
    path is not a directory or contains no ``.wav`` file.
    """
    if not os.path.isdir(audio_path):
        return "请输入正确的目录", None
    wav_files = []
    for name in os.listdir(audio_path):
        if name.lower().endswith('.wav'):
            wav_files.append(name)
    if len(wav_files) == 0:
        return "未在目录中找到.wav音频文件", None
    return "成功加载", wav_files
def slicer_fn(input_dir, output_dir, process_method, max_sec, min_sec):
    """Slice every ``.wav`` in ``input_dir`` into ``output_dir`` and summarise the result.

    Args:
        input_dir: Folder containing the source ``.wav`` files.
        output_dir: Destination folder; created if missing. An empty string
            aborts with a message.
        process_method: ``"丢弃"`` deletes output clips shorter than
            ``min_sec``; ``"将过短音频整合为长音频"`` calls
            ``slicer.merge_short``; any other value leaves short clips alone.
        max_sec: Passed to ``AutoSlicer.auto_slice`` / ``merge_short``.
        min_sec: Minimum clip length in seconds used by the chosen method.

    Returns:
        A summary message built from ``slicer.slice_count``.
    """
    if output_dir == "":
        return "请先选择输出的文件夹"
    slicer = AutoSlicer()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for filename in os.listdir(input_dir):
        if filename.lower().endswith(".wav"):
            slicer.auto_slice(filename, input_dir, output_dir, max_sec)
    if process_method == "丢弃":
        for filename in os.listdir(output_dir):
            if filename.endswith(".wav"):
                filepath = os.path.join(output_dir, filename)
                audio, sr = librosa.load(filepath, sr=None, mono=False)
                if librosa.get_duration(y=audio, sr=sr) < min_sec:
                    os.remove(filepath)
    elif process_method == "将过短音频整合为长音频":
        slicer.merge_short(output_dir, max_sec, min_sec)
    file_count, max_duration, min_duration, orig_duration, final_duration = slicer.slice_count(input_dir, output_dir)
    hrs = int(final_duration / 3600)
    mins = int((final_duration % 3600) / 60)
    sec = format(float(final_duration % 60), '.2f')
    # Guard against a zero original duration (e.g. no input audio) so the
    # percentage does not raise ZeroDivisionError.
    ratio = (final_duration / orig_duration) if orig_duration else 0.0
    rate = format(100 * ratio, '.2f')
    return f"成功将音频切分为{file_count}条片段,其中最长{max_duration}秒,最短{min_duration}秒,切片后的音频总时长{hrs:02d}小时{mins:02d}分{sec}秒,为原始音频时长的{rate}%"
def model_compression(_model):
    """Write an optimizer-free copy of a checkpoint from ``workdir``.

    The copy is saved next to the original as ``<name>_compressed<ext>`` via
    ``removeOptimizer``. Returns a status message; an empty ``_model`` yields
    a prompt to pick a model first.
    """
    if _model == "":
        return "请先选择要压缩的模型"
    stem, suffix = os.path.splitext(_model)
    output_path = os.path.join(workdir, f"{stem}_compressed{suffix}")
    removeOptimizer(os.path.join(workdir, _model), output_path)
    return f"模型已成功被保存在了{output_path}"
# Read the selectable checkpoint / config lists once at start-up.
ckpt_list, config_list, cluster_list, diff_list, diff_config_list = load_options()

# Probe the CUDA devices and keep those whose name matches a known-usable
# family marker.
ngpu = torch.cuda.device_count()
gpu_infos = []
if_gpu_ok = False
if torch.cuda.is_available() and ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if "MX" in gpu_name:
            continue
        # e.g. A10, A100, V100, A40, P40, M40, K80
        if (
            any(marker in gpu_name for marker in ("10", "16", "20", "30", "40", "70", "80", "90", "M4", "P4", "T4"))
            or any(marker in gpu_name.upper() for marker in ("A50", "TITAN"))
        ):
            if_gpu_ok = True  # at least one usable NVIDIA card
            gpu_infos.append("%s\t%s" % (i, gpu_name))
gpu_info = "\n".join(gpu_infos) if if_gpu_ok == True and len(gpu_infos) > 0 else "很遗憾您这没有能用的显卡来支持您训练"
# Each info string is "<index>\t<name>": take the whole index field, not just
# its first character, so indices of 10 and above are not truncated.
gpus = "-".join([info.split("\t", 1)[0] for info in gpu_infos])

# Read the default training parameters.
sovits_params, diff_params = get_default_settings()
app = gr.Blocks()
def Newget_model_info(choice_ckpt2):
    """Inspect a checkpoint and report which feature encoder it was built for.

    Args:
        choice_ckpt2: Checkpoint file name (converted with ``str``), resolved
            relative to ``workdir``.

    Returns:
        A hidden ``gr.Textbox`` whose value is the encoder name looked up from
        the second dimension of ``emb_g.weight``, or a plain error string when
        the checkpoint has no ``emb_g.weight`` entry.
    """
    checkpoint_path = os.path.join(workdir, str(choice_ckpt2))
    # Load onto the CPU; only the tensor shape is needed here.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    speaker_embedding = checkpoint["model"].get("emb_g.weight")
    if speaker_embedding is None:
        return "所选模型缺少emb_g.weight,你可能选择了一个底模"

    # Second dimension of the embedding -> encoder name.
    encoder_by_width = {
        768: "Vec768-Layer12",
        256: "Vec256-Layer9 / HubertSoft",
        1024: "Whisper-PPG",
    }
    _rows, width = speaker_embedding.size()
    encoder_name = encoder_by_width.get(width, "不受支持的模型")
    return gr.Textbox(visible=False, value=encoder_name)
# Build the whole UI inside the Blocks context held in `app`.
with app:
# Header banner with attribution and usage notice (runtime Markdown).
gr.Markdown(value="""
### So-VITS-SVC 4.1-Stable
修改自原项目及bilibili@麦哲云
仅供个人娱乐和非商业用途,禁止用于血腥、暴力、性相关、政治相关内容
weiui来自:bilibili@羽毛布団,交流③群:416656175
镜像作者:bilibili@kiss丿冷鸟鸟,交流群:829974025
""")
with gr.Tabs():
# Inference tab. The model / config / diffusion pickers are disabled
# (kept as commented-out code) and replaced by hidden components that
# carry fixed values.
with gr.TabItem("FC"):
#with gr.Row():
# choice_ckpt = gr.Dropdown(label="模型选择", choices=ckpt_list, value="no_model")
# model_branch = gr.Textbox(label="模型编码器", placeholder="请先选择模型", interactive=False)
#choice_ckpt = gr.Dropdown(value="G_388000.pth", visible=False)
#with gr.Row():
# config_choice = gr.Dropdown(label="配置文件", choices=config_list, value="no_config")
# config_info = gr.Textbox(label="配置文件编码器", placeholder="请选择配置文件")
# Hidden: config file fixed to "config.json".
config_choice = gr.Dropdown(value="config.json", visible=False)
#gr.Markdown(value="""**请检查模型和配置文件的编码器是否匹配**""")
#with gr.Row():
# diff_choice = gr.Dropdown(label="(可选)选择扩散模型", choices=diff_list, value="no_diff", interactive=True)
# diff_config_choice = gr.Dropdown(label="扩散模型配置文件", choices=diff_config_list, value="no_diff_config", interactive=True)
# Hidden: diffusion model and its config fixed to the "no_diff*" values.
diff_choice = gr.Dropdown(value="no_diff", visible=False)
diff_config_choice = gr.Dropdown(value="no_diff_config", visible=False)
# The cluster / feature-retrieval model is the only model choice left visible.
with gr.Row():
cluster_choice = gr.Dropdown(label="(可选)选择聚类模型/特征检索模型", choices=cluster_list, value="no_clu")
with gr.Row():
enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False)
#only_diffusion = gr.Checkbox(label="是否使用全扩散推理,开启后将不使用So-VITS模型,仅使用扩散模型进行完整扩散推理,默认关闭", value=False)
# Hidden: diffusion-only inference fixed to False.
only_diffusion = gr.Checkbox(value=False, visible=False)
#using_device = gr.Dropdown(label="推理设备,默认为自动选择", choices=["Auto","cuda","cpu"], value="Auto")
# Hidden: inference device fixed to 'Auto'.
using_device = gr.Dropdown(value='Auto', visible=False)
#refresh = gr.Button("刷新选项")
#loadckpt = gr.Button("加载模型", variant="primary")
#with gr.Row():
# model_message = gr.Textbox(label="Output Message")
# sid = gr.Dropdown(label="So-VITS说话人", value="speaker0")
# Hidden: speaker id fixed to "1056".
sid = gr.Dropdown(value="1056", visible=False)
#choice_ckpt.change(get_model_info, [choice_ckpt], [model_branch])
# Evaluated once while the UI is being built: inspects G_388000.pth and
# yields a hidden Textbox holding the encoder name (or a plain error
# string when the checkpoint lacks emb_g.weight).
model_branch = Newget_model_info("G_388000.pth")
#config_choice.change(load_json_encoder, [config_choice], [config_info])
#refresh.click(refresh_options,[],[choice_ckpt,config_choice,cluster_choice,diff_choice,diff_config_choice])
gr.Markdown(value="""
请稍等片刻,模型加载大约需要10秒。后续操作不需要重新加载模型
""")
# Three input modes: single audio, batch of audio files, text-to-speech.
with gr.Tabs():
with gr.TabItem("单个音频上传"):
vc_input3 = gr.Audio(label="单个音频上传")
with gr.TabItem("批量音频上传"):
vc_batch_files = gr.Files(label="批量音频上传", file_types=["audio"], file_count="multiple")
with gr.TabItem("文字转语音(实验性)"):
gr.Markdown("""
文字转语音(TTS)说明:使用edge_tts服务生成音频,并转换为So-VITS模型音色。可以在输入文字中使用标点符号简单控制情绪
zh-CN-XiaoyiNeural:中文女声
zh-CN-YunxiNeural: 中文男声
ja-JP-NanamiNeural:日文女声
ja-JP-KeitaNeural:日文男声
zh-CN-liaoning-XiaobeiNeural:东北话女声
zh-CN-shaanxi-XiaoniNeural: 陕西话女声
zh-HK-HiuMaanNeural: 粤语女声
zh-HK-WanLungNeural: 粤语男声
""")
with gr.Row():
text_input = gr.Textbox(label = "在此输入需要转译的文字(建议打开自动f0预测)",)
tts_spk = gr.Dropdown(label = "选择原始音频音色(来自微软TTS)", choices=["zh-CN-XiaoyiNeural", "zh-CN-YunxiNeural", "zh-CN-liaoning-XiaobeiNeural", "zh-CN-shaanxi-XiaoniNeural", "zh-HK-HiuMaanNeural", "zh-HK-WanLungNeural", "ja-JP-NanamiNeural", "ja-JP-KeitaNeural"], value = "zh-CN-XiaoyiNeural")
#with gr.Row():
# tts_rate = gr.Slider(label = "TTS语音变速(倍速)", minimum = 0, maximum = 3, value = 1)
# tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = 0, maximum = 1.5, value = 1)
# Conversion parameters; their values are passed to the click handlers
# wired up after the handler definition.
with gr.Row():
auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会跑调)", value=False)
f0_predictor = gr.Radio(label="f0预测器选择(如遇哑音可以更换f0预测器解决,crepe为原F0使用均值滤波器)", choices=["pm","crepe","harvest","dio"], value="pm")
cr_threshold = gr.Number(label="F0过滤阈值,只有使用crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
with gr.Row():
vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,默认为0不启用聚类或特征检索,能提升音色相似度,但会导致咬字下降", value=0)
k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000)
with gr.Row():
enhancer_adaptive_key = gr.Number(label="使NSF-HIFIGAN增强器适应更高的音域(单位为半音数)|默认为0", value=0,interactive=True)
slice_db = gr.Number(label="切片阈值", value=-50)
cl_num = gr.Number(label="音频自动切片,0为按默认方式切片,单位为秒/s,爆显存可以设置此处强制切片", value=0)
# Advanced settings, collapsed by default (open=False).
with gr.Accordion("高级设置(一般不需要动)", open=False):
noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4)
pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5)
lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=1)
lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75,interactive=True)
second_encoding = gr.Checkbox(label = "二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,效果时好时差,默认关闭", value=False)
loudness_envelope_adjustment = gr.Number(label="输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络", value = 0)
# Shown but not editable (interactive=False).
use_spk_mix = gr.Checkbox(label="动态声线融合,暂时没做完", value=False, interactive=False)
# One submit button per input mode, followed by the shared outputs.
with gr.Row():
vc_submit = gr.Button("音频转换", variant="primary")
vc_batch_submit = gr.Button("批量转换", variant="primary")
vc_tts_submit = gr.Button("文本转语音", variant="primary")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio")
def Newvc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment, clus2):
    """Convert a single uploaded audio clip, loading the model on demand.

    Args:
        sid: Speaker id forwarded to ``vc_infer``.
        input_audio: ``(sampling_rate, samples)`` pair from the audio widget,
            or ``None`` when nothing was uploaded.
        clus2: Selected cluster / feature-retrieval model. When it differs
            from the global ``loaded`` marker the model is (re)loaded through
            ``Newload_model_func`` before converting.
        vc_transform .. loudness_envelope_adjustment: Conversion settings,
            forwarded unchanged to ``vc_infer``.

    Returns:
        ``(message, output_path)``; ``output_path`` is ``None`` when the
        request was rejected because no audio or no model is available.

    Raises:
        gr.Error: Wraps any exception raised while writing the temporary file
            or running the conversion.
    """
    global model, loaded
    # Lazy (re)load: only when the requested cluster model changed.
    if loaded != clus2:
        Newload_model_func("G_388000.pth",clus2,config_choice,enhance,diff_choice,diff_config_choice,only_diffusion,model_branch,using_device)
        loaded = clus2

    # NOTE(review): this fixed path is shared by every request; concurrent
    # conversions would overwrite each other's input — confirm whether
    # vc_infer depends on the file name before making it unique.
    temp_path = "temp.wav"
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        sampling_rate, audio = input_audio
        try:
            sf.write(temp_path, audio, sampling_rate, format="wav")
            output_file_path = vc_infer(sid, audio, temp_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        finally:
            # Remove the temporary file even when writing or inference fails,
            # so a failed request does not leave temp.wav behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
# Wire the three submit buttons to their handlers. Only the single-audio
# handler (Newvc_fn) additionally receives cluster_choice as its last input;
# the batch handler writes to the message box only.
#loadckpt.click(load_model_func,[choice_ckpt,cluster_choice,config_choice,enhance,diff_choice,diff_config_choice,only_diffusion,model_branch,using_device],[model_message, sid, cl_num])
vc_submit.click(Newvc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment,cluster_choice], [vc_output1, vc_output2])
vc_batch_submit.click(vc_batch_fn, [sid, vc_batch_files, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1])
vc_tts_submit.click(tts_fn, [text_input, tts_spk, sid, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2])
'''
with gr.TabItem("训练"):
gr.Markdown(value="""请将数据集文件夹放置在dataset_raw文件夹下,确认放置正确后点击下方获取数据集名称""")
raw_dirs_list=gr.Textbox(label="Raw dataset directory(s):")
get_raw_dirs=gr.Button("识别数据集", variant="primary")
gr.Markdown(value="""确认数据集正确识别后请选择训练使用的特征编码器和f0预测器,**如果要训练扩散模型,请选择Vec768l12或hubertsoft,并确保So-VITS和扩散模型使用同一个编码器**""")
with gr.Row():
gr.Markdown(value="""**vec256l9**: ContentVec(256Layer9),旧版本叫v1,So-VITS-SVC 4.0的基础版本,**暂不支持扩散模型**
**vec768l12**: 特征输入更换为ContentVec的第12层Transformer输出,模型理论上会更加还原训练集音色
**hubertsoft**: So-VITS-SVC 3.0使用的编码器,咬字更为准确,但可能存在多说话人音色泄露问题
**whisper-ppg**: 来自OpenAI,咬字最为准确,但和Hubertsoft一样存在多说话人音色泄露,且显存占用和训练时间有明显增加。**暂不支持扩散模型**
""")
gr.Markdown(value="""**crepe**: 抗噪能力最强,但预处理速度慢(不过如果你的显卡很强的话速度会很快)
**pm**: 预处理速度快,但抗噪能力较弱
**dio**: 先前版本预处理默认使用的f0预测器
**harvest**: 有一定抗噪能力,预处理显存占用友好,速度比较慢
""")
with gr.Row():
branch_selection = gr.Radio(label="选择训练使用的编码器", choices=["vec256l9","vec768l12","hubertsoft","whisper-ppg"], value="vec768l12", interactive=True)
f0_predictor_selection = gr.Radio(label="选择训练使用的f0预测器", choices=["crepe","pm","dio","harvest"], value="crepe", interactive=True)
use_diff = gr.Checkbox(label="是否使用浅扩散模型,如要训练浅扩散模型请勾选此项", value=True)
vol_aug=gr.Checkbox(label="是否启用响度嵌入和音量增强,启用后可以根据输入源控制输出响度,但对数据集质量的要求更高。**仅支持vec768l12编码器**", value=False)
with gr.Row():
skip_loudnorm = gr.Checkbox(label="是否跳过响度匹配,如果你已经用音频处理软件做过响度匹配,请勾选此处")
num_processes = gr.Slider(label="预处理使用的CPU线程数,可以大幅加快预处理速度,但线程数过大容易爆显存,建议12G显存设置为2", minimum=1, maximum=multiprocessing.cpu_count(), value=1, step=1)
with gr.Row():
raw_preprocess=gr.Button("数据预处理", variant="primary")
regenerate_config_btn=gr.Button("重新生成配置文件", variant="primary")
preprocess_output=gr.Textbox(label="预处理输出信息,完成后请检查一下是否有报错信息,如无则可以进行下一步", max_lines=999)
clear_preprocess_output=gr.Button("清空输出信息")
with gr.Group():
gr.Markdown(value="""填写训练设置和超参数""")
with gr.Row():
gr.Textbox(label="当前使用显卡信息", value=gpu_info)
gpu_selection=gr.Textbox(label="多卡用户请指定希望训练使用的显卡ID(0,1,2...)", value=gpus, interactive=True)
with gr.Row():
log_interval=gr.Textbox(label="每隔多少步(steps)生成一次评估日志", value=sovits_params['log_interval'])
eval_interval=gr.Textbox(label="每隔多少步(steps)验证并保存一次模型", value=sovits_params['eval_interval'])
keep_ckpts=gr.Textbox(label="仅保留最新的X个模型,超出该数字的旧模型会被删除。设置为0则永不删除", value=sovits_params['keep_ckpts'])
with gr.Row():
batch_size=gr.Textbox(label="批量大小,每步取多少条数据进行训练,大batch有助于训练但显著增加显存占用。6G显存建议设定为4", value=sovits_params['batch_size'])
lr=gr.Textbox(label="学习率,一般不用动,批量大小较大时可以适当增大学习率,但强烈不建议超过0.0002,有炸炉风险", value=sovits_params['learning_rate'])
fp16_run=gr.Checkbox(label="是否使用fp16混合精度训练,fp16训练可能降低显存占用和训练时间,但对模型质量的影响尚未查证", value=sovits_params['fp16_run'])
all_in_mem=gr.Checkbox(label="是否加载所有数据集到内存中,硬盘IO过于低下、同时内存容量远大于数据集体积时可以启用,能显著加快训练速度", value=sovits_params['all_in_mem'])
with gr.Row():
gr.Markdown("请检查右侧的说话人列表是否和你要训练的目标说话人一致,确认无误后点击写入配置文件,然后就可以开始训练了")
speakers=gr.Textbox(label="说话人列表")
with gr.Accordion(label = "扩散模型配置(训练扩散模型需要写入此处)", open=True):
with gr.Row():
diff_num_workers = gr.Number(label="num_workers, 如果你的电脑配置较高,可以将这里设置为0加快训练速度", value=diff_params['num_workers'])
diff_cache_all_data = gr.Checkbox(label="是否缓存数据,启用后可以加快训练速度,关闭后可以节省显存或内存,但会减慢训练速度", value=diff_params['cache_all_data'])
diff_cache_device = gr.Radio(label="若启用缓存数据,使用显存(cuda)还是内存(cpu)缓存,如果显卡显存充足,选择cuda以加快训练速度", choices=["cuda","cpu"], value=diff_params['cache_device'])
diff_amp_dtype = gr.Radio(label="训练数据类型,fp16可能会有更快的训练速度,前提是你的显卡支持", choices=["fp32","fp16"], value=diff_params['amp_dtype'])
with gr.Row():
diff_batch_size = gr.Number(label="批量大小(batch_size),根据显卡显存设置,小显存适当降低该项,6G显存可以设定为48,但该数值不要超过数据集总数量的1/4", value=diff_params['diff_batch_size'])
diff_lr = gr.Number(label="学习率(一般不需要动)", value=diff_params['diff_lr'])
diff_interval_log = gr.Number(label="每隔多少步(steps)生成一次评估日志", value = diff_params['diff_interval_log'])
diff_interval_val = gr.Number(label="每隔多少步(steps)验证并保存一次模型,如果你的批量大小较大,可以适当减少这里的数字,但不建议设置为1000以下", value=diff_params['diff_interval_val'])
diff_force_save = gr.Number(label="每隔多少步强制保留模型,只有该步数的倍数保存的模型会被保留,其余会被删除。设置为与验证步数相同的值则每个模型都会被保留", value=diff_params['diff_force_save'])
with gr.Row():
save_params=gr.Button("将当前设置保存为默认设置", variant="primary")
write_config=gr.Button("写入配置文件", variant="primary")
write_config_output=gr.Textbox(label="输出信息")
gr.Markdown(value="""**点击从头开始训练**将会自动将已有的训练进度保存到models_backup文件夹,并自动装载预训练模型。
**继续上一次的训练进度**将从上一个保存模型的进度继续训练。继续训练进度无需重新预处理和写入配置文件。
关于扩散、聚类和特征检索的详细说明请看[此处](https://www.yuque.com/umoubuton/ueupp5/kmui02dszo5zrqkz)。
""")
with gr.Row():
with gr.Column():
start_training=gr.Button("从头开始训练", variant="primary")
training_output=gr.Textbox(label="训练输出信息")
with gr.Column():
continue_training_btn=gr.Button("继续上一次的训练进度", variant="primary")
continue_training_output=gr.Textbox(label="训练输出信息")
with gr.Row():
with gr.Column():
diff_training_btn=gr.Button("从头训练扩散模型", variant="primary")
diff_training_output=gr.Textbox(label="训练输出信息")
with gr.Column():
diff_continue_training_btn=gr.Button("继续训练扩散模型", variant="primary")
diff_continue_training_output=gr.Textbox(label="训练输出信息")
with gr.Accordion(label = "聚类、特征检索训练", open=False):
with gr.Row():
with gr.Column():
kmeans_button=gr.Button("训练聚类模型", variant="primary")
kmeans_gpu = gr.Checkbox(label="使用GPU训练", value=True)
kmeans_output=gr.Textbox(label="训练输出信息")
with gr.Column():
index_button=gr.Button("训练特征检索模型", variant="primary")
index_output=gr.Textbox(label="训练输出信息")
'''
# Tools tab: static voice mixing, ONNX export, audio slicing and model
# compression, each in its own sub-tab.
with gr.TabItem("小工具/实验室特性"):
gr.Markdown(value="""
### So-vits-svc 4.1 小工具/实验室特性
提供了一些有趣或实用的小工具,可以自行探索
""")
with gr.Tabs():
# Sub-tab: static voice mixing.
with gr.TabItem("静态声线融合"):
gr.Markdown(value="""
<font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线
注意:
1.该功能仅支持单说话人的模型
2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音
3.保证所有待混合模型的config.json中的model字段是相同的
4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用
5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传
6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果
7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth
8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会
</font>
""")
mix_model_path = gr.Files(label="选择需要混合模型文件")
mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple")
# Editable textbox for the mix ratios; it is also an output of the
# file-list handlers wired below.
mix_model_output1 = gr.Textbox(
label="混合比例调整,单位/%",
interactive = True
)
mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式",value="凸组合",interactive = True)
mix_submit = gr.Button("声线融合启动", variant="primary")
mix_model_output2 = gr.Textbox(
label="Output Message"
)
# Sub-tab: .pth -> .onnx conversion (identify first, then convert).
with gr.TabItem("onnx转换"):
gr.Markdown(value="""
提供了将.pth模型(批量)转换为.onnx模型的功能
源项目本身自带转换的功能,但不支持批量,操作也不够简单,这个工具可以支持在WebUI中以可视化的操作方式批量转换.onnx模型
有人可能会问,转.onnx模型有什么作用呢?相信我,如果你问出了这个问题,说明这个工具你应该用不上
### Step 1:
在整合包根目录下新建一个"checkpoints"文件夹,将pth模型和对应的json配置文件按目录分别放置到checkpoints文件夹下
看起来应该像这样:
checkpoints
├───xxxx
│ ├───xxxx.pth
│ └───xxxx.json
├───xxxx
│ ├───xxxx.pth
│ └───xxxx.json
└───……
""")
pth_dir_msg = gr.Textbox(label="识别待转换模型", placeholder="请将模型和配置文件按上述说明放置在正确位置")
pth_dir_identify_btn = gr.Button("识别", variant="primary")
gr.Markdown(value="""
### Step 2:
识别正确后点击下方开始转换,转换一个模型可能需要一分钟甚至更久
""")
pth2onnx_btn = gr.Button("开始转换", variant="primary")
pth2onnx_msg = gr.Textbox(label="输出信息")
# Sub-tab: automatic audio slicing for dataset preparation.
with gr.TabItem("智能音频切片"):
gr.Markdown(value="""
该工具可以实现对音频的切片,无需调整参数即可完成符合要求的数据集制作。
数据集要求的音频切片约在2-15秒内,用传统的Slicer-GUI切片工具需要精准调参和二次切片才能符合要求,该工具省去了上述繁琐的操作,只要上传原始音频即可一键制作数据集。
""")
with gr.Row():
# NOTE(review): the placeholder below holds backslashes in a non-raw
# string ("\s" is not a recognised escape sequence) — consider a raw
# string or doubled backslashes.
raw_audio_path = gr.Textbox(label="原始音频文件夹", placeholder="包含所有待切片音频的文件夹,示例: D:\干声\speakers")
load_raw_audio_btn = gr.Button("加载原始音频", variant = "primary")
load_raw_audio_output = gr.Textbox(label = "输出信息")
raw_audio_dataset = gr.Textbox(label = "音频列表", value = "")
slicer_output_dir = gr.Textbox(label = "输出目录", placeholder = "选择输出目录")
with gr.Row():
# How to treat clips shorter than min_sec: discard them or merge them.
process_method = gr.Radio(label = "对过短音频的处理方式", choices = ["丢弃","将过短音频整合为长音频"], value = "丢弃")
max_sec = gr.Number(label = "切片的最长秒数", value = 15)
min_sec = gr.Number(label = "切片的最短秒数", value = 2)
slicer_btn = gr.Button("开始切片", variant = "primary")
slicer_output_msg = gr.Textbox(label = "输出信息")
# Event wiring for the mixing, ONNX and slicing tools.
mix_model_path.change(updata_mix_info,[mix_model_path],[mix_model_output1])
mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
pth_dir_identify_btn.click(pth_identify, [], [pth_dir_msg])
pth2onnx_btn.click(onnx_export, [], [pth2onnx_msg])
load_raw_audio_btn.click(load_raw_audio, [raw_audio_path], [load_raw_audio_output, raw_audio_dataset])
slicer_btn.click(slicer_fn, [raw_audio_path, slicer_output_dir, process_method, max_sec, min_sec], [slicer_output_msg])
# Sub-tab: model compression; the dropdown is filled from ckpt_list and the
# button calls model_compression with the selected file name.
with gr.TabItem("模型压缩工具"):
gr.Markdown(value="""
该工具可以实现对模型的体积压缩,在**不影响模型推理功能**的情况下,将原本约600M的So-VITS模型压缩至约200M, 大大减少了硬盘的压力。
**注意:压缩后的模型将无法继续训练,请在确认封炉后再压缩。**
将模型文件放置在logs/44k下,然后选择需要压缩的模型
""")
model_to_compress = gr.Dropdown(label="模型选择", choices=ckpt_list, value="")
compress_model_btn = gr.Button("压缩模型", variant="primary")
compress_model_output = gr.Textbox(label="输出信息", value="")
compress_model_btn.click(model_compression, [model_to_compress], [compress_model_output])
"""
get_raw_dirs.click(load_raw_dirs,[],[raw_dirs_list])
raw_preprocess.click(dataset_preprocess,[branch_selection, f0_predictor_selection, use_diff, vol_aug, skip_loudnorm, num_processes],[preprocess_output, speakers])
regenerate_config_btn.click(regenerate_config,[branch_selection, vol_aug],[preprocess_output])
clear_preprocess_output.click(clear_output,[],[preprocess_output])
save_params.click(save_default_settings, [log_interval,eval_interval,keep_ckpts,batch_size,lr,fp16_run,all_in_mem,diff_num_workers,diff_cache_all_data,diff_cache_device,diff_amp_dtype,diff_batch_size,diff_lr,diff_interval_log,diff_interval_val,diff_force_save], [write_config_output])
write_config.click(config_fn,[log_interval, eval_interval, keep_ckpts, batch_size, lr, fp16_run, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save],[write_config_output])
start_training.click(training,[gpu_selection, branch_selection],[training_output])
diff_training_btn.click(diff_training,[branch_selection],[diff_training_output])
continue_training_btn.click(continue_training,[gpu_selection, branch_selection],[continue_training_output])
diff_continue_training_btn.click(diff_continue_training,[branch_selection],[diff_continue_training_output])
kmeans_button.click(kmeans_training,[kmeans_gpu],[kmeans_output])
index_button.click(index_training, [], [index_output])
"""
# WebUI settings panel: a single checkbox, initialised from the global
# `debug` flag, that calls debug_change with no inputs or outputs on change.
with gr.Tabs():
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> WebUI设置</font>
""")
debug_button = gr.Checkbox(label="Debug模式,反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
debug_button.change(debug_change,[],[])
# Enable the request queue and start the server.
# NOTE(review): `concurrency_count` is a Gradio 3.x argument of queue();
# Gradio 4.x rejects it in favour of per-event `concurrency_limit` — confirm
# against the pinned gradio version.
app.queue(concurrency_count=1022, max_size=2044).launch()
|