# flake8: noqa: E402
"""Gradio web UI for text-to-speech inference with a multi-speaker model.

The script loads hyper-parameters and a generator checkpoint from
``Data/BangDream``, builds one tab per character (grouped by band/school)
and serves the interface with ``app.launch()``.
"""
import os
import logging
import re_matching

# Quiet down noisy third-party loggers before the heavy imports below run.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="gradio.blocks")

import re
import torch
import utils
from infer import infer, latest_version, get_net_g
import gradio as gr
import numpy as np
from tools.sentence import extrac, is_japanese, is_chinese
import sys
import math

# Generator network; populated in the ``__main__`` section below.
net_g = None

cara_list = ["ひまり", "たえ", "彩", "日菜", "美咲", "ましろ", "燐子", "香子", "珠緒", "たえ"]

# Tab layout of the UI: group name -> character names shown under it.
BandList = {
    "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
    "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
    "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
    "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
    "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
    "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
    "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
    "MyGo&AveMujica(Part)": ["燈", "愛音", "そよ", "立希", "楽奈", "祥子", "睦", "海鈴"],
    "圣翔音乐学园": ["華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈"],
    "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
    "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
    "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
}

# Pick the inference device: Apple MPS on macOS when available, otherwise
# CUDA when available, otherwise CPU (previously "cuda" was assumed even on
# machines without a usable CUDA runtime).
if sys.platform == "darwin" and torch.backends.mps.is_available():
    device = "mps"
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Multiplier used when sizing the pause inserted after each sentence.
# NOTE(review): 44010 looks like it may have been meant as 44100 (or the
# model sampling rate) -- kept unchanged to preserve pause lengths; confirm.
_SILENCE_SCALE = 44010


def generate_audio(
    text,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
):
    """Synthesize ``text`` in one ``infer`` call and return its result.

    When ``language`` is ``'Auto'`` the language is chosen per call:
    ``"JP"`` if ``is_japanese(text)``, else ``"ZH"`` if ``is_chinese(text)``,
    else ``"EN"``. The module-level ``hps``, ``net_g`` and ``device`` are
    passed through to ``infer``; the call runs under ``torch.no_grad()``.
    """
    with torch.no_grad():
        if language == 'Auto':
            if is_japanese(text):
                language = "JP"
            elif is_chinese(text):
                language = "ZH"
            else:
                language = "EN"
        print(text + ":" + language)
        audio = infer(
            text,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            sid=speaker,
            language=language,
            hps=hps,
            net_g=net_g,
            device=device,
        )
    return audio


def _silence_frames(sentence):
    """Return the number of zero samples to append after ``sentence``.

    The pause grows logarithmically with the sentence length; sentences
    recognised by ``is_chinese`` use log base 1000, all others base 3000.
    """
    base = 1000 if is_chinese(sentence) else 3000
    return int(math.log(len(sentence) + 1, base) * _SILENCE_SCALE)


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    LongSentence,
):
    """Gradio callback: synthesize ``text`` and return ``(rate, audio)``.

    With ``LongSentence`` false the whole text is synthesized in one pass.
    With it true the text is split with ``extrac``; every segment longer
    than one character is synthesized separately and followed by a block
    of silence, and the pieces are concatenated.

    If splitting yields no usable segment, the function falls back to the
    single-pass path instead of failing inside ``np.concatenate`` on an
    empty list.

    Returns:
        Tuple of ``hps.data.sampling_rate`` and the audio array.
    """
    synth_args = dict(
        sdp_ratio=sdp_ratio,
        noise_scale=noise_scale,
        noise_scale_w=noise_scale_w,
        length_scale=length_scale,
        speaker=speaker,
        language=language,
    )

    if LongSentence:
        pieces = []
        for sentence in extrac(text):
            if len(sentence) <= 1:
                # Segments of at most one character are skipped.
                continue
            audio = generate_audio(sentence, **synth_args)
            pieces.append(audio)
            pieces.append(
                np.zeros((_silence_frames(sentence),), dtype=audio.dtype)
            )
        if pieces:
            torch.cuda.empty_cache()
            return (hps.data.sampling_rate, np.concatenate(pieces))
        # Nothing survived splitting: fall through to single-pass synthesis.

    audio = generate_audio(text, **synth_args)
    torch.cuda.empty_cache()
    return (hps.data.sampling_rate, audio)


def loadmodel(model):
    """Load the checkpoint at path ``model`` into the global ``net_g``.

    Switches ``net_g`` to eval mode, then calls ``utils.load_checkpoint``
    with ``skip_optimizer=True``. Returns the string ``"success"``; errors
    raised while loading propagate to the caller.
    """
    net_g.eval()
    utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
    return "success"


if __name__ == "__main__":
    hps = utils.get_hparams_from_file('Data/BangDream/config.json')
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path='Data/BangDream/models/G_10000.pth',
        version=version,
        device=device,
        hps=hps,
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["Auto", "ZH", "JP"]

    # Every file below the models directory is offered in the model picker.
    modelPaths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk("Data/BangDream/models/")
        for filename in filenames
    ]

    # NOTE(review): the nesting of the layout blocks below was reconstructed
    # from statement order; confirm it against the intended page layout.
    with gr.Blocks() as app:
        gr.Markdown(
            "少歌邦邦全员TTS,使用本模型请严格遵守法律法规!\n 发布二创作品请注明项目和本模型作者B站@Mahiroshi及项目链接\n从 我的博客站点 查看使用说明"
        )
        for band in BandList:
            with gr.TabItem(band):
                for name in BandList[band]:
                    with gr.TabItem(name):
                        with gr.Row():
                            with gr.Column():
                                with gr.Row():
                                    # NOTE(review): this literal seems to
                                    # have lost its markup (the f-string
                                    # part was empty); restore the intended
                                    # per-character content if available.
                                    gr.Markdown("\n\n")
                                length_scale = gr.Slider(
                                    minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
                                )
                                with gr.Accordion(label="切换模型", open=False):
                                    modelstrs = gr.Dropdown(
                                        label="模型",
                                        choices=modelPaths,
                                        # Avoid IndexError when no model
                                        # file was found on disk.
                                        value=modelPaths[0] if modelPaths else None,
                                        type="value",
                                    )
                                    btnMod = gr.Button("载入模型")
                                    statusa = gr.TextArea()
                                    btnMod.click(
                                        loadmodel,
                                        inputs=[modelstrs],
                                        outputs=[statusa],
                                    )
                            with gr.Column():
                                text = gr.TextArea(
                                    label="输入纯日语或者中文",
                                    placeholder="输入纯日语或者中文",
                                    value="有个人躺在地上,哀嚎......\n有个人睡着了,睡在盒子里。\n我要把它打开,看看他的梦是什么。",
                                )
                                btn = gr.Button("点击生成", variant="primary")
                                audio_output = gr.Audio(label="Output Audio")
                                with gr.Accordion(label="其它参数设定", open=False):
                                    sdp_ratio = gr.Slider(
                                        minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
                                    )
                                    noise_scale = gr.Slider(
                                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
                                    )
                                    noise_scale_w = gr.Slider(
                                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
                                    )
                                    LongSentence = gr.Checkbox(
                                        value=True, label="Generate LongSentence"
                                    )
                                    language = gr.Dropdown(
                                        choices=languages,
                                        value=languages[0],
                                        label="选择语言(默认自动)",
                                    )
                                    # Each tab pre-selects its own character.
                                    speaker = gr.Dropdown(
                                        choices=speakers, value=name, label="说话人"
                                    )
                        btn.click(
                            tts_fn,
                            inputs=[
                                text,
                                speaker,
                                sdp_ratio,
                                noise_scale,
                                noise_scale_w,
                                length_scale,
                                language,
                                LongSentence,
                            ],
                            outputs=[audio_output],
                        )

    print("推理页面已开启!")
    app.launch()