# flake8: noqa: E402
"""Gradio front end for Bert-VITS2 speech synthesis.

The module exposes three pieces of functionality through one Gradio app:

* single-utterance / long-text synthesis (``tts_fn``),
* audiobook generation with ASS subtitles from an uploaded file
  (``audiobook``),
* hot-swapping the generator checkpoint (``loadmodel``).

``hps`` and ``net_g`` are module-level globals that are populated in the
``__main__`` section before the UI is launched; the synthesis functions
read them at call time.
"""
import os
import logging
import re_matching

# Silence chatty third-party loggers before the remaining imports run
# (this is why the imports below are not all at the top of the file).
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
)

logger = logging.getLogger(__name__)

import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="gradio.blocks")

import shutil
from datetime import datetime
import re
import torch
import utils
from infer import infer, latest_version, get_net_g
import gradio as gr
import numpy as np
from tools.sentence import (
    extrac,
    is_japanese,
    is_chinese,
    seconds_to_ass_time,
    extract_text_from_file,
    remove_annotations,
)
import sys
import math
from scipy.io.wavfile import write
from tools.translate import translate
import random

# Generator network; assigned in the ``__main__`` section.
net_g = None

# Pool the audiobook generator draws its initial voice from.
cara_list = ["ひまり", "たえ", "彩", "日菜", "美咲", "ましろ", "燐子", "香子", "珠緒", "たえ"]

# Tab layout of the UI: group name -> speakers shown under that group.
BandList = {
    "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
    "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
    "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
    "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
    "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
    "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
    "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
    "MyGo&AveMujica(Part)": ["燈", "愛音", "そよ", "立希", "楽奈", "祥子", "睦", "海鈴"],
    "圣翔音乐学园": ["華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈"],
    "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
    "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
    "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
}

# Prefer CUDA, then Apple MPS (macOS only), then CPU.
device = (
    "cuda:0"
    if torch.cuda.is_available()
    else (
        "mps"
        if sys.platform == "darwin" and torch.backends.mps.is_available()
        else "cpu"
    )
)

# ASS subtitle header written at the top of every generated .ass file.
# NOTE(review): the line breaks were missing in the reviewed copy of this
# file and have been reconstructed from the ASS section layout
# ([Script Info] / [V4+ Styles] / [Events]); confirm against upstream.
_ASS_HEADER = """[Script Info]
; 我没意见
Title: Audiobook
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 640
PlayResY: 360
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""


def generate_audio(
    text,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    speaker,
    language,
):
    """Synthesize one piece of text with the loaded model.

    Args:
        text: Text to speak.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer``.
        speaker: Speaker id forwarded to ``infer`` as ``sid``.
        language: Language code, or ``'Auto'`` to pick ``"JP"`` when
            ``is_japanese(text)``, else ``"ZH"`` when ``is_chinese(text)``,
            else ``"EN"``.

    Returns:
        The result of ``gr.processing_utils.convert_to_16_bit_wav`` applied
        to the inferred audio, or ``None`` when ``text`` has fewer than two
        characters.
    """
    if len(text) < 2:
        return None

    with torch.no_grad():
        if language == 'Auto':
            language = "EN"
            if is_japanese(text):
                language = "JP"
            elif is_chinese(text):
                language = "ZH"

        current_time = datetime.now()
        print(f"{current_time}:{speaker}:{text}:{language}")

        audio = infer(
            text,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            sid=speaker,
            language=language,
            hps=hps,
            net_g=net_g,
            device=device,
        )
        return gr.processing_utils.convert_to_16_bit_wav(audio)


def _pause_frames(sentence, sampling_rate):
    """Return the number of silent samples to insert after ``sentence``.

    The pause grows logarithmically with the sentence length; Chinese
    sentences use log base 1000, everything else base 3000.
    """
    base = 1000 if is_chinese(sentence) else 3000
    return int(math.log(len(sentence) + 1, base) * sampling_rate)


def tts_fn(
    text: str,
    speaker,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    language,
    LongSentence,
):
    """Gradio callback: synthesize ``text`` as one clip or sentence by sentence.

    Args:
        text: Input text.
        speaker: Speaker id.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Synthesis
            parameters forwarded to ``generate_audio``.
        language: Language code or ``'Auto'``.
        LongSentence: When true, ``text`` is split with ``extrac`` and each
            sentence is synthesized separately with a pause after it.

    Returns:
        ``(hps.data.sampling_rate, samples)``, or ``None`` when no audio was
        produced (for example because the text is too short).
    """
    sampling_rate = hps.data.sampling_rate

    if not LongSentence:
        audio = generate_audio(
            text,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            speaker=speaker,
            language=language,
        )
        torch.cuda.empty_cache()
        if audio is None:
            return None
        return (sampling_rate, audio)

    audio_fin = []
    for sentence in extrac(text):
        if len(sentence) <= 1:
            continue
        audio = generate_audio(
            sentence,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            speaker=speaker,
            language=language,
        )
        # Pause length is expressed in samples of the model's own rate
        # (the original used the constant 44010, a typo for the rate).
        silence = np.zeros(
            (_pause_frames(sentence, sampling_rate),), dtype=audio.dtype
        )
        audio_fin.append(audio)
        audio_fin.append(silence)

    if not audio_fin:
        # np.concatenate raises ValueError on an empty list.
        return None
    return (sampling_rate, np.concatenate(audio_fin))


def _parse_speaker_map(spealerList):
    """Parse the ``speaker|alias`` table into an ``alias -> speaker`` dict.

    One entry per line. Lines without a ``|`` separator (blank lines, or the
    ``"无"`` placeholder used for an empty table) are ignored. When an alias
    appears more than once, the last row wins.
    """
    mapping = {}
    for row in re.split('\n', spealerList):
        fields = row.split("|")
        if len(fields) < 2:
            continue
        mapping[fields[1]] = fields[0]
    return mapping


def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
    """Synthesize one group of sentences into a WAV file plus ASS subtitles.

    Each sentence may carry a ``name|text`` prefix. The name selects the
    voice: directly when it is a key of ``hps.data.spk2id``, and/or through
    the alias table in ``spealerList`` (the alias table takes precedence).
    A selected voice stays active for following sentences until another
    name matches.

    Args:
        group: Iterable of sentences.
        outputPath: Directory receiving ``audiobook_part_<n>.wav`` / ``.ass``.
        group_index: Number used in the output file names.
        sampling_rate: Sample rate used for the WAV header, the silence
            length and the subtitle timing.
        speaker: Default speaker (see the NOTE in the body).
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Synthesis
            parameters forwarded to ``generate_audio``.
        spealerList: ``speaker|alias`` rows separated by newlines.
        silenceTime: Seconds of silence appended after every sentence.

    Returns:
        ``(hps.data.sampling_rate, samples)`` for the whole group, or
        ``None`` (and no files written) when no sentence could be
        synthesized.
    """
    audio_fin = []
    ass_entries = []
    start_time = 0
    # NOTE(review): the ``speaker`` argument is overwritten here, so the
    # "default speaker" chosen in the UI is never used. Kept as-is because
    # it may be deliberate; confirm the intent.
    speaker = random.choice(cara_list)

    # Both lookups are loop-invariant, so build them once.
    alias_to_speaker = _parse_speaker_map(spealerList)
    known_speakers = set(hps.data.spk2id.keys())
    silence_frames = int(silenceTime * sampling_rate)

    for sentence in group:
        # Best effort: a sentence that fails to synthesize is logged and
        # skipped rather than aborting the whole group.
        try:
            FakeSpeaker = sentence.split("|")[0]
            print(FakeSpeaker)
            if FakeSpeaker in known_speakers:
                speaker = FakeSpeaker
            if FakeSpeaker in alias_to_speaker:
                speaker = alias_to_speaker[FakeSpeaker]

            if sentence == '\n':
                continue

            spoken_text = remove_annotations(sentence.split("|")[-1]).replace(" ", "")
            audio = generate_audio(
                spoken_text,
                speaker=speaker,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                language='Auto',
            )
            if audio is None:
                # Text was too short to synthesize.
                continue

            silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
            audio_fin.append(audio)
            audio_fin.append(silence_data)

            duration = len(audio) / sampling_rate
            end_time = start_time + duration + silenceTime
            ass_entries.append(
                "Dialogue: 0,{},{},".format(
                    seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)
                )
                + "Default,,0,0,0,,{}".format(sentence.replace("|", ":"))
            )
            start_time = end_time
        except Exception:
            logger.exception("Skipping sentence that failed to synthesize: %r", sentence)

    if not audio_fin:
        logger.warning("Group %s produced no audio; nothing written.", group_index)
        return None

    merged = np.concatenate(audio_fin)
    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
    ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
    write(wav_filename, sampling_rate, merged)
    with open(ass_filename, 'w', encoding='utf-8') as f:
        f.write(_ASS_HEADER + '\n'.join(ass_entries))
    return (hps.data.sampling_rate, merged)


def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath):
    """Gradio callback: turn an uploaded file into audiobook parts.

    The output directory is ``filepath`` when CUDA is available and
    ``"books"`` otherwise. WARNING: an existing output directory is deleted
    before generation starts.

    Without CUDA only the first group is generated and returned.

    Args:
        inputFile: Uploaded file object; its ``.name`` is passed to
            ``extract_text_from_file``.
        groupsize: Number of sentences per output file.
        speaker: Default speaker forwarded to the group generator.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Synthesis
            parameters.
        spealerList: ``speaker|alias`` table (may be empty).
        silenceTime: Seconds of silence between sentences.
        filepath: Output directory used when CUDA is available.

    Returns:
        The result of the last generated group (``(rate, samples)`` or
        ``None``); ``None`` when the input contains no sentences.
    """
    use_cuda = torch.cuda.is_available()
    directory_path = filepath if use_cuda else "books"
    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)
    os.makedirs(directory_path)

    text = extract_text_from_file(inputFile.name)
    sentences = extrac(text)

    # Sliders may deliver floats; range() needs a positive int step.
    group_size = max(1, int(groupsize))
    if spealerList == "":
        spealerList = "无"

    result = None
    starts = range(0, len(sentences), group_size)
    for group_index, start in enumerate(starts, start=1):
        group = sentences[start:start + group_size]
        result = generate_audio_and_srt_for_group(
            group,
            directory_path,
            group_index,
            # Use the model's rate so the WAV header, the timing and the
            # returned tuple all agree (was a hard-coded 44100).
            hps.data.sampling_rate,
            speaker,
            sdp_ratio,
            noise_scale,
            noise_scale_w,
            length_scale,
            spealerList,
            silenceTime,
        )
        if not use_cuda:
            return result
    return result


def loadmodel(model):
    """Load the checkpoint at path ``model`` into the global ``net_g``.

    Returns:
        The string ``"success"``.
    """
    _ = net_g.eval()
    _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
    return "success"


if __name__ == "__main__":
    hps = utils.get_hparams_from_file('Data/BangDream/config.json')
    version = hps.version if hasattr(hps, "version") else latest_version
    net_g = get_net_g(
        model_path='Data/BangDream/models/G_10000.pth', version=version, device=device, hps=hps
    )
    speaker_ids = hps.data.spk2id
    speakers = list(speaker_ids.keys())
    languages = ["Auto", "ZH", "JP"]

    # Every file below the models directory is offered as a checkpoint.
    modelPaths = []
    for dirpath, dirnames, filenames in os.walk("Data/BangDream/models/"):
        for filename in filenames:
            modelPaths.append(os.path.join(dirpath, filename))

    # NOTE(review): the widget nesting below was reconstructed from a copy
    # of the file whose indentation was lost; confirm against upstream.
    with gr.Blocks() as app:
        # NOTE(review): line breaks in this text were reconstructed (one
        # paragraph per "\n" escape found in the reviewed copy).
        gr.Markdown(
            value=(
                "少歌邦邦全员在线语音合成(Bert-Vits2)\n\n"
                "作者:B站@Mahiroshi https://space.bilibili.com/19874615\n\n"
                "声音归属:BangDream及少歌手游\n\n"
                "Bert-VITS2项目:https://github.com/Stardust-minus/Bert-VITS2\n\n"
                "使用参考: https://nijigaku.top/2023/10/03/BangDreamTTS\n\n"
                "数据集制作: https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/tree/main/%E7%88%AC%E8%99%AB\n"
                "服务器启动示例: https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/server.py\n\n"
                "使用本模型请严格遵守法律法规!禁止生成任何有损声优或者企划的内容!!!!!\n\n"
                "このモデルを使用する際は法律法規を厳守してください!声優や企画に損害を与える内容の生成は禁止です!!!!!\n\n"
                "Please strictly follow the laws in your country and regulations when using this model! It is prohibited to generate any content that is harmful to others!!!!!\n\n"
                "发布二创作品请标注本项目作者及链接、作品使用Bert-VITS2 AI生成!\n\n"
            )
        )
        for band in BandList:
            with gr.TabItem(band):
                for name in BandList[band]:
                    with gr.TabItem(name):
                        with gr.Row():
                            with gr.Column():
                                with gr.Row():
                                    # NOTE(review): the markup that belongs
                                    # in this component (three adjacent
                                    # literals, the middle one an f-string)
                                    # was stripped from the reviewed copy;
                                    # restore it from the upstream source.
                                    gr.Markdown("\n\n")
                                length_scale = gr.Slider(
                                    minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
                                )
                                LongSentence = gr.Checkbox(value=True, label="自动拆分句子")
                                with gr.Accordion(label="切换模型", open=False):
                                    modelstrs = gr.Dropdown(
                                        label="模型",
                                        choices=modelPaths,
                                        value=modelPaths[0] if modelPaths else None,
                                        type="value",
                                    )
                                    btnMod = gr.Button("载入模型")
                                    statusa = gr.TextArea()
                                    btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa])
                            with gr.Column():
                                text = gr.TextArea(
                                    label="输入纯日语或者中文",
                                    placeholder="输入纯日语或者中文",
                                    value="有个人躺在地上,哀嚎......\n有个人睡着了,睡在盒子里。\n我要把它打开,看看他的梦是什么。",
                                )
                                btn = gr.Button("点击生成", variant="primary")
                                audio_output = gr.Audio(label="Output Audio")
                                btntran = gr.Button("快速中翻日")
                                translateResult = gr.TextArea("从这复制翻译后的文本")
                                btntran.click(translate, inputs=[text], outputs=[translateResult])
                                with gr.Accordion(label="其它参数设定", open=False):
                                    sdp_ratio = gr.Slider(
                                        minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
                                    )
                                    noise_scale = gr.Slider(
                                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
                                    )
                                    noise_scale_w = gr.Slider(
                                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
                                    )
                                    language = gr.Dropdown(
                                        choices=languages, value=languages[0], label="选择语言(默认自动)"
                                    )
                                    speaker = gr.Dropdown(
                                        choices=speakers, value=name, label="说话人"
                                    )
                        btn.click(
                            tts_fn,
                            inputs=[
                                text,
                                speaker,
                                sdp_ratio,
                                noise_scale,
                                noise_scale_w,
                                length_scale,
                                language,
                                LongSentence,
                            ],
                            outputs=[audio_output],
                        )
        with gr.Tab('拓展功能'):
            with gr.Row():
                with gr.Column():
                    # NOTE(review): this text presumably contained a link
                    # whose markup is missing from the reviewed copy.
                    gr.Markdown(
                        "从 我的博客站点 查看自制galgame使用说明\n"
                    )
                    inputFile = gr.UploadButton(label="上传txt(可设置角色对应表)、epub或mobi文件")
                    groupSize = gr.Slider(
                        minimum=10,
                        maximum=1000 if torch.cuda.is_available() else 50,
                        value=50,
                        step=1,
                        label="单个音频文件包含的最大字数",
                    )
                    silenceTime = gr.Slider(
                        minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
                    )
                    filepath = gr.TextArea(
                        label="本地合成时的音频存储文件夹(会清空文件夹警告)",
                        value="D:/audiobook/book1",
                    )
                    spealerList = gr.TextArea(
                        label="角色对应表(example)",
                        placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}|{SeakerInUploadText3}\n",
                        value="ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
                    )
                    speaker = gr.Dropdown(
                        choices=speakers, value="ましろ", label="选择默认说话人"
                    )
                with gr.Column():
                    sdp_ratio = gr.Slider(
                        minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
                    )
                    noise_scale = gr.Slider(
                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
                    )
                    noise_scale_w = gr.Slider(
                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
                    )
                    length_scale = gr.Slider(
                        minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
                    )
                    LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
                    btn2 = gr.Button("点击生成", variant="primary")
                btn2.click(
                    audiobook,
                    inputs=[
                        inputFile,
                        groupSize,
                        speaker,
                        sdp_ratio,
                        noise_scale,
                        noise_scale_w,
                        length_scale,
                        spealerList,
                        silenceTime,
                        filepath,
                    ],
                    outputs=[LastAudioOutput],
                )
    print("推理页面已开启!")
    app.launch()