Spaces:
Sleeping
Sleeping
Mahiruoshi
committed on
Commit
•
6c46b68
1
Parent(s):
eb4b0f8
Upload 34 files
Browse files
- app.py +71 -55
- default_config.yml +1 -1
- server.py +2 -2
app.py
CHANGED
@@ -41,8 +41,10 @@ from models import SynthesizerTrn
|
|
41 |
from text.symbols import symbols
|
42 |
import sys
|
43 |
|
|
|
|
|
44 |
net_g = None
|
45 |
-
|
46 |
device = (
|
47 |
"cuda:0"
|
48 |
if torch.cuda.is_available()
|
@@ -52,8 +54,8 @@ device = (
|
|
52 |
else "cpu"
|
53 |
)
|
54 |
)
|
55 |
-
|
56 |
-
device = "cpu"
|
57 |
BandList = {
|
58 |
"PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
|
59 |
"Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
|
@@ -82,8 +84,8 @@ def get_net_g(model_path: str, device: str, hps):
|
|
82 |
_ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
|
83 |
return net_g
|
84 |
|
85 |
-
def get_text(text, language_str, hps, device):
|
86 |
-
|
87 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
88 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
89 |
|
@@ -94,18 +96,24 @@ def get_text(text, language_str, hps, device):
|
|
94 |
for i in range(len(word2ph)):
|
95 |
word2ph[i] = word2ph[i] * 2
|
96 |
word2ph[0] += 1
|
97 |
-
bert_ori = get_bert(
|
|
|
|
|
98 |
del word2ph
|
99 |
assert bert_ori.shape[-1] == len(phone), phone
|
100 |
|
101 |
if language_str == "ZH":
|
102 |
bert = bert_ori
|
103 |
-
ja_bert = torch.
|
104 |
-
en_bert = torch.
|
105 |
elif language_str == "JP":
|
106 |
-
bert = torch.
|
107 |
ja_bert = bert_ori
|
108 |
-
en_bert = torch.
|
|
|
|
|
|
|
|
|
109 |
else:
|
110 |
raise ValueError("language_str should be ZH, JP or EN")
|
111 |
|
@@ -118,6 +126,7 @@ def get_text(text, language_str, hps, device):
|
|
118 |
language = torch.LongTensor(language)
|
119 |
return bert, ja_bert, en_bert, phone, tone, language
|
120 |
|
|
|
121 |
def infer(
|
122 |
text,
|
123 |
sdp_ratio,
|
@@ -125,18 +134,18 @@ def infer(
|
|
125 |
noise_scale_w,
|
126 |
length_scale,
|
127 |
sid,
|
128 |
-
|
129 |
-
|
130 |
):
|
131 |
|
132 |
language= 'JP' if is_japanese(text) else 'ZH'
|
133 |
-
if isinstance(reference_audio, np.ndarray):
|
134 |
-
emo = get_clap_audio_feature(reference_audio, device)
|
135 |
-
else:
|
136 |
-
emo = get_clap_text_feature(emotion, device)
|
137 |
-
emo = torch.squeeze(emo, dim=1)
|
138 |
bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
|
139 |
-
text,
|
|
|
|
|
|
|
|
|
|
|
140 |
)
|
141 |
with torch.no_grad():
|
142 |
x_tst = phones.to(device).unsqueeze(0)
|
@@ -146,7 +155,7 @@ def infer(
|
|
146 |
ja_bert = ja_bert.to(device).unsqueeze(0)
|
147 |
en_bert = en_bert.to(device).unsqueeze(0)
|
148 |
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
|
149 |
-
emo = emo.to(device).unsqueeze(0)
|
150 |
del phones
|
151 |
speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
|
152 |
audio = (
|
@@ -159,7 +168,6 @@ def infer(
|
|
159 |
bert,
|
160 |
ja_bert,
|
161 |
en_bert,
|
162 |
-
emo,
|
163 |
sdp_ratio=sdp_ratio,
|
164 |
noise_scale=noise_scale,
|
165 |
noise_scale_w=noise_scale_w,
|
@@ -169,7 +177,16 @@ def infer(
|
|
169 |
.float()
|
170 |
.numpy()
|
171 |
)
|
172 |
-
del
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
if torch.cuda.is_available():
|
174 |
torch.cuda.empty_cache()
|
175 |
return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
|
@@ -188,10 +205,10 @@ def loadmodel(model):
|
|
188 |
if __name__ == "__main__":
|
189 |
languages = [ "Auto", "ZH", "JP"]
|
190 |
modelPaths = []
|
191 |
-
for dirpath, dirnames, filenames in os.walk('Data/
|
192 |
for filename in filenames:
|
193 |
modelPaths.append(os.path.join(dirpath, filename))
|
194 |
-
hps = utils.get_hparams_from_file('Data/
|
195 |
net_g = get_net_g(
|
196 |
model_path=modelPaths[-1], device=device, hps=hps
|
197 |
)
|
@@ -199,20 +216,21 @@ if __name__ == "__main__":
|
|
199 |
speakers = list(speaker_ids.keys())
|
200 |
with gr.Blocks() as app:
|
201 |
gr.Markdown(value="""
|
202 |
-
|
203 |
-
镜像[
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
|
|
|
|
|
|
|
|
|
|
208 |
for band in BandList:
|
209 |
with gr.TabItem(band):
|
210 |
for name in BandList[band]:
|
211 |
with gr.TabItem(name):
|
212 |
-
classifiedPaths = []
|
213 |
-
for dirpath, dirnames, filenames in os.walk("Data/Bushiroad/classifedSample/"+name):
|
214 |
-
for filename in filenames:
|
215 |
-
classifiedPaths.append(os.path.join(dirpath, filename))
|
216 |
with gr.Row():
|
217 |
with gr.Column():
|
218 |
with gr.Row():
|
@@ -224,21 +242,15 @@ if __name__ == "__main__":
|
|
224 |
length_scale = gr.Slider(
|
225 |
minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
|
226 |
)
|
227 |
-
|
228 |
-
label="Text prompt",
|
229 |
-
placeholder="用文字描述生成风格。如:Happy",
|
230 |
-
value="Happy",
|
231 |
-
visible=True,
|
232 |
-
)
|
233 |
-
with gr.Accordion(label="参数设定", open=False):
|
234 |
sdp_ratio = gr.Slider(
|
235 |
-
minimum=0, maximum=1, value=0.
|
236 |
)
|
237 |
noise_scale = gr.Slider(
|
238 |
minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
|
239 |
)
|
240 |
noise_scale_w = gr.Slider(
|
241 |
-
minimum=0.1, maximum=2, value=0.
|
242 |
)
|
243 |
speaker = gr.Dropdown(
|
244 |
choices=speakers, value=name, label="说话人"
|
@@ -246,25 +258,29 @@ if __name__ == "__main__":
|
|
246 |
with gr.Accordion(label="切换模型", open=False):
|
247 |
modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
|
248 |
btnMod = gr.Button("载入模型")
|
249 |
-
statusa = gr.TextArea()
|
250 |
btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
|
251 |
with gr.Column():
|
252 |
text = gr.TextArea(
|
253 |
-
label="
|
254 |
-
|
255 |
-
value="
|
256 |
)
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
261 |
btn = gr.Button("点击生成", variant="primary")
|
262 |
audio_output = gr.Audio(label="Output Audio")
|
263 |
-
'''
|
264 |
btntran = gr.Button("快速中翻日")
|
265 |
-
translateResult = gr.TextArea("
|
266 |
btntran.click(translate, inputs=[text], outputs = [translateResult])
|
267 |
-
|
268 |
btn.click(
|
269 |
infer,
|
270 |
inputs=[
|
@@ -274,8 +290,8 @@ if __name__ == "__main__":
|
|
274 |
noise_scale_w,
|
275 |
length_scale,
|
276 |
speaker,
|
277 |
-
|
278 |
-
|
279 |
],
|
280 |
outputs=[audio_output],
|
281 |
)
|
|
|
41 |
from text.symbols import symbols
|
42 |
import sys
|
43 |
|
44 |
+
from tools.translate import translate
|
45 |
+
|
46 |
net_g = None
|
47 |
+
|
48 |
device = (
|
49 |
"cuda:0"
|
50 |
if torch.cuda.is_available()
|
|
|
54 |
else "cpu"
|
55 |
)
|
56 |
)
|
57 |
+
|
58 |
+
#device = "cpu"
|
59 |
BandList = {
|
60 |
"PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
|
61 |
"Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
|
|
|
84 |
_ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
|
85 |
return net_g
|
86 |
|
87 |
+
def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
|
88 |
+
style_text = None if style_text == "" else style_text
|
89 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
90 |
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
|
91 |
|
|
|
96 |
for i in range(len(word2ph)):
|
97 |
word2ph[i] = word2ph[i] * 2
|
98 |
word2ph[0] += 1
|
99 |
+
bert_ori = get_bert(
|
100 |
+
norm_text, word2ph, language_str, device, style_text, style_weight
|
101 |
+
)
|
102 |
del word2ph
|
103 |
assert bert_ori.shape[-1] == len(phone), phone
|
104 |
|
105 |
if language_str == "ZH":
|
106 |
bert = bert_ori
|
107 |
+
ja_bert = torch.randn(1024, len(phone))
|
108 |
+
en_bert = torch.randn(1024, len(phone))
|
109 |
elif language_str == "JP":
|
110 |
+
bert = torch.randn(1024, len(phone))
|
111 |
ja_bert = bert_ori
|
112 |
+
en_bert = torch.randn(1024, len(phone))
|
113 |
+
elif language_str == "EN":
|
114 |
+
bert = torch.randn(1024, len(phone))
|
115 |
+
ja_bert = torch.randn(1024, len(phone))
|
116 |
+
en_bert = bert_ori
|
117 |
else:
|
118 |
raise ValueError("language_str should be ZH, JP or EN")
|
119 |
|
|
|
126 |
language = torch.LongTensor(language)
|
127 |
return bert, ja_bert, en_bert, phone, tone, language
|
128 |
|
129 |
+
|
130 |
def infer(
|
131 |
text,
|
132 |
sdp_ratio,
|
|
|
134 |
noise_scale_w,
|
135 |
length_scale,
|
136 |
sid,
|
137 |
+
style_text=None,
|
138 |
+
style_weight=0.7,
|
139 |
):
|
140 |
|
141 |
language= 'JP' if is_japanese(text) else 'ZH'
|
|
|
|
|
|
|
|
|
|
|
142 |
bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
|
143 |
+
text,
|
144 |
+
language,
|
145 |
+
hps,
|
146 |
+
device,
|
147 |
+
style_text=style_text,
|
148 |
+
style_weight=style_weight,
|
149 |
)
|
150 |
with torch.no_grad():
|
151 |
x_tst = phones.to(device).unsqueeze(0)
|
|
|
155 |
ja_bert = ja_bert.to(device).unsqueeze(0)
|
156 |
en_bert = en_bert.to(device).unsqueeze(0)
|
157 |
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
|
158 |
+
# emo = emo.to(device).unsqueeze(0)
|
159 |
del phones
|
160 |
speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
|
161 |
audio = (
|
|
|
168 |
bert,
|
169 |
ja_bert,
|
170 |
en_bert,
|
|
|
171 |
sdp_ratio=sdp_ratio,
|
172 |
noise_scale=noise_scale,
|
173 |
noise_scale_w=noise_scale_w,
|
|
|
177 |
.float()
|
178 |
.numpy()
|
179 |
)
|
180 |
+
del (
|
181 |
+
x_tst,
|
182 |
+
tones,
|
183 |
+
lang_ids,
|
184 |
+
bert,
|
185 |
+
x_tst_lengths,
|
186 |
+
speakers,
|
187 |
+
ja_bert,
|
188 |
+
en_bert,
|
189 |
+
) # , emo
|
190 |
if torch.cuda.is_available():
|
191 |
torch.cuda.empty_cache()
|
192 |
return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
|
|
|
205 |
if __name__ == "__main__":
|
206 |
languages = [ "Auto", "ZH", "JP"]
|
207 |
modelPaths = []
|
208 |
+
for dirpath, dirnames, filenames in os.walk('Data/Data/V23/models/'):
|
209 |
for filename in filenames:
|
210 |
modelPaths.append(os.path.join(dirpath, filename))
|
211 |
+
hps = utils.get_hparams_from_file('Data/Data/V23/configs/config.json')
|
212 |
net_g = get_net_g(
|
213 |
model_path=modelPaths[-1], device=device, hps=hps
|
214 |
)
|
|
|
216 |
speakers = list(speaker_ids.keys())
|
217 |
with gr.Blocks() as app:
|
218 |
gr.Markdown(value="""
|
219 |
+
少歌邦邦全员在线语音合成([Bert-Vits2](https://github.com/Stardust-minus/Bert-VITS2) V2.3)\n
|
220 |
+
镜像 [V2.2](https://huggingface.co/spaces/Mahiruoshi/MyGO_VIts-bert)\n
|
221 |
+
[好玩的](http://love.soyorin.top/)\n
|
222 |
+
该界面的真实链接(国内可用): https://mahiruoshi-bangdream-bert-vits2.hf.space/\n
|
223 |
+
API: https://mahiruoshi-bert-vits2-api.hf.space/ \n
|
224 |
+
调用方式: https://mahiruoshi-bert-vits2-api.hf.space/?text=%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E6%BC%94%E5%A5%8F%E6%98%A5%E6%97%A5%E5%BD%B1&speaker=%E9%A6%99%E6%BE%84\n
|
225 |
+
推荐搭配[Legado开源阅读](https://github.com/gedoor/legado)或[聊天bot](https://github.com/Paraworks/BangDreamAi)使用\n
|
226 |
+
二创请标注作者:B站@Mahiroshi: https://space.bilibili.com/19874615\n
|
227 |
+
训练数据集归属:BangDream及少歌手游,提取自BestDori,[数据集获取流程](https://nijigaku.top/2023/09/29/Bestbushiroad%E8%AE%A1%E5%88%92-vits-%E9%9F%B3%E9%A2%91%E6%8A%93%E5%8F%96%E5%8F%8A%E6%95%B0%E6%8D%AE%E9%9B%86%E5%AF%B9%E9%BD%90/)\n
|
228 |
+
BangDream数据集下载[链接](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/%E7%88%AC%E8%99%AB/SortPathUrl.txt)\n
|
229 |
+
!!!注意:huggingface容器仅用作展示,建议在右上角更多选项中克隆本项目或Docker运行app.py/server.py,环境参考requirements.txt\n""")
|
230 |
for band in BandList:
|
231 |
with gr.TabItem(band):
|
232 |
for name in BandList[band]:
|
233 |
with gr.TabItem(name):
|
|
|
|
|
|
|
|
|
234 |
with gr.Row():
|
235 |
with gr.Column():
|
236 |
with gr.Row():
|
|
|
242 |
length_scale = gr.Slider(
|
243 |
minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
|
244 |
)
|
245 |
+
with gr.Accordion(label="参数设定", open=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
sdp_ratio = gr.Slider(
|
247 |
+
minimum=0, maximum=1, value=0.5, step=0.01, label="SDP/DP混合比"
|
248 |
)
|
249 |
noise_scale = gr.Slider(
|
250 |
minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
|
251 |
)
|
252 |
noise_scale_w = gr.Slider(
|
253 |
+
minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
|
254 |
)
|
255 |
speaker = gr.Dropdown(
|
256 |
choices=speakers, value=name, label="说话人"
|
|
|
258 |
with gr.Accordion(label="切换模型", open=False):
|
259 |
modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
|
260 |
btnMod = gr.Button("载入模型")
|
261 |
+
statusa = gr.TextArea(label = "模型加载状态")
|
262 |
btnMod.click(loadmodel, inputs=[modelstrs], outputs = [statusa])
|
263 |
with gr.Column():
|
264 |
text = gr.TextArea(
|
265 |
+
label="文本输入",
|
266 |
+
info="输入纯日语或者中文",
|
267 |
+
value="我是来结束这个乐队的。",
|
268 |
)
|
269 |
+
style_text = gr.Textbox(label="辅助文本",info="语言保持跟主文本一致",placeholder="为什么要演奏春日影!")
|
270 |
+
style_weight = gr.Slider(
|
271 |
+
minimum=0,
|
272 |
+
maximum=1,
|
273 |
+
value=0.7,
|
274 |
+
step=0.1,
|
275 |
+
label="Weight",
|
276 |
+
info="主文本和辅助文本的bert混合比率,0表示仅主文本,1表示仅辅助文本",
|
277 |
+
)
|
278 |
btn = gr.Button("点击生成", variant="primary")
|
279 |
audio_output = gr.Audio(label="Output Audio")
|
|
|
280 |
btntran = gr.Button("快速中翻日")
|
281 |
+
translateResult = gr.TextArea(label="百度翻译",value="从这里翻译后的文本")
|
282 |
btntran.click(translate, inputs=[text], outputs = [translateResult])
|
283 |
+
|
284 |
btn.click(
|
285 |
infer,
|
286 |
inputs=[
|
|
|
290 |
noise_scale_w,
|
291 |
length_scale,
|
292 |
speaker,
|
293 |
+
style_text,
|
294 |
+
style_weight,
|
295 |
],
|
296 |
outputs=[audio_output],
|
297 |
)
|
default_config.yml
CHANGED
@@ -83,7 +83,7 @@ train_ms:
|
|
83 |
base:
|
84 |
use_base_model: false
|
85 |
repo_id: "Stardust_minus/Bert-VITS2"
|
86 |
-
model_image: "Bert-VITS2_2.
|
87 |
# 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
|
88 |
model: "models"
|
89 |
# 配置文件路径
|
|
|
83 |
base:
|
84 |
use_base_model: false
|
85 |
repo_id: "Stardust_minus/Bert-VITS2"
|
86 |
+
model_image: "Bert-VITS2_2.3底模" # openi网页的模型名
|
87 |
# 训练模型存储目录:与旧版本的区别,原先数据集是存放在logs/model_name下的,现在改为统一存放在Data/你的数据集/models下
|
88 |
model: "models"
|
89 |
# 配置文件路径
|
server.py
CHANGED
@@ -331,10 +331,10 @@ def gradio_interface():
|
|
331 |
if __name__ == "__main__":
|
332 |
languages = [ "Auto", "ZH", "JP"]
|
333 |
modelPaths = []
|
334 |
-
for dirpath, dirnames, filenames in os.walk('Data/V23/models/'):
|
335 |
for filename in filenames:
|
336 |
modelPaths.append(os.path.join(dirpath, filename))
|
337 |
-
hps = utils.get_hparams_from_file('Data/V23/configs/config.json')
|
338 |
net_g = get_net_g(
|
339 |
model_path=modelPaths[-1], device=device, hps=hps
|
340 |
)
|
|
|
331 |
if __name__ == "__main__":
|
332 |
languages = [ "Auto", "ZH", "JP"]
|
333 |
modelPaths = []
|
334 |
+
for dirpath, dirnames, filenames in os.walk('Data/Data/V23/models/'):
|
335 |
for filename in filenames:
|
336 |
modelPaths.append(os.path.join(dirpath, filename))
|
337 |
+
hps = utils.get_hparams_from_file('Data/Data/V23/configs/config.json')
|
338 |
net_g = get_net_g(
|
339 |
model_path=modelPaths[-1], device=device, hps=hps
|
340 |
)
|