v2: two models
Browse files- app.py +194 -133
- requirements.txt +2 -1
- resource/__init__.py +0 -0
- util.py +28 -10
app.py
CHANGED
@@ -2,39 +2,69 @@ import os
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
-
from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
|
6 |
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
8 |
-
from espnet2.fileio.read_text import read_label
|
9 |
from espnet2.bin.svs_inference import SingingGenerate
|
10 |
|
11 |
|
12 |
singer_embeddings = {
|
13 |
-
"
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
|
|
|
|
|
25 |
langs = {
|
26 |
"zh": 2,
|
27 |
"jp": 1,
|
28 |
}
|
29 |
|
30 |
-
def gen_song(
|
31 |
fs = 44100
|
32 |
tempo = 120
|
33 |
-
|
34 |
-
|
35 |
-
# "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
|
36 |
-
# "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
|
37 |
-
# }
|
38 |
if texts is None:
|
39 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
40 |
if durs is None:
|
@@ -60,7 +90,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
60 |
return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
|
61 |
|
62 |
## text to phoneme
|
63 |
-
tokenizer = get_tokenizer(lang)
|
64 |
sybs = []
|
65 |
for text in text_list:
|
66 |
if text == "AP" or text == "SP":
|
@@ -69,9 +99,9 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
69 |
rev = [text]
|
70 |
else:
|
71 |
rev = tokenizer(text)
|
72 |
-
rev = [phn + f"@{lang}" for phn in rev]
|
73 |
if rev == False:
|
74 |
return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
|
|
|
75 |
phns = "_".join(rev)
|
76 |
sybs.append(phns)
|
77 |
|
@@ -105,7 +135,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
105 |
),
|
106 |
"text": phns_str,
|
107 |
}
|
108 |
-
|
109 |
# return (fs, np.array([0.0])), "success!"
|
110 |
|
111 |
# Infer
|
@@ -118,128 +148,159 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
118 |
model_file = pretrain_downloaded["model_file"],
|
119 |
device = device
|
120 |
)
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
126 |
wav_info = output_dict["wav"].cpu().numpy()
|
127 |
return (fs, wav_info), "success!"
|
128 |
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
<
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
</div>
|
169 |
-
"""
|
170 |
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
<div style='margin:20px auto;'>
|
173 |
|
174 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
175 |
-
<a href="https://github.com/espnet/espnet">espnet
|
176 |
-
<a href="https://huggingface.co/espnet/
|
177 |
-
|
178 |
-
<pre>
|
179 |
-
@inproceedings{wu2024muskits,
|
180 |
-
title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
|
181 |
-
author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
|
182 |
-
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
|
183 |
-
year={2024},
|
184 |
-
}
|
185 |
-
</pre>
|
186 |
|
187 |
</div>
|
188 |
"""
|
|
|
189 |
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
198 |
-
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
199 |
-
["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
|
200 |
-
["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
|
201 |
-
["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
|
202 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
203 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
|
204 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
205 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
206 |
-
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
|
207 |
-
["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
|
208 |
-
["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
|
209 |
-
]
|
210 |
-
|
211 |
-
app = gr.Interface(
|
212 |
-
fn=gen_song,
|
213 |
-
inputs=[
|
214 |
-
gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
|
215 |
-
gr.Textbox(label="Lyrics"),
|
216 |
-
gr.Textbox(label="Duration"),
|
217 |
-
gr.Textbox(label="Pitch"),
|
218 |
-
gr.Radio(
|
219 |
-
label="Singer",
|
220 |
-
choices=[
|
221 |
-
"singer1 (male)",
|
222 |
-
"singer2 (female)",
|
223 |
-
"singer3 (male)",
|
224 |
-
"singer4 (female)",
|
225 |
-
"singer4 (male)",
|
226 |
-
"singer6 (female)",
|
227 |
-
"singer7 (male)",
|
228 |
-
"singer8 (female)",
|
229 |
-
"singer9 (male)",
|
230 |
-
"singer10 (female)",
|
231 |
-
],
|
232 |
-
value="singer1 (male)",
|
233 |
-
),
|
234 |
-
],
|
235 |
-
outputs=[
|
236 |
-
gr.Audio(label="Generated Song", type="numpy"),
|
237 |
-
gr.Textbox(label="Running Status"),
|
238 |
-
],
|
239 |
-
title=title,
|
240 |
-
description=description,
|
241 |
-
article=article,
|
242 |
-
examples=examples,
|
243 |
-
)
|
244 |
-
|
245 |
-
app.launch()
|
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
+
from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin
|
6 |
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
|
|
8 |
from espnet2.bin.svs_inference import SingingGenerate
|
9 |
|
10 |
|
11 |
singer_embeddings = {
|
12 |
+
"Model①(Chinese)-zh": {
|
13 |
+
"singer1 (male)": 1,
|
14 |
+
"singer2 (female)": 12,
|
15 |
+
"singer3 (male)": 23,
|
16 |
+
"singer4 (female)": 29,
|
17 |
+
"singer5 (male)": 18,
|
18 |
+
"singer6 (female)": 8,
|
19 |
+
"singer7 (male)": 25,
|
20 |
+
"singer8 (female)": 5,
|
21 |
+
"singer9 (male)": 10,
|
22 |
+
"singer10 (female)": 15,
|
23 |
+
},
|
24 |
+
"Model②(Multilingual)-zh": {
|
25 |
+
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
|
26 |
+
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
|
27 |
+
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
|
28 |
+
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
|
29 |
+
"singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
|
30 |
+
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
|
31 |
+
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
|
32 |
+
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
|
33 |
+
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
|
34 |
+
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
|
35 |
+
},
|
36 |
+
"Model②(Multilingual)-jp": {
|
37 |
+
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
|
38 |
+
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
|
39 |
+
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
|
40 |
+
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
|
41 |
+
"singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
|
42 |
+
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
|
43 |
+
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
|
44 |
+
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
|
45 |
+
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
|
46 |
+
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
|
47 |
+
}
|
48 |
+
}
|
49 |
+
|
50 |
+
model_dict = {
|
51 |
+
"Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
|
52 |
+
"Model②(Multilingual)-zh": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
53 |
+
"Model②(Multilingual)-jp": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
54 |
}
|
55 |
|
56 |
+
total_singers = list(singer_embeddings["Model②(Multilingual)-zh"].keys())
|
57 |
+
|
58 |
langs = {
|
59 |
"zh": 2,
|
60 |
"jp": 1,
|
61 |
}
|
62 |
|
63 |
+
def gen_song(model_name, spk, texts, durs, pitchs):
|
64 |
fs = 44100
|
65 |
tempo = 120
|
66 |
+
lang = model_name.split("-")[-1]
|
67 |
+
PRETRAIN_MODEL = model_dict[model_name]
|
|
|
|
|
|
|
68 |
if texts is None:
|
69 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
70 |
if durs is None:
|
|
|
90 |
return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
|
91 |
|
92 |
## text to phoneme
|
93 |
+
tokenizer = get_tokenizer(model_name, lang)
|
94 |
sybs = []
|
95 |
for text in text_list:
|
96 |
if text == "AP" or text == "SP":
|
|
|
99 |
rev = [text]
|
100 |
else:
|
101 |
rev = tokenizer(text)
|
|
|
102 |
if rev == False:
|
103 |
return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
|
104 |
+
rev = postprocess_phn(rev, model_name, lang)
|
105 |
phns = "_".join(rev)
|
106 |
sybs.append(phns)
|
107 |
|
|
|
135 |
),
|
136 |
"text": phns_str,
|
137 |
}
|
138 |
+
print(batch)
|
139 |
# return (fs, np.array([0.0])), "success!"
|
140 |
|
141 |
# Infer
|
|
|
148 |
model_file = pretrain_downloaded["model_file"],
|
149 |
device = device
|
150 |
)
|
151 |
+
if model_name == "Model①(Chinese)-zh":
|
152 |
+
sid = np.array([singer_embeddings[model_name][spk]])
|
153 |
+
output_dict = svs(batch, sids=sid)
|
154 |
+
else:
|
155 |
+
lid = np.array([langs[lang]])
|
156 |
+
spk_embed = np.load(singer_embeddings[model_name][spk])
|
157 |
+
output_dict = svs(batch, lids=lid, spembs=spk_embed)
|
158 |
wav_info = output_dict["wav"].cpu().numpy()
|
159 |
return (fs, wav_info), "success!"
|
160 |
|
161 |
|
162 |
+
# SP: silence, AP: aspirate.
|
163 |
+
examples = [
|
164 |
+
["Model①(Chinese)-zh", "singer1 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0"],
|
165 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # midi note
|
166 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest"], # up 1 key
|
167 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 大 地 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # lyrics
|
168 |
+
["Model②(Multilingual)-zh", "singer3 (male)", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"],
|
169 |
+
["Model②(Multilingual)-zh", "singer3 (male)", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # double duration
|
170 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # long
|
171 |
+
["Model①(Chinese)-zh", "singer3 (male)", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0"],
|
172 |
+
["Model①(Chinese)-zh", "singer3 (male)", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0"],
|
173 |
+
["Model①(Chinese)-zh", "singer3 (male)", "SP 我 不 - 是 一 定 要 你 回 - 来 SP", "0.37 0.45 0.47 0.17 0.52 0.28 0.46 0.31 0.44 0.45 0.2 2.54 0.19", "0 51 60 61 59 59 57 57 59 60 61 59 0"], # slur
|
174 |
+
["Model①(Chinese)-zh", "singer4 (female)", "AP 我 多 想 再 见 你\n哪 怕 匆 - 匆 一 AP 眼 就 别 离 AP", "0.13 0.24 0.68 0.78 0.86 0.4 0.94 0.54 0.3 0.56 0.16 0.86 0.26 0.22 0.28 0.78 0.68 1.5 0.32", "0 57 66 63 63 63 63 60 61 61 63 66 66 0 61 61 59 58 0"],
|
175 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56"],
|
176 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58"], # pitch
|
177 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56"], # double dur
|
178 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56"], # half dur
|
179 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0"],
|
180 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60"],
|
181 |
+
["Model②(Multilingual)-jp", "singer10 (female)", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59"],
|
182 |
+
]
|
183 |
+
|
184 |
+
with gr.Blocks() as demo:
|
185 |
+
gr.Markdown(
|
186 |
+
"""
|
187 |
+
<h1 align="center"> Demo of Singing Voice Synthesis in Muskits-ESPnet </h1>
|
188 |
+
|
189 |
+
<div style="font-size: 20px;">
|
190 |
+
This is the demo page of our toolkit <a href="https://arxiv.org/abs/2409.07226"><b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b></a>.
|
191 |
+
|
192 |
+
Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
193 |
+
|
194 |
+
Music score usually includes lyrics, as well as the duration and pitch of each word in the lyrics.
|
195 |
+
|
196 |
+
<h2>How to use:</h2>
|
197 |
+
1. <b>Choose Model-Language</b>:
|
198 |
+
<ul>
|
199 |
+
<li> "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
200 |
+
<li> For example, "Model②(Multilingual)-zh" means model "Model②(Multilingual)" with lyrics input in Chinese. </li>
|
201 |
+
</ul>
|
202 |
+
|
203 |
+
2. <b>[Optional] Choose Singer</b>: Choose one singer you like from the drop-down list.
|
204 |
+
|
205 |
+
3. <b>Input lyrics</b>:
|
206 |
+
<ul>
|
207 |
+
<li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
|
208 |
+
<li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for Chinese lyrics) can also be used. </li>
|
209 |
+
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
210 |
+
</ul>
|
211 |
+
|
212 |
+
4. <b>Input durations</b>:
|
213 |
+
<ul>
|
214 |
+
<li> Durations use float number as input. </li>
|
215 |
+
<li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
|
216 |
+
<li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
217 |
+
</ul>
|
218 |
+
|
219 |
+
5. <b>Input pitches</b>:
|
220 |
+
<ul>
|
221 |
+
<li> Pitches use MIDI note or MIDI note number as input. Specially, "69" in MIDI note number represents "A4" in MIDI note. </li>
|
222 |
+
<li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
|
223 |
+
<li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
224 |
+
</ul>
|
225 |
+
|
226 |
+
6. <b>Hit "Generate" and listen to the result!</b>
|
227 |
+
|
228 |
</div>
|
|
|
229 |
|
230 |
+
<h2>Notice:</h2>
|
231 |
+
<ul>
|
232 |
+
<li> Plenty of examples are provided. </li>
|
233 |
+
<li> Extreme values may result in suboptimal generation quality! </li>
|
234 |
+
</ul>
|
235 |
+
"""
|
236 |
+
)
|
237 |
+
# Row-1
|
238 |
+
with gr.Row():
|
239 |
+
with gr.Column(variant="panel"):
|
240 |
+
model_name = gr.Radio(
|
241 |
+
label="Model-Language",
|
242 |
+
choices=[
|
243 |
+
"Model①(Chinese)-zh",
|
244 |
+
"Model②(Multilingual)-zh",
|
245 |
+
"Model②(Multilingual)-jp",
|
246 |
+
],
|
247 |
+
)
|
248 |
+
|
249 |
+
with gr.Column(variant="panel"):
|
250 |
+
singer = gr.Dropdown(
|
251 |
+
label="Singer",
|
252 |
+
choices=total_singers,
|
253 |
+
)
|
254 |
+
|
255 |
+
# def set_model(model_name_str: str):
|
256 |
+
# """
|
257 |
+
# gets value from `model_name`. either
|
258 |
+
# uses cached list of speakers for the given model name
|
259 |
+
# or loads the addon and checks what are the speakers.
|
260 |
+
# """
|
261 |
+
# speakers = list(singer_embeddings[model_name_str].keys())
|
262 |
+
# value = speakers[0]
|
263 |
+
# return gr.update(
|
264 |
+
# choices=speakers, value=value, visible=True, interactive=True
|
265 |
+
# )
|
266 |
+
|
267 |
+
# model_name.change(set_model, inputs=model_name, outputs=singer)
|
268 |
+
|
269 |
+
# Row-2
|
270 |
+
with gr.Row():
|
271 |
+
with gr.Column(variant="panel"):
|
272 |
+
lyrics = gr.Textbox(label="Lyrics")
|
273 |
+
duration = gr.Textbox(label="Duration")
|
274 |
+
pitch = gr.Textbox(label="Pitch")
|
275 |
+
generate = gr.Button("Generate")
|
276 |
+
with gr.Column(variant="panel"):
|
277 |
+
gened_song = gr.Audio(label="Generated Song", type="numpy")
|
278 |
+
run_status = gr.Textbox(label="Running Status")
|
279 |
+
|
280 |
+
gr.Examples(
|
281 |
+
examples=examples,
|
282 |
+
inputs=[model_name, singer, lyrics, duration, pitch],
|
283 |
+
outputs=[singer],
|
284 |
+
label="Examples",
|
285 |
+
examples_per_page=20,
|
286 |
+
)
|
287 |
+
|
288 |
+
gr.Markdown("""
|
289 |
<div style='margin:20px auto;'>
|
290 |
|
291 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
292 |
+
<a href="https://github.com/espnet/espnet">espnet</a> |
|
293 |
+
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">Model①(Chinese)</a> |
|
294 |
+
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">Model②(Multilingual)</a></p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
</div>
|
297 |
"""
|
298 |
+
)
|
299 |
|
300 |
+
generate.click(
|
301 |
+
fn=gen_song,
|
302 |
+
inputs=[model_name, singer, lyrics, duration, pitch],
|
303 |
+
outputs=[gened_song, run_status],
|
304 |
+
)
|
305 |
+
|
306 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ importlib
|
|
7 |
pathlib
|
8 |
pypinyin
|
9 |
torchaudio
|
10 |
-
pyopenjtalk
|
|
|
|
7 |
pathlib
|
8 |
pypinyin
|
9 |
torchaudio
|
10 |
+
pyopenjtalk
|
11 |
+
re
|
resource/__init__.py
ADDED
File without changes
|
util.py
CHANGED
@@ -2,11 +2,11 @@ import os
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
5 |
-
from pypinyin import lazy_pinyin
|
6 |
import re
|
7 |
|
8 |
import pyopenjtalk
|
9 |
-
|
|
|
10 |
|
11 |
def preprocess_input(src_str, seg_syb=" "):
|
12 |
src_str = src_str.replace("\n", seg_syb)
|
@@ -14,6 +14,12 @@ def preprocess_input(src_str, seg_syb=" "):
|
|
14 |
return src_str
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def pyopenjtalk_g2p(text) -> List[str]:
|
18 |
with warnings.catch_warnings(record=True) as w:
|
19 |
warnings.simplefilter("always")
|
@@ -28,7 +34,7 @@ def pyopenjtalk_g2p(text) -> List[str]:
|
|
28 |
return phones
|
29 |
|
30 |
|
31 |
-
def
|
32 |
# load pinyin dict from local/pinyin.dict
|
33 |
pinyin = pinyin.lower()
|
34 |
if pinyin in zh_plan["dict"]:
|
@@ -39,14 +45,26 @@ def split_pinyin(pinyin: str, zh_plan: dict) -> tuple[str]:
|
|
39 |
return False
|
40 |
|
41 |
|
42 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
if lang == "zh":
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
50 |
elif lang == "jp":
|
51 |
return pyopenjtalk_g2p
|
52 |
|
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
|
|
5 |
import re
|
6 |
|
7 |
import pyopenjtalk
|
8 |
+
from resource.pinyin_dict import PINYIN_DICT
|
9 |
+
from pypinyin import lazy_pinyin
|
10 |
|
11 |
def preprocess_input(src_str, seg_syb=" "):
|
12 |
src_str = src_str.replace("\n", seg_syb)
|
|
|
14 |
return src_str
|
15 |
|
16 |
|
17 |
+
def postprocess_phn(phns, model_name, lang):
|
18 |
+
if "Chinese" in model_name:
|
19 |
+
return phns
|
20 |
+
return [phn + "@" + lang for phn in phns]
|
21 |
+
|
22 |
+
|
23 |
def pyopenjtalk_g2p(text) -> List[str]:
|
24 |
with warnings.catch_warnings(record=True) as w:
|
25 |
warnings.simplefilter("always")
|
|
|
34 |
return phones
|
35 |
|
36 |
|
37 |
+
def split_pinyin_ace(pinyin: str, zh_plan: dict) -> tuple[str]:
|
38 |
# load pinyin dict from local/pinyin.dict
|
39 |
pinyin = pinyin.lower()
|
40 |
if pinyin in zh_plan["dict"]:
|
|
|
45 |
return False
|
46 |
|
47 |
|
48 |
+
def split_pinyin_py(pinyin: str) -> tuple[str]:
|
49 |
+
pinyin = pinyin.lower()
|
50 |
+
if pinyin in PINYIN_DICT:
|
51 |
+
return PINYIN_DICT[pinyin]
|
52 |
+
else:
|
53 |
+
return False
|
54 |
+
|
55 |
+
|
56 |
+
def get_tokenizer(model, lang):
|
57 |
if lang == "zh":
|
58 |
+
if "Chinese" in model:
|
59 |
+
print("hello")
|
60 |
+
return lambda text: split_pinyin_py(text)
|
61 |
+
else:
|
62 |
+
with open(os.path.join("resource/all_plans.json"), "r") as f:
|
63 |
+
all_plan_dict = json.load(f)
|
64 |
+
for plan in all_plan_dict["plans"]:
|
65 |
+
if plan["language"] == "zh":
|
66 |
+
zh_plan = plan
|
67 |
+
return lambda text: split_pinyin_ace(text, zh_plan)
|
68 |
elif lang == "jp":
|
69 |
return pyopenjtalk_g2p
|
70 |
|