TangRain committed on
Commit
4513698
·
1 Parent(s): e0646f6

v2: two models

Browse files
Files changed (4) hide show
  1. app.py +194 -133
  2. requirements.txt +2 -1
  3. resource/__init__.py +0 -0
  4. util.py +28 -10
app.py CHANGED
@@ -2,39 +2,69 @@ import os
2
  import numpy as np
3
  import gradio as gr
4
  import pyopenjtalk
5
- from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
6
 
7
  from espnet_model_zoo.downloader import ModelDownloader
8
- from espnet2.fileio.read_text import read_label
9
  from espnet2.bin.svs_inference import SingingGenerate
10
 
11
 
12
  singer_embeddings = {
13
- "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
14
- "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
15
- "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
16
- "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
17
- "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
18
- "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
19
- "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
20
- "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
21
- "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
22
- "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
 
 
 
25
  langs = {
26
  "zh": 2,
27
  "jp": 1,
28
  }
29
 
30
- def gen_song(lang, texts, durs, pitchs, spk):
31
  fs = 44100
32
  tempo = 120
33
- PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"
34
- # pretrain_downloaded = {
35
- # "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
36
- # "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
37
- # }
38
  if texts is None:
39
  return (fs, np.array([0.0])), "Error: No Text provided!"
40
  if durs is None:
@@ -60,7 +90,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
60
  return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
61
 
62
  ## text to phoneme
63
- tokenizer = get_tokenizer(lang)
64
  sybs = []
65
  for text in text_list:
66
  if text == "AP" or text == "SP":
@@ -69,9 +99,9 @@ def gen_song(lang, texts, durs, pitchs, spk):
69
  rev = [text]
70
  else:
71
  rev = tokenizer(text)
72
- rev = [phn + f"@{lang}" for phn in rev]
73
  if rev == False:
74
  return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
 
75
  phns = "_".join(rev)
76
  sybs.append(phns)
77
 
@@ -105,7 +135,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
105
  ),
106
  "text": phns_str,
107
  }
108
- # print(batch)
109
  # return (fs, np.array([0.0])), "success!"
110
 
111
  # Infer
@@ -118,128 +148,159 @@ def gen_song(lang, texts, durs, pitchs, spk):
118
  model_file = pretrain_downloaded["model_file"],
119
  device = device
120
  )
121
- # sid = spks[spk]
122
- lid = langs[lang]
123
- spk_embed = np.load(singer_embeddings[spk])
124
- # output_dict = svs(batch, sids=np.array([sid]))
125
- output_dict = svs(batch, lids=np.array([lid]), spembs=spk_embed)
 
 
126
  wav_info = output_dict["wav"].cpu().numpy()
127
  return (fs, wav_info), "success!"
128
 
129
 
130
- title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
131
-
132
- description = """
133
- <div style="font-size: 20px; ">
134
- <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
135
- <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.\n
136
- Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
137
-
138
- <h1>How to use:</h1>
139
- <ol>
140
- <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
141
- <li> <b>Input lyrics</b>:
142
- <ul>
143
- <li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
144
- <li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for 'zh') can also be used. </li>
145
- <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
146
- </ul>
147
- </li>
148
- <li> <b>Input durations</b>:
149
- <ul>
150
- <li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
151
- <li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
152
- </ul>
153
- </li>
154
- <li> <b>Input pitches</b>:
155
- <ul>
156
- <li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
157
- <li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
158
- </ul>
159
- </li>
160
- <li> <b>Choose one singer</b> </li>
161
- <li> <b>Click submit button</b> </li>
162
- </ol>
163
-
164
- <h1>Notice:</h1>
165
- <ul>
166
- <li> Values outside this range may result in suboptimal generation quality! </li>
167
- </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  </div>
169
- """
170
 
171
- article = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  <div style='margin:20px auto;'>
173
 
174
  <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
175
- <a href="https://github.com/espnet/espnet">espnet GitHub</a> |
176
- <a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
177
-
178
- <pre>
179
- @inproceedings{wu2024muskits,
180
- title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
181
- author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
182
- booktitle={Proceedings of the 32st ACM International Conference on Multimedia},
183
- year={2024},
184
- }
185
- </pre>
186
 
187
  </div>
188
  """
 
189
 
190
-
191
- # SP: silence, AP: aspirate.
192
- examples = [
193
- ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
194
- ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
195
- ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest", "singer1 (male)"],
196
- ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
197
- ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
198
- ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
199
- ["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
200
- ["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
201
- ["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
202
- ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
203
- ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
204
- ["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
205
- ["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
206
- ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
207
- ["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
208
- ["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
209
- ]
210
-
211
- app = gr.Interface(
212
- fn=gen_song,
213
- inputs=[
214
- gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
215
- gr.Textbox(label="Lyrics"),
216
- gr.Textbox(label="Duration"),
217
- gr.Textbox(label="Pitch"),
218
- gr.Radio(
219
- label="Singer",
220
- choices=[
221
- "singer1 (male)",
222
- "singer2 (female)",
223
- "singer3 (male)",
224
- "singer4 (female)",
225
- "singer4 (male)",
226
- "singer6 (female)",
227
- "singer7 (male)",
228
- "singer8 (female)",
229
- "singer9 (male)",
230
- "singer10 (female)",
231
- ],
232
- value="singer1 (male)",
233
- ),
234
- ],
235
- outputs=[
236
- gr.Audio(label="Generated Song", type="numpy"),
237
- gr.Textbox(label="Running Status"),
238
- ],
239
- title=title,
240
- description=description,
241
- article=article,
242
- examples=examples,
243
- )
244
-
245
- app.launch()
 
2
  import numpy as np
3
  import gradio as gr
4
  import pyopenjtalk
5
+ from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin
6
 
7
  from espnet_model_zoo.downloader import ModelDownloader
 
8
  from espnet2.bin.svs_inference import SingingGenerate
9
 
10
 
11
  singer_embeddings = {
12
+ "Model①(Chinese)-zh": {
13
+ "singer1 (male)": 1,
14
+ "singer2 (female)": 12,
15
+ "singer3 (male)": 23,
16
+ "singer4 (female)": 29,
17
+ "singer5 (male)": 18,
18
+ "singer6 (female)": 8,
19
+ "singer7 (male)": 25,
20
+ "singer8 (female)": 5,
21
+ "singer9 (male)": 10,
22
+ "singer10 (female)": 15,
23
+ },
24
+ "Model②(Multilingual)-zh": {
25
+ "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
26
+ "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
27
+ "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
28
+ "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
29
+ "singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
30
+ "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
31
+ "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
32
+ "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
33
+ "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
34
+ "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
35
+ },
36
+ "Model②(Multilingual)-jp": {
37
+ "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
38
+ "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
39
+ "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
40
+ "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
41
+ "singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
42
+ "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
43
+ "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
44
+ "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
45
+ "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
46
+ "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
47
+ }
48
+ }
49
+
50
+ model_dict = {
51
+ "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
52
+ "Model②(Multilingual)-zh": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
53
+ "Model②(Multilingual)-jp": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
54
  }
55
 
56
+ total_singers = list(singer_embeddings["Model②(Multilingual)-zh"].keys())
57
+
58
  langs = {
59
  "zh": 2,
60
  "jp": 1,
61
  }
62
 
63
+ def gen_song(model_name, spk, texts, durs, pitchs):
64
  fs = 44100
65
  tempo = 120
66
+ lang = model_name.split("-")[-1]
67
+ PRETRAIN_MODEL = model_dict[model_name]
 
 
 
68
  if texts is None:
69
  return (fs, np.array([0.0])), "Error: No Text provided!"
70
  if durs is None:
 
90
  return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
91
 
92
  ## text to phoneme
93
+ tokenizer = get_tokenizer(model_name, lang)
94
  sybs = []
95
  for text in text_list:
96
  if text == "AP" or text == "SP":
 
99
  rev = [text]
100
  else:
101
  rev = tokenizer(text)
 
102
  if rev == False:
103
  return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
104
+ rev = postprocess_phn(rev, model_name, lang)
105
  phns = "_".join(rev)
106
  sybs.append(phns)
107
 
 
135
  ),
136
  "text": phns_str,
137
  }
138
+ print(batch)
139
  # return (fs, np.array([0.0])), "success!"
140
 
141
  # Infer
 
148
  model_file = pretrain_downloaded["model_file"],
149
  device = device
150
  )
151
+ if model_name == "Model①(Chinese)-zh":
152
+ sid = np.array([singer_embeddings[model_name][spk]])
153
+ output_dict = svs(batch, sids=sid)
154
+ else:
155
+ lid = np.array([langs[lang]])
156
+ spk_embed = np.load(singer_embeddings[model_name][spk])
157
+ output_dict = svs(batch, lids=lid, spembs=spk_embed)
158
  wav_info = output_dict["wav"].cpu().numpy()
159
  return (fs, wav_info), "success!"
160
 
161
 
162
+ # SP: silence, AP: aspirate.
163
+ examples = [
164
+ ["Model①(Chinese)-zh", "singer1 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0"],
165
+ ["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # midi note
166
+ ["Model①(Chinese)-zh", "singer3 (male)", "雨 湿 SP AP\n毁 SP AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest"], # up 1 key
167
+ ["Model①(Chinese)-zh", "singer3 (male)", "雨 湿 SP AP\n毁 SP AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # lyrics
168
+ ["Model②(Multilingual)-zh", "singer3 (male)", "你 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"],
169
+ ["Model②(Multilingual)-zh", "singer3 (male)", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # double duration
170
+ ["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # long
171
+ ["Model①(Chinese)-zh", "singer3 (male)", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0"],
172
+ ["Model①(Chinese)-zh", "singer3 (male)", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0"],
173
+ ["Model①(Chinese)-zh", "singer3 (male)", "SP 我 不 - 是 一 定 要 你 回 - 来 SP", "0.37 0.45 0.47 0.17 0.52 0.28 0.46 0.31 0.44 0.45 0.2 2.54 0.19", "0 51 60 61 59 59 57 57 59 60 61 59 0"], # slur
174
+ ["Model①(Chinese)-zh", "singer4 (female)", "AP 我 多 想 再 见 你\n哪 怕 匆 - 匆 一 AP 眼 就 别 离 AP", "0.13 0.24 0.68 0.78 0.86 0.4 0.94 0.54 0.3 0.56 0.16 0.86 0.26 0.22 0.28 0.78 0.68 1.5 0.32", "0 57 66 63 63 63 63 60 61 61 63 66 66 0 61 61 59 58 0"],
175
+ ["Model②(Multilingual)-jp", "singer8 (female)", "い て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56"],
176
+ ["Model②(Multilingual)-jp", "singer8 (female)", "い ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58"], # pitch
177
+ ["Model②(Multilingual)-jp", "singer8 (female)", "い て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56"], # double dur
178
+ ["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56"], # half dur
179
+ ["Model②(Multilingual)-jp", "singer8 (female)", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0"],
180
+ ["Model②(Multilingual)-jp", "singer8 (female)", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60"],
181
+ ["Model②(Multilingual)-jp", "singer10 (female)", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59"],
182
+ ]
183
+
184
+ with gr.Blocks() as demo:
185
+ gr.Markdown(
186
+ """
187
+ <h1 align="center"> Demo of Singing Voice Synthesis in Muskits-ESPnet </h1>
188
+
189
+ <div style="font-size: 20px;">
190
+ This is the demo page of our toolkit <a href="https://arxiv.org/abs/2409.07226"><b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b></a>.
191
+
192
+ Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
193
+
194
+ Music score usually includes lyrics, as well as the duration and pitch of each word in the lyrics.
195
+
196
+ <h2>How to use:</h2>
197
+ 1. <b>Choose Model-Language</b>:
198
+ <ul>
199
+ <li> "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
200
+ <li> For example, "Model②(Multilingual)-zh" means model "Model②(Multilingual)" with lyrics input in Chinese. </li>
201
+ </ul>
202
+
203
+ 2. <b>[Optional] Choose Singer</b>: Choose one singer you like from the drop-down list.
204
+
205
+ 3. <b>Input lyrics</b>:
206
+ <ul>
207
+ <li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
208
+ <li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for Chinese lyrics) can also be used. </li>
209
+ <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
210
+ </ul>
211
+
212
+ 4. <b>Input durations</b>:
213
+ <ul>
214
+ <li> Durations use float number as input. </li>
215
+ <li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
216
+ <li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
217
+ </ul>
218
+
219
+ 5. <b>Input pitches</b>:
220
+ <ul>
221
+ <li> Pitches use MIDI note or MIDI note number as input. Specifically, "69" in MIDI note number represents "A4" in MIDI note. </li>
222
+ <li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
223
+ <li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
224
+ </ul>
225
+
226
+ 6. <b>Hit "Generate" and listen to the result!</b>
227
+
228
  </div>
 
229
 
230
+ <h2>Notice:</h2>
231
+ <ul>
232
+ <li> Plenty of examples are provided. </li>
233
+ <li> Extreme values may result in suboptimal generation quality! </li>
234
+ </ul>
235
+ """
236
+ )
237
+ # Row-1
238
+ with gr.Row():
239
+ with gr.Column(variant="panel"):
240
+ model_name = gr.Radio(
241
+ label="Model-Language",
242
+ choices=[
243
+ "Model①(Chinese)-zh",
244
+ "Model②(Multilingual)-zh",
245
+ "Model②(Multilingual)-jp",
246
+ ],
247
+ )
248
+
249
+ with gr.Column(variant="panel"):
250
+ singer = gr.Dropdown(
251
+ label="Singer",
252
+ choices=total_singers,
253
+ )
254
+
255
+ # def set_model(model_name_str: str):
256
+ # """
257
+ # gets value from `model_name`. either
258
+ # uses cached list of speakers for the given model name
259
+ # or loads the addon and checks what are the speakers.
260
+ # """
261
+ # speakers = list(singer_embeddings[model_name_str].keys())
262
+ # value = speakers[0]
263
+ # return gr.update(
264
+ # choices=speakers, value=value, visible=True, interactive=True
265
+ # )
266
+
267
+ # model_name.change(set_model, inputs=model_name, outputs=singer)
268
+
269
+ # Row-2
270
+ with gr.Row():
271
+ with gr.Column(variant="panel"):
272
+ lyrics = gr.Textbox(label="Lyrics")
273
+ duration = gr.Textbox(label="Duration")
274
+ pitch = gr.Textbox(label="Pitch")
275
+ generate = gr.Button("Generate")
276
+ with gr.Column(variant="panel"):
277
+ gened_song = gr.Audio(label="Generated Song", type="numpy")
278
+ run_status = gr.Textbox(label="Running Status")
279
+
280
+ gr.Examples(
281
+ examples=examples,
282
+ inputs=[model_name, singer, lyrics, duration, pitch],
283
+ outputs=[singer],
284
+ label="Examples",
285
+ examples_per_page=20,
286
+ )
287
+
288
+ gr.Markdown("""
289
  <div style='margin:20px auto;'>
290
 
291
  <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
292
+ <a href="https://github.com/espnet/espnet">espnet</a> |
293
+ <a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">Model①(Chinese)</a> |
294
+ <a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">Model②(Multilingual)</a></p>
 
 
 
 
 
 
 
 
295
 
296
  </div>
297
  """
298
+ )
299
 
300
+ generate.click(
301
+ fn=gen_song,
302
+ inputs=[model_name, singer, lyrics, duration, pitch],
303
+ outputs=[gened_song, run_status],
304
+ )
305
+
306
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -7,4 +7,5 @@ importlib
7
  pathlib
8
  pypinyin
9
  torchaudio
10
- pyopenjtalk
 
 
7
  pathlib
8
  pypinyin
9
  torchaudio
10
+ pyopenjtalk
11
+ re
resource/__init__.py ADDED
File without changes
util.py CHANGED
@@ -2,11 +2,11 @@ import os
2
  import json
3
  import warnings
4
  from typing import List
5
- from pypinyin import lazy_pinyin
6
  import re
7
 
8
  import pyopenjtalk
9
-
 
10
 
11
  def preprocess_input(src_str, seg_syb=" "):
12
  src_str = src_str.replace("\n", seg_syb)
@@ -14,6 +14,12 @@ def preprocess_input(src_str, seg_syb=" "):
14
  return src_str
15
 
16
 
 
 
 
 
 
 
17
  def pyopenjtalk_g2p(text) -> List[str]:
18
  with warnings.catch_warnings(record=True) as w:
19
  warnings.simplefilter("always")
@@ -28,7 +34,7 @@ def pyopenjtalk_g2p(text) -> List[str]:
28
  return phones
29
 
30
 
31
- def split_pinyin(pinyin: str, zh_plan: dict) -> tuple[str]:
32
  # load pinyin dict from local/pinyin.dict
33
  pinyin = pinyin.lower()
34
  if pinyin in zh_plan["dict"]:
@@ -39,14 +45,26 @@ def split_pinyin(pinyin: str, zh_plan: dict) -> tuple[str]:
39
  return False
40
 
41
 
42
- def get_tokenizer(lang):
 
 
 
 
 
 
 
 
43
  if lang == "zh":
44
- with open(os.path.join("resource/all_plans.json"), "r") as f:
45
- all_plan_dict = json.load(f)
46
- for plan in all_plan_dict["plans"]:
47
- if plan["language"] == "zh":
48
- zh_plan = plan
49
- return lambda text: split_pinyin(text, zh_plan)
 
 
 
 
50
  elif lang == "jp":
51
  return pyopenjtalk_g2p
52
 
 
2
  import json
3
  import warnings
4
  from typing import List
 
5
  import re
6
 
7
  import pyopenjtalk
8
+ from resource.pinyin_dict import PINYIN_DICT
9
+ from pypinyin import lazy_pinyin
10
 
11
  def preprocess_input(src_str, seg_syb=" "):
12
  src_str = src_str.replace("\n", seg_syb)
 
14
  return src_str
15
 
16
 
17
+ def postprocess_phn(phns, model_name, lang):
18
+ if "Chinese" in model_name:
19
+ return phns
20
+ return [phn + "@" + lang for phn in phns]
21
+
22
+
23
  def pyopenjtalk_g2p(text) -> List[str]:
24
  with warnings.catch_warnings(record=True) as w:
25
  warnings.simplefilter("always")
 
34
  return phones
35
 
36
 
37
+ def split_pinyin_ace(pinyin: str, zh_plan: dict) -> tuple[str]:
38
  # load pinyin dict from local/pinyin.dict
39
  pinyin = pinyin.lower()
40
  if pinyin in zh_plan["dict"]:
 
45
  return False
46
 
47
 
48
+ def split_pinyin_py(pinyin: str) -> tuple[str]:
49
+ pinyin = pinyin.lower()
50
+ if pinyin in PINYIN_DICT:
51
+ return PINYIN_DICT[pinyin]
52
+ else:
53
+ return False
54
+
55
+
56
+ def get_tokenizer(model, lang):
57
  if lang == "zh":
58
+ if "Chinese" in model:
59
+ print("hello")
60
+ return lambda text: split_pinyin_py(text)
61
+ else:
62
+ with open(os.path.join("resource/all_plans.json"), "r") as f:
63
+ all_plan_dict = json.load(f)
64
+ for plan in all_plan_dict["plans"]:
65
+ if plan["language"] == "zh":
66
+ zh_plan = plan
67
+ return lambda text: split_pinyin_ace(text, zh_plan)
68
  elif lang == "jp":
69
  return pyopenjtalk_g2p
70