v2: two models
Browse files- app.py +194 -133
- requirements.txt +2 -1
- resource/__init__.py +0 -0
- util.py +28 -10
app.py
CHANGED
@@ -2,39 +2,69 @@ import os
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
-
from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
|
6 |
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
8 |
-
from espnet2.fileio.read_text import read_label
|
9 |
from espnet2.bin.svs_inference import SingingGenerate
|
10 |
|
11 |
|
12 |
singer_embeddings = {
|
13 |
-
"
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
|
|
|
|
|
25 |
langs = {
|
26 |
"zh": 2,
|
27 |
"jp": 1,
|
28 |
}
|
29 |
|
30 |
-
def gen_song(
|
31 |
fs = 44100
|
32 |
tempo = 120
|
33 |
-
|
34 |
-
|
35 |
-
# "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
|
36 |
-
# "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
|
37 |
-
# }
|
38 |
if texts is None:
|
39 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
40 |
if durs is None:
|
@@ -60,7 +90,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
60 |
return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
|
61 |
|
62 |
## text to phoneme
|
63 |
-
tokenizer = get_tokenizer(lang)
|
64 |
sybs = []
|
65 |
for text in text_list:
|
66 |
if text == "AP" or text == "SP":
|
@@ -69,9 +99,9 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
69 |
rev = [text]
|
70 |
else:
|
71 |
rev = tokenizer(text)
|
72 |
-
rev = [phn + f"@{lang}" for phn in rev]
|
73 |
if rev == False:
|
74 |
return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
|
|
|
75 |
phns = "_".join(rev)
|
76 |
sybs.append(phns)
|
77 |
|
@@ -105,7 +135,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
105 |
),
|
106 |
"text": phns_str,
|
107 |
}
|
108 |
-
|
109 |
# return (fs, np.array([0.0])), "success!"
|
110 |
|
111 |
# Infer
|
@@ -118,128 +148,159 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
118 |
model_file = pretrain_downloaded["model_file"],
|
119 |
device = device
|
120 |
)
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
126 |
wav_info = output_dict["wav"].cpu().numpy()
|
127 |
return (fs, wav_info), "success!"
|
128 |
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
<
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
</div>
|
169 |
-
"""
|
170 |
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
<div style='margin:20px auto;'>
|
173 |
|
174 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
175 |
-
<a href="https://github.com/espnet/espnet">espnet
|
176 |
-
<a href="https://huggingface.co/espnet/
|
177 |
-
|
178 |
-
<pre>
|
179 |
-
@inproceedings{wu2024muskits,
|
180 |
-
title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
|
181 |
-
author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
|
182 |
-
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
|
183 |
-
year={2024},
|
184 |
-
}
|
185 |
-
</pre>
|
186 |
|
187 |
</div>
|
188 |
"""
|
|
|
189 |
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
198 |
-
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
199 |
-
["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
|
200 |
-
["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
|
201 |
-
["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
|
202 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
203 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
|
204 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
205 |
-
["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
206 |
-
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
|
207 |
-
["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
|
208 |
-
["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
|
209 |
-
]
|
210 |
-
|
211 |
-
app = gr.Interface(
|
212 |
-
fn=gen_song,
|
213 |
-
inputs=[
|
214 |
-
gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
|
215 |
-
gr.Textbox(label="Lyrics"),
|
216 |
-
gr.Textbox(label="Duration"),
|
217 |
-
gr.Textbox(label="Pitch"),
|
218 |
-
gr.Radio(
|
219 |
-
label="Singer",
|
220 |
-
choices=[
|
221 |
-
"singer1 (male)",
|
222 |
-
"singer2 (female)",
|
223 |
-
"singer3 (male)",
|
224 |
-
"singer4 (female)",
|
225 |
-
"singer4 (male)",
|
226 |
-
"singer6 (female)",
|
227 |
-
"singer7 (male)",
|
228 |
-
"singer8 (female)",
|
229 |
-
"singer9 (male)",
|
230 |
-
"singer10 (female)",
|
231 |
-
],
|
232 |
-
value="singer1 (male)",
|
233 |
-
),
|
234 |
-
],
|
235 |
-
outputs=[
|
236 |
-
gr.Audio(label="Generated Song", type="numpy"),
|
237 |
-
gr.Textbox(label="Running Status"),
|
238 |
-
],
|
239 |
-
title=title,
|
240 |
-
description=description,
|
241 |
-
article=article,
|
242 |
-
examples=examples,
|
243 |
-
)
|
244 |
-
|
245 |
-
app.launch()
|
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
+
from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin
|
6 |
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
|
|
8 |
from espnet2.bin.svs_inference import SingingGenerate
|
9 |
|
10 |
|
11 |
singer_embeddings = {
|
12 |
+
"Model①(Chinese)-zh": {
|
13 |
+
"singer1 (male)": 1,
|
14 |
+
"singer2 (female)": 12,
|
15 |
+
"singer3 (male)": 23,
|
16 |
+
"singer4 (female)": 29,
|
17 |
+
"singer5 (male)": 18,
|
18 |
+
"singer6 (female)": 8,
|
19 |
+
"singer7 (male)": 25,
|
20 |
+
"singer8 (female)": 5,
|
21 |
+
"singer9 (male)": 10,
|
22 |
+
"singer10 (female)": 15,
|
23 |
+
},
|
24 |
+
"Model②(Multilingual)-zh": {
|
25 |
+
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
|
26 |
+
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
|
27 |
+
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
|
28 |
+
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
|
29 |
+
"singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
|
30 |
+
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
|
31 |
+
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
|
32 |
+
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
|
33 |
+
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
|
34 |
+
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
|
35 |
+
},
|
36 |
+
"Model②(Multilingual)-jp": {
|
37 |
+
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
|
38 |
+
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
|
39 |
+
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
|
40 |
+
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
|
41 |
+
"singer5 (male)": "resource/singer/singer_embedding_ace-7.npy",
|
42 |
+
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
|
43 |
+
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
|
44 |
+
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
|
45 |
+
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
|
46 |
+
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
|
47 |
+
}
|
48 |
+
}
|
49 |
+
|
50 |
+
model_dict = {
|
51 |
+
"Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
|
52 |
+
"Model②(Multilingual)-zh": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
53 |
+
"Model②(Multilingual)-jp": "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
54 |
}
|
55 |
|
56 |
+
total_singers = list(singer_embeddings["Model②(Multilingual)-zh"].keys())
|
57 |
+
|
58 |
langs = {
|
59 |
"zh": 2,
|
60 |
"jp": 1,
|
61 |
}
|
62 |
|
63 |
+
def gen_song(model_name, spk, texts, durs, pitchs):
|
64 |
fs = 44100
|
65 |
tempo = 120
|
66 |
+
lang = model_name.split("-")[-1]
|
67 |
+
PRETRAIN_MODEL = model_dict[model_name]
|
|
|
|
|
|
|
68 |
if texts is None:
|
69 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
70 |
if durs is None:
|
|
|
90 |
return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"
|
91 |
|
92 |
## text to phoneme
|
93 |
+
tokenizer = get_tokenizer(model_name, lang)
|
94 |
sybs = []
|
95 |
for text in text_list:
|
96 |
if text == "AP" or text == "SP":
|
|
|
99 |
rev = [text]
|
100 |
else:
|
101 |
rev = tokenizer(text)
|
|
|
102 |
if rev == False:
|
103 |
return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
|
104 |
+
rev = postprocess_phn(rev, model_name, lang)
|
105 |
phns = "_".join(rev)
|
106 |
sybs.append(phns)
|
107 |
|
|
|
135 |
),
|
136 |
"text": phns_str,
|
137 |
}
|
138 |
+
print(batch)
|
139 |
# return (fs, np.array([0.0])), "success!"
|
140 |
|
141 |
# Infer
|
|
|
148 |
model_file = pretrain_downloaded["model_file"],
|
149 |
device = device
|
150 |
)
|
151 |
+
if model_name == "Model①(Chinese)-zh":
|
152 |
+
sid = np.array([singer_embeddings[model_name][spk]])
|
153 |
+
output_dict = svs(batch, sids=sid)
|
154 |
+
else:
|
155 |
+
lid = np.array([langs[lang]])
|
156 |
+
spk_embed = np.load(singer_embeddings[model_name][spk])
|
157 |
+
output_dict = svs(batch, lids=lid, spembs=spk_embed)
|
158 |
wav_info = output_dict["wav"].cpu().numpy()
|
159 |
return (fs, wav_info), "success!"
|
160 |
|
161 |
|
162 |
+
# SP: silence, AP: aspirate.
|
163 |
+
examples = [
|
164 |
+
["Model①(Chinese)-zh", "singer1 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0"],
|
165 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # midi note
|
166 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest"], # up 1 key
|
167 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 大 地 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"], # lyrics
|
168 |
+
["Model②(Multilingual)-zh", "singer3 (male)", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"],
|
169 |
+
["Model②(Multilingual)-zh", "singer3 (male)", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # double duration
|
170 |
+
["Model①(Chinese)-zh", "singer3 (male)", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0"], # long
|
171 |
+
["Model①(Chinese)-zh", "singer3 (male)", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0"],
|
172 |
+
["Model①(Chinese)-zh", "singer3 (male)", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0"],
|
173 |
+
["Model①(Chinese)-zh", "singer3 (male)", "SP 我 不 - 是 一 定 要 你 回 - 来 SP", "0.37 0.45 0.47 0.17 0.52 0.28 0.46 0.31 0.44 0.45 0.2 2.54 0.19", "0 51 60 61 59 59 57 57 59 60 61 59 0"], # slur
|
174 |
+
["Model①(Chinese)-zh", "singer4 (female)", "AP 我 多 想 再 见 你\n哪 怕 匆 - 匆 一 AP 眼 就 别 离 AP", "0.13 0.24 0.68 0.78 0.86 0.4 0.94 0.54 0.3 0.56 0.16 0.86 0.26 0.22 0.28 0.78 0.68 1.5 0.32", "0 57 66 63 63 63 63 60 61 61 63 66 66 0 61 61 59 58 0"],
|
175 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56"],
|
176 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58"], # pitch
|
177 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56"], # double dur
|
178 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56"], # half dur
|
179 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0"],
|
180 |
+
["Model②(Multilingual)-jp", "singer8 (female)", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60"],
|
181 |
+
["Model②(Multilingual)-jp", "singer10 (female)", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59"],
|
182 |
+
]
|
183 |
+
|
184 |
+
with gr.Blocks() as demo:
|
185 |
+
gr.Markdown(
|
186 |
+
"""
|
187 |
+
<h1 align="center"> Demo of Singing Voice Synthesis in Muskits-ESPnet </h1>
|
188 |
+
|
189 |
+
<div style="font-size: 20px;">
|
190 |
+
This is the demo page of our toolkit <a href="https://arxiv.org/abs/2409.07226"><b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b></a>.
|
191 |
+
|
192 |
+
Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
193 |
+
|
194 |
+
Music score usually includes lyrics, as well as the duration and pitch of each word in the lyrics.
|
195 |
+
|
196 |
+
<h2>How to use:</h2>
|
197 |
+
1. <b>Choose Model-Language</b>:
|
198 |
+
<ul>
|
199 |
+
<li> "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
200 |
+
<li> For example, "Model②(Multilingual)-zh" means model "Model②(Multilingual)" with lyrics input in Chinese. </li>
|
201 |
+
</ul>
|
202 |
+
|
203 |
+
2. <b>[Optional] Choose Singer</b>: Choose one singer you like from the drop-down list.
|
204 |
+
|
205 |
+
3. <b>Input lyrics</b>:
|
206 |
+
<ul>
|
207 |
+
<li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
|
208 |
+
<li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for Chinese lyrics) can also be used. </li>
|
209 |
+
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
210 |
+
</ul>
|
211 |
+
|
212 |
+
4. <b>Input durations</b>:
|
213 |
+
<ul>
|
214 |
+
<li> Durations use float number as input. </li>
|
215 |
+
<li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
|
216 |
+
<li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
217 |
+
</ul>
|
218 |
+
|
219 |
+
5. <b>Input pitches</b>:
|
220 |
+
<ul>
|
221 |
+
<li> Pitches use MIDI note or MIDI note number as input. Specially, "69" in MIDI note number represents "A4" in MIDI note. </li>
|
222 |
+
<li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
|
223 |
+
<li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
224 |
+
</ul>
|
225 |
+
|
226 |
+
6. <b>Hit "Generate" and listen to the result!</b>
|
227 |
+
|
228 |
</div>
|
|
|
229 |
|
230 |
+
<h2>Notice:</h2>
|
231 |
+
<ul>
|
232 |
+
<li> Plenty of examples are provided. </li>
|
233 |
+
<li> Extreme values may result in suboptimal generation quality! </li>
|
234 |
+
</ul>
|
235 |
+
"""
|
236 |
+
)
|
237 |
+
# Row-1
|
238 |
+
with gr.Row():
|
239 |
+
with gr.Column(variant="panel"):
|
240 |
+
model_name = gr.Radio(
|
241 |
+
label="Model-Language",
|
242 |
+
choices=[
|
243 |
+
"Model①(Chinese)-zh",
|
244 |
+
"Model②(Multilingual)-zh",
|
245 |
+
"Model②(Multilingual)-jp",
|
246 |
+
],
|
247 |
+
)
|
248 |
+
|
249 |
+
with gr.Column(variant="panel"):
|
250 |
+
singer = gr.Dropdown(
|
251 |
+
label="Singer",
|
252 |
+
choices=total_singers,
|
253 |
+
)
|
254 |
+
|
255 |
+
# def set_model(model_name_str: str):
|
256 |
+
# """
|
257 |
+
# gets value from `model_name`. either
|
258 |
+
# uses cached list of speakers for the given model name
|
259 |
+
# or loads the addon and checks what are the speakers.
|
260 |
+
# """
|
261 |
+
# speakers = list(singer_embeddings[model_name_str].keys())
|
262 |
+
# value = speakers[0]
|
263 |
+
# return gr.update(
|
264 |
+
# choices=speakers, value=value, visible=True, interactive=True
|
265 |
+
# )
|
266 |
+
|
267 |
+
# model_name.change(set_model, inputs=model_name, outputs=singer)
|
268 |
+
|
269 |
+
# Row-2
|
270 |
+
with gr.Row():
|
271 |
+
with gr.Column(variant="panel"):
|
272 |
+
lyrics = gr.Textbox(label="Lyrics")
|
273 |
+
duration = gr.Textbox(label="Duration")
|
274 |
+
pitch = gr.Textbox(label="Pitch")
|
275 |
+
generate = gr.Button("Generate")
|
276 |
+
with gr.Column(variant="panel"):
|
277 |
+
gened_song = gr.Audio(label="Generated Song", type="numpy")
|
278 |
+
run_status = gr.Textbox(label="Running Status")
|
279 |
+
|
280 |
+
gr.Examples(
|
281 |
+
examples=examples,
|
282 |
+
inputs=[model_name, singer, lyrics, duration, pitch],
|
283 |
+
outputs=[singer],
|
284 |
+
label="Examples",
|
285 |
+
examples_per_page=20,
|
286 |
+
)
|
287 |
+
|
288 |
+
gr.Markdown("""
|
289 |
<div style='margin:20px auto;'>
|
290 |
|
291 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
292 |
+
<a href="https://github.com/espnet/espnet">espnet</a> |
|
293 |
+
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">Model①(Chinese)</a> |
|
294 |
+
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">Model②(Multilingual)</a></p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
</div>
|
297 |
"""
|
298 |
+
)
|
299 |
|
300 |
+
generate.click(
|
301 |
+
fn=gen_song,
|
302 |
+
inputs=[model_name, singer, lyrics, duration, pitch],
|
303 |
+
outputs=[gened_song, run_status],
|
304 |
+
)
|
305 |
+
|
306 |
+
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ importlib
|
|
7 |
pathlib
|
8 |
pypinyin
|
9 |
torchaudio
|
10 |
-
pyopenjtalk
|
|
|
|
7 |
pathlib
|
8 |
pypinyin
|
9 |
torchaudio
|
10 |
+
pyopenjtalk
|
11 |
+
re
|
resource/__init__.py
ADDED
File without changes
|
util.py
CHANGED
@@ -2,11 +2,11 @@ import os
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
5 |
-
from pypinyin import lazy_pinyin
|
6 |
import re
|
7 |
|
8 |
import pyopenjtalk
|
9 |
-
|
|
|
10 |
|
11 |
def preprocess_input(src_str, seg_syb=" "):
|
12 |
src_str = src_str.replace("\n", seg_syb)
|
@@ -14,6 +14,12 @@ def preprocess_input(src_str, seg_syb=" "):
|
|
14 |
return src_str
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def pyopenjtalk_g2p(text) -> List[str]:
|
18 |
with warnings.catch_warnings(record=True) as w:
|
19 |
warnings.simplefilter("always")
|
@@ -28,7 +34,7 @@ def pyopenjtalk_g2p(text) -> List[str]:
|
|
28 |
return phones
|
29 |
|
30 |
|
31 |
-
def
|
32 |
# load pinyin dict from local/pinyin.dict
|
33 |
pinyin = pinyin.lower()
|
34 |
if pinyin in zh_plan["dict"]:
|
@@ -39,14 +45,26 @@ def split_pinyin(pinyin: str, zh_plan: dict) -> tuple[str]:
|
|
39 |
return False
|
40 |
|
41 |
|
42 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
if lang == "zh":
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
50 |
elif lang == "jp":
|
51 |
return pyopenjtalk_g2p
|
52 |
|
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
|
|
5 |
import re
|
6 |
|
7 |
import pyopenjtalk
|
8 |
+
from resource.pinyin_dict import PINYIN_DICT
|
9 |
+
from pypinyin import lazy_pinyin
|
10 |
|
11 |
def preprocess_input(src_str, seg_syb=" "):
|
12 |
src_str = src_str.replace("\n", seg_syb)
|
|
|
14 |
return src_str
|
15 |
|
16 |
|
17 |
+
def postprocess_phn(phns, model_name, lang):
|
18 |
+
if "Chinese" in model_name:
|
19 |
+
return phns
|
20 |
+
return [phn + "@" + lang for phn in phns]
|
21 |
+
|
22 |
+
|
23 |
def pyopenjtalk_g2p(text) -> List[str]:
|
24 |
with warnings.catch_warnings(record=True) as w:
|
25 |
warnings.simplefilter("always")
|
|
|
34 |
return phones
|
35 |
|
36 |
|
37 |
+
def split_pinyin_ace(pinyin: str, zh_plan: dict) -> tuple[str]:
|
38 |
# load pinyin dict from local/pinyin.dict
|
39 |
pinyin = pinyin.lower()
|
40 |
if pinyin in zh_plan["dict"]:
|
|
|
45 |
return False
|
46 |
|
47 |
|
48 |
+
def split_pinyin_py(pinyin: str) -> tuple[str]:
|
49 |
+
pinyin = pinyin.lower()
|
50 |
+
if pinyin in PINYIN_DICT:
|
51 |
+
return PINYIN_DICT[pinyin]
|
52 |
+
else:
|
53 |
+
return False
|
54 |
+
|
55 |
+
|
56 |
+
def get_tokenizer(model, lang):
|
57 |
if lang == "zh":
|
58 |
+
if "Chinese" in model:
|
59 |
+
print("hello")
|
60 |
+
return lambda text: split_pinyin_py(text)
|
61 |
+
else:
|
62 |
+
with open(os.path.join("resource/all_plans.json"), "r") as f:
|
63 |
+
all_plan_dict = json.load(f)
|
64 |
+
for plan in all_plan_dict["plans"]:
|
65 |
+
if plan["language"] == "zh":
|
66 |
+
zh_plan = plan
|
67 |
+
return lambda text: split_pinyin_ace(text, zh_plan)
|
68 |
elif lang == "jp":
|
69 |
return pyopenjtalk_g2p
|
70 |
|