Spaces:
Sleeping
Sleeping
feat(demo-v1): support Chinese song with pretrained VISinger2
Browse files- .gitignore +2 -0
- app.py +185 -0
- midi-note.scp +152 -0
- pinyin_dict.py +425 -0
- requirements.txt +8 -0
- util.py +5 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
.gradio
|
app.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import gradio as gr
|
3 |
+
from pypinyin import lazy_pinyin
|
4 |
+
|
5 |
+
from pinyin_dict import PINYIN_DICT
|
6 |
+
|
7 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
8 |
+
from espnet2.fileio.read_text import read_label
|
9 |
+
from espnet2.bin.svs_inference import SingingGenerate
|
10 |
+
|
11 |
+
|
12 |
+
# Maps the singer label offered in the UI to the integer id that `gen_song`
# passes to the model as `sids`. The ids are not contiguous.
spks = {
    "singer1 (man)": 1,
    "singer2 (man)": 2,
    "singer3 (female)": 5,
    "singer4 (female)": 9,
    "singer5 (man)": 18,
    "singer6 (female)": 15,
    "singer7 (man)": 23,
    "singer8 (man)": 25,
    "singer9 (female)": 29,
    "singer10 (man)": 27,
}
|
24 |
+
|
25 |
+
# Pretrained checkpoint tag and output sampling rate for each supported language id.
_LANG_MODELS = {
    "zh": ("espnet/aceopencpop_svs_visinger2_40singer_pretrain", 44100),
}
# Sampling rate reported with the placeholder audio when the language is unknown.
_FALLBACK_FS = 44100
# Both caches are filled lazily on first use and reused by later requests.
_PITCH_TABLE = {}
_SVS_CACHE = {}


def _failure(fs, message):
    """Return the (audio, status) pair used for every rejected request."""
    return (fs, np.array([0.0])), message


def _is_blank(value):
    """Tell whether a form field is missing (None) or only whitespace."""
    return value is None or not str(value).strip()


def _load_pitch_table():
    """Read ./midi-note.scp once; map note names and MIDI numbers to ints."""
    if not _PITCH_TABLE:
        with open("./midi-note.scp", "r", encoding="utf-8") as f:
            for line in f:
                items = line.split()
                if len(items) < 2:
                    continue  # tolerate blank or truncated lines
                midi = int(items[1])
                # Accept both the note name ("C4") and the number ("60").
                _PITCH_TABLE[items[0]] = midi
                _PITCH_TABLE[items[1]] = midi
    return _PITCH_TABLE


def _get_svs(model_tag, device):
    """Download/unpack and build the inference object once per (model, device)."""
    key = (model_tag, device)
    if key not in _SVS_CACHE:
        pretrain_downloaded = ModelDownloader().download_and_unpack(model_tag)
        _SVS_CACHE[key] = SingingGenerate(
            train_config=pretrain_downloaded["train_config"],
            model_file=pretrain_downloaded["model_file"],
            device=device,
        )
    return _SVS_CACHE[key]


def gen_song(lang, tempo, texts, durs, pitchs, spk):
    """Synthesize singing from lyrics, per-syllable durations and pitches.

    Args:
        lang: Language id; only the keys of ``_LANG_MODELS`` are accepted.
        tempo: Tempo, convertible to a positive integer.
        texts: Lyrics; converted to pinyin syllables with ``lazy_pinyin``.
        durs: Whitespace-separated durations, one per syllable.
        pitchs: Whitespace-separated pitches, one per syllable, each either a
            note name or a MIDI number listed in ``midi-note.scp``.
        spk: Singer label; must be a key of ``spks``.

    Returns:
        ``((fs, waveform), status)``. On success ``status`` is ``"success!"``.
        On invalid input ``waveform`` is ``np.array([0.0])`` and ``status``
        starts with ``"Error:"``; invalid input never raises.
    """
    if lang not in _LANG_MODELS:
        return _failure(
            _FALLBACK_FS, f"Error: language `{lang}` is not supported!"
        )
    pretrain_model, fs = _LANG_MODELS[lang]

    # Validate presence before any conversion: text boxes submit "" rather
    # than None, and lazy_pinyin must not be handed a missing value.
    if _is_blank(texts):
        return _failure(fs, "Error: No Text provided!")
    if _is_blank(durs):
        return _failure(fs, "Error: No Dur provided!")
    if _is_blank(pitchs):
        return _failure(fs, "Error: No Pitch provided!")

    try:
        tempo_value = int(tempo)
    except (TypeError, ValueError):
        return _failure(fs, f"Error: tempo `{tempo}` is invalid!")
    if tempo_value <= 0:
        return _failure(fs, f"Error: tempo `{tempo}` is invalid!")

    text_list = lazy_pinyin(texts.strip())
    dur_list = durs.strip().split()
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return _failure(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _failure(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )
    if spk not in spks:
        return _failure(fs, f"Error: singer `{spk}` is invalid!")

    # Text to phonemes: each syllable becomes its phonemes joined by "_".
    # Only the "zh" front end exists, which is the only language accepted above.
    sybs = []
    for text in text_list:
        text = text.lower()
        if text not in PINYIN_DICT:
            return _failure(fs, f"Error: pinyin `{text}` is invalid!")
        sybs.append("_".join(PINYIN_DICT[text]))

    pitch_dict = _load_pitch_table()

    labels = []
    notes = []
    st = 0
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return _failure(fs, f"Error: pitch `{pitch}` is invalid!")
        try:
            length = float(dur)
        except ValueError:
            return _failure(fs, f"Error: duration `{dur}` is invalid!")
        # Written this way so that NaN and infinity are rejected as well.
        if not 0 < length < float("inf"):
            return _failure(fs, f"Error: duration `{dur}` is invalid!")
        phn_list = phns.split("_")
        # Note layout: [start, end, lyric, midi pitch, "_"-joined phonemes].
        notes.append([st, st + length, "".join(phn_list), pitch_dict[pitch], phns])
        st += length
        labels.extend(phn_list)

    batch = {
        "score": (tempo_value, notes),
        "text": " ".join(labels),
    }

    # Infer
    device = "cpu"
    svs = _get_svs(pretrain_model, device)
    output_dict = svs(batch, sids=np.array([spks[spk]]))
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"
|
105 |
+
|
106 |
+
|
107 |
+
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

# HTML shown under the title; the numbered steps follow the order of the
# input widgets (language, tempo, text/duration/pitch, singer).
description = """
This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.


<p>How to use:</p>
<ol>
<li> Choose the language ID (currently only zh is available) </li>
<li> Input tempo in integer </li>
<li> Input text, duration, pitch of equal length </li>
<li> Choose one singer </li>
<li> Click submit button </li>
</ol>


"""
|
124 |
+
|
125 |
+
# HTML footer: reference links plus the BibTeX entry for the toolkit paper.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>

<pre>
@inproceedings{wu2024muskits,
title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
booktitle={Proc. ACM Multimedia},
year={2024},
}
</pre>

</div>
"""


# Each row follows the input order of the interface:
# language, tempo, text, durations, pitches, singer.
# The three rows use the same melody with pitches written as MIDI numbers,
# sharp note names and flat note names respectively.
# SP: silence, AP: aspirate.
examples = [
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "60 62 62 62 0 62 58 0", "singer1 (man)"],
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"],
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
]
|
151 |
+
|
152 |
+
# Build the demo UI and start serving it. The inputs are listed in the same
# order as the parameters of `gen_song`.
gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=["zh"], value="zh"),
        gr.Textbox(label="Tempo"),
        gr.Textbox(label="Text"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        gr.Radio(
            label="Singer",
            # Taken from `spks` so the options always match the labels that
            # `gen_song` can resolve; dict order keeps singer1..singer10.
            choices=list(spks),
            value="singer1 (man)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
|
midi-note.scp
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
G9 127
|
2 |
+
F#9 126
|
3 |
+
Gb9 126
|
4 |
+
F9 125
|
5 |
+
E9 124
|
6 |
+
D#9 123
|
7 |
+
Eb9 123
|
8 |
+
D9 122
|
9 |
+
C#9 121
|
10 |
+
Db9 121
|
11 |
+
C9 120
|
12 |
+
B8 119
|
13 |
+
A#8 118
|
14 |
+
Bb8 118
|
15 |
+
A8 117
|
16 |
+
G#8 116
|
17 |
+
Ab8 116
|
18 |
+
G8 115
|
19 |
+
F#8 114
|
20 |
+
Gb8 114
|
21 |
+
F8 113
|
22 |
+
E8 112
|
23 |
+
D#8 111
|
24 |
+
Eb8 111
|
25 |
+
D8 110
|
26 |
+
C#8 109
|
27 |
+
Db8 109
|
28 |
+
C8 108
|
29 |
+
B7 107
|
30 |
+
A#7 106
|
31 |
+
Bb7 106
|
32 |
+
A7 105
|
33 |
+
G#7 104
|
34 |
+
Ab7 104
|
35 |
+
G7 103
|
36 |
+
F#7 102
|
37 |
+
Gb7 102
|
38 |
+
F7 101
|
39 |
+
E7 100
|
40 |
+
D#7 99
|
41 |
+
Eb7 99
|
42 |
+
D7 98
|
43 |
+
C#7 97
|
44 |
+
Db7 97
|
45 |
+
C7 96
|
46 |
+
B6 95
|
47 |
+
A#6 94
|
48 |
+
Bb6 94
|
49 |
+
A6 93
|
50 |
+
G#6 92
|
51 |
+
Ab6 92
|
52 |
+
G6 91
|
53 |
+
F#6 90
|
54 |
+
Gb6 90
|
55 |
+
F6 89
|
56 |
+
E6 88
|
57 |
+
D#6 87
|
58 |
+
Eb6 87
|
59 |
+
D6 86
|
60 |
+
C#6 85
|
61 |
+
Db6 85
|
62 |
+
C6 84
|
63 |
+
B5 83
|
64 |
+
A#5 82
|
65 |
+
Bb5 82
|
66 |
+
A5 81
|
67 |
+
G#5 80
|
68 |
+
Ab5 80
|
69 |
+
G5 79
|
70 |
+
F#5 78
|
71 |
+
Gb5 78
|
72 |
+
F5 77
|
73 |
+
E5 76
|
74 |
+
D#5 75
|
75 |
+
Eb5 75
|
76 |
+
D5 74
|
77 |
+
C#5 73
|
78 |
+
Db5 73
|
79 |
+
C5 72
|
80 |
+
B4 71
|
81 |
+
A#4 70
|
82 |
+
Bb4 70
|
83 |
+
A4 69
|
84 |
+
G#4 68
|
85 |
+
Ab4 68
|
86 |
+
G4 67
|
87 |
+
F#4 66
|
88 |
+
Gb4 66
|
89 |
+
F4 65
|
90 |
+
E4 64
|
91 |
+
D#4 63
|
92 |
+
Eb4 63
|
93 |
+
D4 62
|
94 |
+
C#4 61
|
95 |
+
Db4 61
|
96 |
+
C4 60
|
97 |
+
B3 59
|
98 |
+
A#3 58
|
99 |
+
Bb3 58
|
100 |
+
A3 57
|
101 |
+
G#3 56
|
102 |
+
Ab3 56
|
103 |
+
G3 55
|
104 |
+
F#3 54
|
105 |
+
Gb3 54
|
106 |
+
F3 53
|
107 |
+
E3 52
|
108 |
+
D#3 51
|
109 |
+
Eb3 51
|
110 |
+
D3 50
|
111 |
+
C#3 49
|
112 |
+
Db3 49
|
113 |
+
C3 48
|
114 |
+
B2 47
|
115 |
+
A#2 46
|
116 |
+
Bb2 46
|
117 |
+
A2 45
|
118 |
+
G#2 44
|
119 |
+
Ab2 44
|
120 |
+
G2 43
|
121 |
+
F#2 42
|
122 |
+
Gb2 42
|
123 |
+
F2 41
|
124 |
+
E2 40
|
125 |
+
D#2 39
|
126 |
+
Eb2 39
|
127 |
+
D2 38
|
128 |
+
C#2 37
|
129 |
+
Db2 37
|
130 |
+
C2 36
|
131 |
+
B1 35
|
132 |
+
A#1 34
|
133 |
+
Bb1 34
|
134 |
+
A1 33
|
135 |
+
G#1 32
|
136 |
+
Ab1 32
|
137 |
+
G1 31
|
138 |
+
F#1 30
|
139 |
+
Gb1 30
|
140 |
+
F1 29
|
141 |
+
E1 28
|
142 |
+
D#1 27
|
143 |
+
Eb1 27
|
144 |
+
D1 26
|
145 |
+
C#1 25
|
146 |
+
Db1 25
|
147 |
+
C1 24
|
148 |
+
B0 23
|
149 |
+
A#0 22
|
150 |
+
Bb0 22
|
151 |
+
A0 21
|
152 |
+
rest 0
|
pinyin_dict.py
ADDED
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Adapted from Opencpop's pinyin to phoneme mapping table:
|
2 |
+
# https://wenet.org.cn/opencpop/resources/annotationformat/
|
3 |
+
PINYIN_DICT = {
|
4 |
+
"a": ("a",),
|
5 |
+
"ai": ("ai",),
|
6 |
+
"an": ("an",),
|
7 |
+
"ang": ("ang",),
|
8 |
+
"ao": ("ao",),
|
9 |
+
"ba": ("b", "a"),
|
10 |
+
"bai": ("b", "ai"),
|
11 |
+
"ban": ("b", "an"),
|
12 |
+
"bang": ("b", "ang"),
|
13 |
+
"bao": ("b", "ao"),
|
14 |
+
"bei": ("b", "ei"),
|
15 |
+
"ben": ("b", "en"),
|
16 |
+
"beng": ("b", "eng"),
|
17 |
+
"bi": ("b", "i"),
|
18 |
+
"bian": ("b", "ian"),
|
19 |
+
"biao": ("b", "iao"),
|
20 |
+
"bie": ("b", "ie"),
|
21 |
+
"bin": ("b", "in"),
|
22 |
+
"bing": ("b", "ing"),
|
23 |
+
"bo": ("b", "o"),
|
24 |
+
"bu": ("b", "u"),
|
25 |
+
"ca": ("c", "a"),
|
26 |
+
"cai": ("c", "ai"),
|
27 |
+
"can": ("c", "an"),
|
28 |
+
"cang": ("c", "ang"),
|
29 |
+
"cao": ("c", "ao"),
|
30 |
+
"ce": ("c", "e"),
|
31 |
+
"cei": ("c", "ei"),
|
32 |
+
"cen": ("c", "en"),
|
33 |
+
"ceng": ("c", "eng"),
|
34 |
+
"cha": ("ch", "a"),
|
35 |
+
"chai": ("ch", "ai"),
|
36 |
+
"chan": ("ch", "an"),
|
37 |
+
"chang": ("ch", "ang"),
|
38 |
+
"chao": ("ch", "ao"),
|
39 |
+
"che": ("ch", "e"),
|
40 |
+
"chen": ("ch", "en"),
|
41 |
+
"cheng": ("ch", "eng"),
|
42 |
+
"chi": ("ch", "i"),
|
43 |
+
"chong": ("ch", "ong"),
|
44 |
+
"chou": ("ch", "ou"),
|
45 |
+
"chu": ("ch", "u"),
|
46 |
+
"chua": ("ch", "ua"),
|
47 |
+
"chuai": ("ch", "uai"),
|
48 |
+
"chuan": ("ch", "uan"),
|
49 |
+
"chuang": ("ch", "uang"),
|
50 |
+
"chui": ("ch", "ui"),
|
51 |
+
"chun": ("ch", "un"),
|
52 |
+
"chuo": ("ch", "uo"),
|
53 |
+
"ci": ("c", "i"),
|
54 |
+
"cong": ("c", "ong"),
|
55 |
+
"cou": ("c", "ou"),
|
56 |
+
"cu": ("c", "u"),
|
57 |
+
"cuan": ("c", "uan"),
|
58 |
+
"cui": ("c", "ui"),
|
59 |
+
"cun": ("c", "un"),
|
60 |
+
"cuo": ("c", "uo"),
|
61 |
+
"da": ("d", "a"),
|
62 |
+
"dai": ("d", "ai"),
|
63 |
+
"dan": ("d", "an"),
|
64 |
+
"dang": ("d", "ang"),
|
65 |
+
"dao": ("d", "ao"),
|
66 |
+
"de": ("d", "e"),
|
67 |
+
"dei": ("d", "ei"),
|
68 |
+
"den": ("d", "en"),
|
69 |
+
"deng": ("d", "eng"),
|
70 |
+
"di": ("d", "i"),
|
71 |
+
"dia": ("d", "ia"),
|
72 |
+
"dian": ("d", "ian"),
|
73 |
+
"diao": ("d", "iao"),
|
74 |
+
"die": ("d", "ie"),
|
75 |
+
"ding": ("d", "ing"),
|
76 |
+
"diu": ("d", "iu"),
|
77 |
+
"dong": ("d", "ong"),
|
78 |
+
"dou": ("d", "ou"),
|
79 |
+
"du": ("d", "u"),
|
80 |
+
"duan": ("d", "uan"),
|
81 |
+
"dui": ("d", "ui"),
|
82 |
+
"dun": ("d", "un"),
|
83 |
+
"duo": ("d", "uo"),
|
84 |
+
"e": ("e",),
|
85 |
+
"ei": ("ei",),
|
86 |
+
"en": ("en",),
|
87 |
+
"eng": ("eng",),
|
88 |
+
"er": ("er",),
|
89 |
+
"fa": ("f", "a"),
|
90 |
+
"fan": ("f", "an"),
|
91 |
+
"fang": ("f", "ang"),
|
92 |
+
"fei": ("f", "ei"),
|
93 |
+
"fen": ("f", "en"),
|
94 |
+
"feng": ("f", "eng"),
|
95 |
+
"fo": ("f", "o"),
|
96 |
+
"fou": ("f", "ou"),
|
97 |
+
"fu": ("f", "u"),
|
98 |
+
"ga": ("g", "a"),
|
99 |
+
"gai": ("g", "ai"),
|
100 |
+
"gan": ("g", "an"),
|
101 |
+
"gang": ("g", "ang"),
|
102 |
+
"gao": ("g", "ao"),
|
103 |
+
"ge": ("g", "e"),
|
104 |
+
"gei": ("g", "ei"),
|
105 |
+
"gen": ("g", "en"),
|
106 |
+
"geng": ("g", "eng"),
|
107 |
+
"gong": ("g", "ong"),
|
108 |
+
"gou": ("g", "ou"),
|
109 |
+
"gu": ("g", "u"),
|
110 |
+
"gua": ("g", "ua"),
|
111 |
+
"guai": ("g", "uai"),
|
112 |
+
"guan": ("g", "uan"),
|
113 |
+
"guang": ("g", "uang"),
|
114 |
+
"gui": ("g", "ui"),
|
115 |
+
"gun": ("g", "un"),
|
116 |
+
"guo": ("g", "uo"),
|
117 |
+
"ha": ("h", "a"),
|
118 |
+
"hai": ("h", "ai"),
|
119 |
+
"han": ("h", "an"),
|
120 |
+
"hang": ("h", "ang"),
|
121 |
+
"hao": ("h", "ao"),
|
122 |
+
"he": ("h", "e"),
|
123 |
+
"hei": ("h", "ei"),
|
124 |
+
"hen": ("h", "en"),
|
125 |
+
"heng": ("h", "eng"),
|
126 |
+
"hm": ("h", "m"),
|
127 |
+
"hng": ("h", "ng"),
|
128 |
+
"hong": ("h", "ong"),
|
129 |
+
"hou": ("h", "ou"),
|
130 |
+
"hu": ("h", "u"),
|
131 |
+
"hua": ("h", "ua"),
|
132 |
+
"huai": ("h", "uai"),
|
133 |
+
"huan": ("h", "uan"),
|
134 |
+
"huang": ("h", "uang"),
|
135 |
+
"hui": ("h", "ui"),
|
136 |
+
"hun": ("h", "un"),
|
137 |
+
"huo": ("h", "uo"),
|
138 |
+
"ji": ("j", "i"),
|
139 |
+
"jia": ("j", "ia"),
|
140 |
+
"jian": ("j", "ian"),
|
141 |
+
"jiang": ("j", "iang"),
|
142 |
+
"jiao": ("j", "iao"),
|
143 |
+
"jie": ("j", "ie"),
|
144 |
+
"jin": ("j", "in"),
|
145 |
+
"jing": ("j", "ing"),
|
146 |
+
"jiong": ("j", "iong"),
|
147 |
+
"jiu": ("j", "iu"),
|
148 |
+
"ju": ("j", "v"),
|
149 |
+
"juan": ("j", "van"),
|
150 |
+
"jue": ("j", "ve"),
|
151 |
+
"jun": ("j", "vn"),
|
152 |
+
"ka": ("k", "a"),
|
153 |
+
"kai": ("k", "ai"),
|
154 |
+
"kan": ("k", "an"),
|
155 |
+
"kang": ("k", "ang"),
|
156 |
+
"kao": ("k", "ao"),
|
157 |
+
"ke": ("k", "e"),
|
158 |
+
"kei": ("k", "ei"),
|
159 |
+
"ken": ("k", "en"),
|
160 |
+
"keng": ("k", "eng"),
|
161 |
+
"kong": ("k", "ong"),
|
162 |
+
"kou": ("k", "ou"),
|
163 |
+
"ku": ("k", "u"),
|
164 |
+
"kua": ("k", "ua"),
|
165 |
+
"kuai": ("k", "uai"),
|
166 |
+
"kuan": ("k", "uan"),
|
167 |
+
"kuang": ("k", "uang"),
|
168 |
+
"kui": ("k", "ui"),
|
169 |
+
"kun": ("k", "un"),
|
170 |
+
"kuo": ("k", "uo"),
|
171 |
+
"la": ("l", "a"),
|
172 |
+
"lai": ("l", "ai"),
|
173 |
+
"lan": ("l", "an"),
|
174 |
+
"lang": ("l", "ang"),
|
175 |
+
"lao": ("l", "ao"),
|
176 |
+
"le": ("l", "e"),
|
177 |
+
"lei": ("l", "ei"),
|
178 |
+
"leng": ("l", "eng"),
|
179 |
+
"li": ("l", "i"),
|
180 |
+
"lia": ("l", "ia"),
|
181 |
+
"lian": ("l", "ian"),
|
182 |
+
"liang": ("l", "iang"),
|
183 |
+
"liao": ("l", "iao"),
|
184 |
+
"lie": ("l", "ie"),
|
185 |
+
"lin": ("l", "in"),
|
186 |
+
"ling": ("l", "ing"),
|
187 |
+
"liu": ("l", "iu"),
|
188 |
+
"lo": ("l", "o"),
|
189 |
+
"long": ("l", "ong"),
|
190 |
+
"lou": ("l", "ou"),
|
191 |
+
"lu": ("l", "u"),
|
192 |
+
"luan": ("l", "uan"),
|
193 |
+
"lun": ("l", "un"),
|
194 |
+
"luo": ("l", "uo"),
|
195 |
+
"lv": ("l", "v"),
|
196 |
+
"lve": ("l", "ve"),
|
197 |
+
"m": ("m",),
|
198 |
+
"ma": ("m", "a"),
|
199 |
+
"mai": ("m", "ai"),
|
200 |
+
"man": ("m", "an"),
|
201 |
+
"mang": ("m", "ang"),
|
202 |
+
"mao": ("m", "ao"),
|
203 |
+
"me": ("m", "e"),
|
204 |
+
"mei": ("m", "ei"),
|
205 |
+
"men": ("m", "en"),
|
206 |
+
"meng": ("m", "eng"),
|
207 |
+
"mi": ("m", "i"),
|
208 |
+
"mian": ("m", "ian"),
|
209 |
+
"miao": ("m", "iao"),
|
210 |
+
"mie": ("m", "ie"),
|
211 |
+
"min": ("m", "in"),
|
212 |
+
"ming": ("m", "ing"),
|
213 |
+
"miu": ("m", "iu"),
|
214 |
+
"mo": ("m", "o"),
|
215 |
+
"mou": ("m", "ou"),
|
216 |
+
"mu": ("m", "u"),
|
217 |
+
"n": ("n",),
|
218 |
+
"na": ("n", "a"),
|
219 |
+
"nai": ("n", "ai"),
|
220 |
+
"nan": ("n", "an"),
|
221 |
+
"nang": ("n", "ang"),
|
222 |
+
"nao": ("n", "ao"),
|
223 |
+
"ne": ("n", "e"),
|
224 |
+
"nei": ("n", "ei"),
|
225 |
+
"nen": ("n", "en"),
|
226 |
+
"neng": ("n", "eng"),
|
227 |
+
"ng": ("n", "g"),
|
228 |
+
"ni": ("n", "i"),
|
229 |
+
"nian": ("n", "ian"),
|
230 |
+
"niang": ("n", "iang"),
|
231 |
+
"niao": ("n", "iao"),
|
232 |
+
"nie": ("n", "ie"),
|
233 |
+
"nin": ("n", "in"),
|
234 |
+
"ning": ("n", "ing"),
|
235 |
+
"niu": ("n", "iu"),
|
236 |
+
"nong": ("n", "ong"),
|
237 |
+
"nou": ("n", "ou"),
|
238 |
+
"nu": ("n", "u"),
|
239 |
+
"nuan": ("n", "uan"),
|
240 |
+
"nun": ("n", "un"),
|
241 |
+
"nuo": ("n", "uo"),
|
242 |
+
"nv": ("n", "v"),
|
243 |
+
"nve": ("n", "ve"),
|
244 |
+
"o": ("o",),
|
245 |
+
"ou": ("ou",),
|
246 |
+
"pa": ("p", "a"),
|
247 |
+
"pai": ("p", "ai"),
|
248 |
+
"pan": ("p", "an"),
|
249 |
+
"pang": ("p", "ang"),
|
250 |
+
"pao": ("p", "ao"),
|
251 |
+
"pei": ("p", "ei"),
|
252 |
+
"pen": ("p", "en"),
|
253 |
+
"peng": ("p", "eng"),
|
254 |
+
"pi": ("p", "i"),
|
255 |
+
"pian": ("p", "ian"),
|
256 |
+
"piao": ("p", "iao"),
|
257 |
+
"pie": ("p", "ie"),
|
258 |
+
"pin": ("p", "in"),
|
259 |
+
"ping": ("p", "ing"),
|
260 |
+
"po": ("p", "o"),
|
261 |
+
"pou": ("p", "ou"),
|
262 |
+
"pu": ("p", "u"),
|
263 |
+
"qi": ("q", "i"),
|
264 |
+
"qia": ("q", "ia"),
|
265 |
+
"qian": ("q", "ian"),
|
266 |
+
"qiang": ("q", "iang"),
|
267 |
+
"qiao": ("q", "iao"),
|
268 |
+
"qie": ("q", "ie"),
|
269 |
+
"qin": ("q", "in"),
|
270 |
+
"qing": ("q", "ing"),
|
271 |
+
"qiong": ("q", "iong"),
|
272 |
+
"qiu": ("q", "iu"),
|
273 |
+
"qu": ("q", "v"),
|
274 |
+
"quan": ("q", "van"),
|
275 |
+
"que": ("q", "ve"),
|
276 |
+
"qun": ("q", "vn"),
|
277 |
+
"ran": ("r", "an"),
|
278 |
+
"rang": ("r", "ang"),
|
279 |
+
"rao": ("r", "ao"),
|
280 |
+
"re": ("r", "e"),
|
281 |
+
"ren": ("r", "en"),
|
282 |
+
"reng": ("r", "eng"),
|
283 |
+
"ri": ("r", "i"),
|
284 |
+
"rong": ("r", "ong"),
|
285 |
+
"rou": ("r", "ou"),
|
286 |
+
"ru": ("r", "u"),
|
287 |
+
"rua": ("r", "ua"),
|
288 |
+
"ruan": ("r", "uan"),
|
289 |
+
"rui": ("r", "ui"),
|
290 |
+
"run": ("r", "un"),
|
291 |
+
"ruo": ("r", "uo"),
|
292 |
+
"sa": ("s", "a"),
|
293 |
+
"sai": ("s", "ai"),
|
294 |
+
"san": ("s", "an"),
|
295 |
+
"sang": ("s", "ang"),
|
296 |
+
"sao": ("s", "ao"),
|
297 |
+
"se": ("s", "e"),
|
298 |
+
"sen": ("s", "en"),
|
299 |
+
"seng": ("s", "eng"),
|
300 |
+
"sha": ("sh", "a"),
|
301 |
+
"shai": ("sh", "ai"),
|
302 |
+
"shan": ("sh", "an"),
|
303 |
+
"shang": ("sh", "ang"),
|
304 |
+
"shao": ("sh", "ao"),
|
305 |
+
"she": ("sh", "e"),
|
306 |
+
"shei": ("sh", "ei"),
|
307 |
+
"shen": ("sh", "en"),
|
308 |
+
"sheng": ("sh", "eng"),
|
309 |
+
"shi": ("sh", "i"),
|
310 |
+
"shou": ("sh", "ou"),
|
311 |
+
"shu": ("sh", "u"),
|
312 |
+
"shua": ("sh", "ua"),
|
313 |
+
"shuai": ("sh", "uai"),
|
314 |
+
"shuan": ("sh", "uan"),
|
315 |
+
"shuang": ("sh", "uang"),
|
316 |
+
"shui": ("sh", "ui"),
|
317 |
+
"shun": ("sh", "un"),
|
318 |
+
"shuo": ("sh", "uo"),
|
319 |
+
"si": ("s", "i"),
|
320 |
+
"song": ("s", "ong"),
|
321 |
+
"sou": ("s", "ou"),
|
322 |
+
"su": ("s", "u"),
|
323 |
+
"suan": ("s", "uan"),
|
324 |
+
"sui": ("s", "ui"),
|
325 |
+
"sun": ("s", "un"),
|
326 |
+
"suo": ("s", "uo"),
|
327 |
+
"ta": ("t", "a"),
|
328 |
+
"tai": ("t", "ai"),
|
329 |
+
"tan": ("t", "an"),
|
330 |
+
"tang": ("t", "ang"),
|
331 |
+
"tao": ("t", "ao"),
|
332 |
+
"te": ("t", "e"),
|
333 |
+
"tei": ("t", "ei"),
|
334 |
+
"teng": ("t", "eng"),
|
335 |
+
"ti": ("t", "i"),
|
336 |
+
"tian": ("t", "ian"),
|
337 |
+
"tiao": ("t", "iao"),
|
338 |
+
"tie": ("t", "ie"),
|
339 |
+
"ting": ("t", "ing"),
|
340 |
+
"tong": ("t", "ong"),
|
341 |
+
"tou": ("t", "ou"),
|
342 |
+
"tu": ("t", "u"),
|
343 |
+
"tuan": ("t", "uan"),
|
344 |
+
"tui": ("t", "ui"),
|
345 |
+
"tun": ("t", "un"),
|
346 |
+
"tuo": ("t", "uo"),
|
347 |
+
"wa": ("w", "a"),
|
348 |
+
"wai": ("w", "ai"),
|
349 |
+
"wan": ("w", "an"),
|
350 |
+
"wang": ("w", "ang"),
|
351 |
+
"wei": ("w", "ei"),
|
352 |
+
"wen": ("w", "en"),
|
353 |
+
"weng": ("w", "eng"),
|
354 |
+
"wo": ("w", "o"),
|
355 |
+
"wu": ("w", "u"),
|
356 |
+
"xi": ("x", "i"),
|
357 |
+
"xia": ("x", "ia"),
|
358 |
+
"xian": ("x", "ian"),
|
359 |
+
"xiang": ("x", "iang"),
|
360 |
+
"xiao": ("x", "iao"),
|
361 |
+
"xie": ("x", "ie"),
|
362 |
+
"xin": ("x", "in"),
|
363 |
+
"xing": ("x", "ing"),
|
364 |
+
"xiong": ("x", "iong"),
|
365 |
+
"xiu": ("x", "iu"),
|
366 |
+
"xu": ("x", "v"),
|
367 |
+
"xuan": ("x", "van"),
|
368 |
+
"xue": ("x", "ve"),
|
369 |
+
"xun": ("x", "vn"),
|
370 |
+
"ya": ("y", "a"),
|
371 |
+
"yan": ("y", "an"),
|
372 |
+
"yang": ("y", "ang"),
|
373 |
+
"yao": ("y", "ao"),
|
374 |
+
"ye": ("y", "e"),
|
375 |
+
"yi": ("y", "i"),
|
376 |
+
"yin": ("y", "in"),
|
377 |
+
"ying": ("y", "ing"),
|
378 |
+
"yo": ("y", "o"),
|
379 |
+
"yong": ("y", "ong"),
|
380 |
+
"you": ("y", "ou"),
|
381 |
+
"yu": ("y", "v"),
|
382 |
+
"yuan": ("y", "van"),
|
383 |
+
"yue": ("y", "ve"),
|
384 |
+
"yun": ("y", "vn"),
|
385 |
+
"za": ("z", "a"),
|
386 |
+
"zai": ("z", "ai"),
|
387 |
+
"zan": ("z", "an"),
|
388 |
+
"zang": ("z", "ang"),
|
389 |
+
"zao": ("z", "ao"),
|
390 |
+
"ze": ("z", "e"),
|
391 |
+
"zei": ("z", "ei"),
|
392 |
+
"zen": ("z", "en"),
|
393 |
+
"zeng": ("z", "eng"),
|
394 |
+
"zha": ("zh", "a"),
|
395 |
+
"zhai": ("zh", "ai"),
|
396 |
+
"zhan": ("zh", "an"),
|
397 |
+
"zhang": ("zh", "ang"),
|
398 |
+
"zhao": ("zh", "ao"),
|
399 |
+
"zhe": ("zh", "e"),
|
400 |
+
"zhei": ("zh", "ei"),
|
401 |
+
"zhen": ("zh", "en"),
|
402 |
+
"zheng": ("zh", "eng"),
|
403 |
+
"zhi": ("zh", "i"),
|
404 |
+
"zhong": ("zh", "ong"),
|
405 |
+
"zhou": ("zh", "ou"),
|
406 |
+
"zhu": ("zh", "u"),
|
407 |
+
"zhua": ("zh", "ua"),
|
408 |
+
"zhuai": ("zh", "uai"),
|
409 |
+
"zhuan": ("zh", "uan"),
|
410 |
+
"zhuang": ("zh", "uang"),
|
411 |
+
"zhui": ("zh", "ui"),
|
412 |
+
"zhun": ("zh", "un"),
|
413 |
+
"zhuo": ("zh", "uo"),
|
414 |
+
"zi": ("z", "i"),
|
415 |
+
"zong": ("z", "ong"),
|
416 |
+
"zou": ("z", "ou"),
|
417 |
+
"zu": ("z", "u"),
|
418 |
+
"zuan": ("z", "uan"),
|
419 |
+
"zui": ("z", "ui"),
|
420 |
+
"zun": ("z", "un"),
|
421 |
+
"zuo": ("z", "uo"),
|
422 |
+
"sp": ("SP",),
|
423 |
+
"ap": ("AP",),
|
424 |
+
}
|
425 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/South-Twilight/espnet
|
2 |
+
torch
|
3 |
+
numpy
|
4 |
+
librosa
|
5 |
+
espnet_model_zoo
|
6 |
+
importlib
|
7 |
+
pathlib
|
8 |
+
pypinyin
|
util.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
def split_pinyin(pinyin: str, pinyin_dict: dict[str, tuple[str, ...]]) -> tuple[str, ...]:
    """Return the phoneme tuple for one pinyin syllable.

    Args:
        pinyin: A pinyin syllable; it is lower-cased before the lookup.
        pinyin_dict: Mapping from lower-case pinyin to its phonemes,
            supplied by the caller.

    Returns:
        The tuple stored in ``pinyin_dict`` for the syllable.

    Raises:
        KeyError: If the lower-cased syllable is not in ``pinyin_dict``.
    """
    return pinyin_dict[pinyin.lower()]
|