CaiRou-Huang committed on
Commit
5d904b0
1 Parent(s): 448c16f

Upload 17 files

configs/config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "model_name": "your_model_name",
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 42,
+     "epochs": 1000,
+     "learning_rate": 0.0002,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 4,
+     "bf16_run": true,
+     "lr_decay": 0.99995,
+     "segment_size": 16384,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "skip_optimizer": false,
+     "freeze_ZH_bert": false,
+     "freeze_JP_bert": false,
+     "freeze_EN_bert": false,
+     "freeze_style": false
+   },
+   "data": {
+     "training_files": "Data/your_model_name/filelists/train.list",
+     "validation_files": "Data/your_model_name/filelists/val.list",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 1,
+     "cleaned_text": true,
+     "num_styles": 1,
+     "style2id": {
+       "Neutral": 0
+     }
+   },
+   "model": {
+     "use_spk_conditioned_encoder": true,
+     "use_noise_scaled_mas": true,
+     "use_mel_posterior_encoder": false,
+     "use_duration_discriminator": true,
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 8, 2, 2],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256
+   },
+   "version": "2.0.1"
+ }
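
A few relationships between the train, data, and model blocks are worth checking: the decoder's upsample_rates must multiply back to data.hop_length (one spectrogram frame per hop), and train.segment_size is counted in audio samples. A minimal sanity-check sketch, assuming the file sits at configs/config.json relative to the working directory:

import json
import math

with open("configs/config.json", encoding="utf-8") as f:
    cfg = json.load(f)

data, model = cfg["data"], cfg["model"]

# The decoder upsamples one frame back to hop_length samples,
# so the upsample rates should multiply to hop_length.
assert math.prod(model["upsample_rates"]) == data["hop_length"]  # 8*8*2*2*2 == 512

frame_rate = data["sampling_rate"] / data["hop_length"]                  # ~86.1 frames/s
frames_per_segment = cfg["train"]["segment_size"] // data["hop_length"]  # 32 frames
print(frame_rate, frames_per_segment)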
configs/configs_jp_extra.json ADDED
@@ -0,0 +1,78 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 42,
+     "epochs": 1000,
+     "learning_rate": 0.0001,
+     "betas": [0.8, 0.99],
+     "eps": 1e-9,
+     "batch_size": 24,
+     "bf16_run": false,
+     "fp16_run": false,
+     "lr_decay": 0.99996,
+     "segment_size": 16384,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "c_commit": 100,
+     "skip_optimizer": true,
+     "freeze_ZH_bert": false,
+     "freeze_JP_bert": false,
+     "freeze_EN_bert": false,
+     "freeze_emo": false,
+     "freeze_style": false
+   },
+   "data": {
+     "use_jp_extra": true,
+     "training_files": "filelists/train.list",
+     "validation_files": "filelists/val.list",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 128,
+     "mel_fmin": 0.0,
+     "mel_fmax": null,
+     "add_blank": true,
+     "n_speakers": 512,
+     "cleaned_text": true
+   },
+   "model": {
+     "use_spk_conditioned_encoder": true,
+     "use_noise_scaled_mas": true,
+     "use_mel_posterior_encoder": false,
+     "use_duration_discriminator": false,
+     "use_wavlm_discriminator": true,
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 8, 2, 2],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 512,
+     "slm": {
+       "model": "./slm/wavlm-base-plus",
+       "sr": 16000,
+       "hidden": 768,
+       "nlayers": 13,
+       "initial_channel": 64
+     }
+   },
+   "version": "2.0.1-JP-Extra"
+ }
configs/paths.yml ADDED
@@ -0,0 +1,8 @@
+ # Root directory of the training dataset.
+ # The training dataset of {model_name} should be placed in {dataset_root}/{model_name}.
+ dataset_root: Data
+
+ # Root directory of the model assets (for inference).
+ # In training, the model assets will be saved to {assets_root}/{model_name},
+ # and in inference, we load all the models from {assets_root}.
+ assets_root: model_assets
text/__init__.py ADDED
@@ -0,0 +1,32 @@
+ from text.symbols import *
+
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
+
+ def cleaned_text_to_sequence(cleaned_text, tones, language):
+     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     """
+     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     tone_start = language_tone_start_map[language]
+     tones = [i + tone_start for i in tones]
+     lang_id = language_id_map[language]
+     lang_ids = [lang_id for i in phones]
+     return phones, tones, lang_ids
+
+
+ def get_bert(
+     norm_text, word2ph, language, device, assist_text=None, assist_text_weight=0.7
+ ):
+     from .chinese_bert import get_bert_feature as zh_bert
+     from .english_bert_mock import get_bert_feature as en_bert
+     from .japanese_bert import get_bert_feature as jp_bert
+
+     lang_bert_func_map = {"ZH": zh_bert, "EN": en_bert, "JP": jp_bert}
+     bert = lang_bert_func_map[language](
+         norm_text, word2ph, device, assist_text, assist_text_weight
+     )
+     return bert
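
cleaned_text_to_sequence maps each cleaned phoneme symbol to its integer ID, shifts the tones by a per-language offset, and tags every phone with the language ID. A toy, self-contained illustration of that mapping; the symbol list, tone offsets, and language IDs below are made up for the example and are not the real tables in text.symbols:

# Hypothetical tables, for illustration only (not the real text.symbols).
symbols = ["_", "a", "i", "k", "o", "N"]
language_tone_start_map = {"ZH": 0, "JP": 6, "EN": 16}
language_id_map = {"ZH": 0, "JP": 1, "EN": 2}

_symbol_to_id = {s: i for i, s in enumerate(symbols)}


def toy_cleaned_text_to_sequence(cleaned_text, tones, language):
    phones = [_symbol_to_id[s] for s in cleaned_text]
    tones = [t + language_tone_start_map[language] for t in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, tones, lang_ids


print(toy_cleaned_text_to_sequence(["k", "o", "N"], [0, 1, 1], "JP"))
# -> ([3, 4, 5], [6, 7, 7], [1, 1, 1])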
text/chinese.py ADDED
@@ -0,0 +1,199 @@
+ import os
+ import re
+
+ import cn2an
+ from pypinyin import lazy_pinyin, Style
+
+ from text.symbols import punctuation
+ from text.tone_sandhi import ToneSandhi
+
+ current_file_path = os.path.dirname(__file__)
+ pinyin_to_symbol_map = {
+     line.split("\t")[0]: line.strip().split("\t")[1]
+     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
+ }
+
+ import jieba.posseg as psg
+
+
+ rep_map = {
+     ":": ",",
+     ";": ",",
+     ",": ",",
+     "。": ".",
+     "!": "!",
+     "?": "?",
+     "\n": ".",
+     "·": ",",
+     "、": ",",
+     "...": "…",
+     "$": ".",
+     "“": "'",
+     "”": "'",
+     '"': "'",
+     "‘": "'",
+     "’": "'",
+     "(": "'",
+     ")": "'",
+     "(": "'",
+     ")": "'",
+     "《": "'",
+     "》": "'",
+     "【": "'",
+     "】": "'",
+     "[": "'",
+     "]": "'",
+     "—": "-",
+     "~": "-",
+     "~": "-",
+     "「": "'",
+     "」": "'",
+ }
+
+ tone_modifier = ToneSandhi()
+
+
+ def replace_punctuation(text):
+     text = text.replace("嗯", "恩").replace("呣", "母")
+     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+     replaced_text = re.sub(
+         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+     )
+
+     return replaced_text
+
+
+ def g2p(text):
+     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
+     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
+     phones, tones, word2ph = _g2p(sentences)
+     assert sum(word2ph) == len(phones)
+     assert len(word2ph) == len(text)  # Sometimes this crashes; you can add a try-catch.
+     phones = ["_"] + phones + ["_"]
+     tones = [0] + tones + [0]
+     word2ph = [1] + word2ph + [1]
+     return phones, tones, word2ph
+
+
+ def _get_initials_finals(word):
+     initials = []
+     finals = []
+     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
+     orig_finals = lazy_pinyin(
+         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
+     )
+     for c, v in zip(orig_initials, orig_finals):
+         initials.append(c)
+         finals.append(v)
+     return initials, finals
+
+
+ def _g2p(segments):
+     phones_list = []
+     tones_list = []
+     word2ph = []
+     for seg in segments:
+         # Replace all English words in the sentence
+         seg = re.sub("[a-zA-Z]+", "", seg)
+         seg_cut = psg.lcut(seg)
+         initials = []
+         finals = []
+         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
+         for word, pos in seg_cut:
+             if pos == "eng":
+                 continue
+             sub_initials, sub_finals = _get_initials_finals(word)
+             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
+             initials.append(sub_initials)
+             finals.append(sub_finals)
+
+             # assert len(sub_initials) == len(sub_finals) == len(word)
+         initials = sum(initials, [])
+         finals = sum(finals, [])
+         #
+         for c, v in zip(initials, finals):
+             raw_pinyin = c + v
+             # NOTE: post process for pypinyin outputs
+             # we discriminate i, ii and iii
+             if c == v:
+                 assert c in punctuation
+                 phone = [c]
+                 tone = "0"
+                 word2ph.append(1)
+             else:
+                 v_without_tone = v[:-1]
+                 tone = v[-1]
+
+                 pinyin = c + v_without_tone
+                 assert tone in "12345"
+
+                 if c:
+                     # Multi-phoneme syllable (has an initial)
+                     v_rep_map = {
+                         "uei": "ui",
+                         "iou": "iu",
+                         "uen": "un",
+                     }
+                     if v_without_tone in v_rep_map.keys():
+                         pinyin = c + v_rep_map[v_without_tone]
+                 else:
+                     # Single-phoneme syllable (no initial)
+                     pinyin_rep_map = {
+                         "ing": "ying",
+                         "i": "yi",
+                         "in": "yin",
+                         "u": "wu",
+                     }
+                     if pinyin in pinyin_rep_map.keys():
+                         pinyin = pinyin_rep_map[pinyin]
+                     else:
+                         single_rep_map = {
+                             "v": "yu",
+                             "e": "e",
+                             "i": "y",
+                             "u": "w",
+                         }
+                         if pinyin[0] in single_rep_map.keys():
+                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
+
+                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
+                 phone = pinyin_to_symbol_map[pinyin].split(" ")
+                 word2ph.append(len(phone))
+
+             phones_list += phone
+             tones_list += [int(tone)] * len(phone)
+     return phones_list, tones_list, word2ph
+
+
+ def text_normalize(text):
+     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
+     for number in numbers:
+         text = text.replace(number, cn2an.an2cn(number), 1)
+     text = replace_punctuation(text)
+     return text
+
+
+ def get_bert_feature(text, word2ph):
+     from text import chinese_bert
+
+     return chinese_bert.get_bert_feature(text, word2ph)
+
+
+ if __name__ == "__main__":
+     from text.chinese_bert import get_bert_feature
+
+     text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
+     text = text_normalize(text)
+     print(text)
+     phones, tones, word2ph = g2p(text)
+     bert = get_bert_feature(text, word2ph)
+
+     print(phones, tones, word2ph, bert.shape)
+
+
+ # # Example usage
+ # text = "这是一个示例文本:,你好!这是一个测试...."
+ # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
text/chinese_bert.py ADDED
@@ -0,0 +1,121 @@
+ import sys
+
+ import torch
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+ from config import config
+
+ LOCAL_PATH = "./bert/chinese-roberta-wwm-ext-large"
+
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
+
+ models = dict()
+
+
+ def get_bert_feature(
+     text,
+     word2ph,
+     device=config.bert_gen_config.device,
+     assist_text=None,
+     assist_text_weight=0.7,
+ ):
+     if (
+         sys.platform == "darwin"
+         and torch.backends.mps.is_available()
+         and device == "cpu"
+     ):
+         device = "mps"
+     if not device:
+         device = "cuda"
+     if device == "cuda" and not torch.cuda.is_available():
+         device = "cpu"
+     if device not in models.keys():
+         models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt")
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)
+         res = models[device](**inputs, output_hidden_states=True)
+         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+         if assist_text:
+             style_inputs = tokenizer(assist_text, return_tensors="pt")
+             for i in style_inputs:
+                 style_inputs[i] = style_inputs[i].to(device)
+             style_res = models[device](**style_inputs, output_hidden_states=True)
+             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+             style_res_mean = style_res.mean(0)
+     assert len(word2ph) == len(text) + 2
+     word2phone = word2ph
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         if assist_text:
+             repeat_feature = (
+                 res[i].repeat(word2phone[i], 1) * (1 - assist_text_weight)
+                 + style_res_mean.repeat(word2phone[i], 1) * assist_text_weight
+             )
+         else:
+             repeat_feature = res[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+
+     return phone_level_feature.T
+
+
+ if __name__ == "__main__":
+     word_level_feature = torch.rand(38, 1024)  # one 1024-dim feature per word
+     word2phone = [
+         1,
+         2,
+         1,
+         2,
+         2,
+         1,
+         2,
+         2,
+         1,
+         2,
+         2,
+         1,
+         2,
+         2,
+         2,
+         2,
+         2,
+         1,
+         1,
+         2,
+         2,
+         1,
+         2,
+         2,
+         2,
+         2,
+         1,
+         2,
+         2,
+         2,
+         2,
+         2,
+         1,
+         2,
+         2,
+         2,
+         2,
+         1,
+     ]
+
+     # Compute the total number of frames
+     total_frames = sum(word2phone)
+     print(word_level_feature.shape)
+     print(word2phone)
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         print(word_level_feature[i].shape)
+
+         # Repeat each word's feature word2phone[i] times
+         repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+     print(phone_level_feature.shape)  # torch.Size([36, 1024])
text/cleaner.py ADDED
@@ -0,0 +1,31 @@
+ from text import chinese, japanese, english, cleaned_text_to_sequence
+
+
+ language_module_map = {"ZH": chinese, "JP": japanese, "EN": english}
+
+
+ def clean_text(text, language, use_jp_extra=True):
+     language_module = language_module_map[language]
+     norm_text = language_module.text_normalize(text)
+     if language == "JP":
+         phones, tones, word2ph = language_module.g2p(norm_text, use_jp_extra)
+     else:
+         phones, tones, word2ph = language_module.g2p(norm_text)
+     return norm_text, phones, tones, word2ph
+
+
+ def clean_text_bert(text, language):
+     language_module = language_module_map[language]
+     norm_text = language_module.text_normalize(text)
+     phones, tones, word2ph = language_module.g2p(norm_text)
+     bert = language_module.get_bert_feature(norm_text, word2ph)
+     return phones, tones, bert
+
+
+ def text_to_sequence(text, language):
+     norm_text, phones, tones, word2ph = clean_text(text, language)
+     return cleaned_text_to_sequence(phones, tones, language)
+
+
+ if __name__ == "__main__":
+     pass
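
A usage sketch for clean_text; it only runs inside this repository with the local ./bert/* models and dictionaries in place, since the language modules load tokenizers and dictionaries at import time:

from text.cleaner import clean_text

norm_text, phones, tones, word2ph = clean_text("こんにちは、世界!", "JP", use_jp_extra=True)
print(norm_text)  # normalized text
print(phones)     # phoneme symbols, wrapped in leading/trailing "_"
print(tones)      # accent values (0/1 for Japanese), same length as phones
print(word2ph)    # phones assigned per character of the normalized text (plus the wrapping entries)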
text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
+ size 6212655
text/english.py ADDED
@@ -0,0 +1,495 @@
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+ from transformers import DebertaV2Tokenizer
6
+
7
+ from text import symbols
8
+ from text.symbols import punctuation
9
+
10
+ current_file_path = os.path.dirname(__file__)
11
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
12
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
13
+ _g2p = G2p()
14
+ LOCAL_PATH = "./bert/deberta-v3-large"
15
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
16
+
17
+ arpa = {
18
+ "AH0",
19
+ "S",
20
+ "AH1",
21
+ "EY2",
22
+ "AE2",
23
+ "EH0",
24
+ "OW2",
25
+ "UH0",
26
+ "NG",
27
+ "B",
28
+ "G",
29
+ "AY0",
30
+ "M",
31
+ "AA0",
32
+ "F",
33
+ "AO0",
34
+ "ER2",
35
+ "UH1",
36
+ "IY1",
37
+ "AH2",
38
+ "DH",
39
+ "IY0",
40
+ "EY1",
41
+ "IH0",
42
+ "K",
43
+ "N",
44
+ "W",
45
+ "IY2",
46
+ "T",
47
+ "AA1",
48
+ "ER1",
49
+ "EH2",
50
+ "OY0",
51
+ "UH2",
52
+ "UW1",
53
+ "Z",
54
+ "AW2",
55
+ "AW1",
56
+ "V",
57
+ "UW2",
58
+ "AA2",
59
+ "ER",
60
+ "AW0",
61
+ "UW0",
62
+ "R",
63
+ "OW1",
64
+ "EH1",
65
+ "ZH",
66
+ "AE0",
67
+ "IH2",
68
+ "IH",
69
+ "Y",
70
+ "JH",
71
+ "P",
72
+ "AY1",
73
+ "EY0",
74
+ "OY2",
75
+ "TH",
76
+ "HH",
77
+ "D",
78
+ "ER0",
79
+ "CH",
80
+ "AO1",
81
+ "AE1",
82
+ "AO2",
83
+ "OY1",
84
+ "AY2",
85
+ "IH1",
86
+ "OW0",
87
+ "L",
88
+ "SH",
89
+ }
90
+
91
+
92
+ def post_replace_ph(ph):
93
+ rep_map = {
94
+ ":": ",",
95
+ ";": ",",
96
+ ",": ",",
97
+ "。": ".",
98
+ "!": "!",
99
+ "?": "?",
100
+ "\n": ".",
101
+ "·": ",",
102
+ "、": ",",
103
+ "…": "...",
104
+ "···": "...",
105
+ "・・・": "...",
106
+ "v": "V",
107
+ }
108
+ if ph in rep_map.keys():
109
+ ph = rep_map[ph]
110
+ if ph in symbols:
111
+ return ph
112
+ if ph not in symbols:
113
+ ph = "UNK"
114
+ return ph
115
+
116
+
117
+ rep_map = {
118
+ ":": ",",
119
+ ";": ",",
120
+ ",": ",",
121
+ "。": ".",
122
+ "!": "!",
123
+ "?": "?",
124
+ "\n": ".",
125
+ ".": ".",
126
+ "…": "...",
127
+ "···": "...",
128
+ "・・・": "...",
129
+ "·": ",",
130
+ "・": ",",
131
+ "、": ",",
132
+ "$": ".",
133
+ "“": "'",
134
+ "”": "'",
135
+ '"': "'",
136
+ "‘": "'",
137
+ "’": "'",
138
+ "(": "'",
139
+ ")": "'",
140
+ "(": "'",
141
+ ")": "'",
142
+ "《": "'",
143
+ "》": "'",
144
+ "【": "'",
145
+ "】": "'",
146
+ "[": "'",
147
+ "]": "'",
148
+ "—": "-",
149
+ "−": "-",
150
+ "~": "-",
151
+ "~": "-",
152
+ "「": "'",
153
+ "」": "'",
154
+ }
155
+
156
+
157
+ def replace_punctuation(text):
158
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
159
+
160
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
161
+
162
+ # replaced_text = re.sub(
163
+ # r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
164
+ # + "".join(punctuation)
165
+ # + r"]+",
166
+ # "",
167
+ # replaced_text,
168
+ # )
169
+
170
+ return replaced_text
171
+
172
+
173
+ def read_dict():
174
+ g2p_dict = {}
175
+ start_line = 49
176
+ with open(CMU_DICT_PATH) as f:
177
+ line = f.readline()
178
+ line_index = 1
179
+ while line:
180
+ if line_index >= start_line:
181
+ line = line.strip()
182
+ word_split = line.split(" ")
183
+ word = word_split[0]
184
+
185
+ syllable_split = word_split[1].split(" - ")
186
+ g2p_dict[word] = []
187
+ for syllable in syllable_split:
188
+ phone_split = syllable.split(" ")
189
+ g2p_dict[word].append(phone_split)
190
+
191
+ line_index = line_index + 1
192
+ line = f.readline()
193
+
194
+ return g2p_dict
195
+
196
+
197
+ def cache_dict(g2p_dict, file_path):
198
+ with open(file_path, "wb") as pickle_file:
199
+ pickle.dump(g2p_dict, pickle_file)
200
+
201
+
202
+ def get_dict():
203
+ if os.path.exists(CACHE_PATH):
204
+ with open(CACHE_PATH, "rb") as pickle_file:
205
+ g2p_dict = pickle.load(pickle_file)
206
+ else:
207
+ g2p_dict = read_dict()
208
+ cache_dict(g2p_dict, CACHE_PATH)
209
+
210
+ return g2p_dict
211
+
212
+
213
+ eng_dict = get_dict()
214
+
215
+
216
+ def refine_ph(phn):
217
+ tone = 0
218
+ if re.search(r"\d$", phn):
219
+ tone = int(phn[-1]) + 1
220
+ phn = phn[:-1]
221
+ else:
222
+ tone = 3
223
+ return phn.lower(), tone
224
+
225
+
226
+ def refine_syllables(syllables):
227
+ tones = []
228
+ phonemes = []
229
+ for phn_list in syllables:
230
+ for i in range(len(phn_list)):
231
+ phn = phn_list[i]
232
+ phn, tone = refine_ph(phn)
233
+ phonemes.append(phn)
234
+ tones.append(tone)
235
+ return phonemes, tones
236
+
237
+
238
+ import re
239
+ import inflect
240
+
241
+ _inflect = inflect.engine()
242
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
243
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
244
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
245
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
246
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
247
+ _number_re = re.compile(r"[0-9]+")
248
+
249
+ # List of (regular expression, replacement) pairs for abbreviations:
250
+ _abbreviations = [
251
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
252
+ for x in [
253
+ ("mrs", "misess"),
254
+ ("mr", "mister"),
255
+ ("dr", "doctor"),
256
+ ("st", "saint"),
257
+ ("co", "company"),
258
+ ("jr", "junior"),
259
+ ("maj", "major"),
260
+ ("gen", "general"),
261
+ ("drs", "doctors"),
262
+ ("rev", "reverend"),
263
+ ("lt", "lieutenant"),
264
+ ("hon", "honorable"),
265
+ ("sgt", "sergeant"),
266
+ ("capt", "captain"),
267
+ ("esq", "esquire"),
268
+ ("ltd", "limited"),
269
+ ("col", "colonel"),
270
+ ("ft", "fort"),
271
+ ]
272
+ ]
273
+
274
+
275
+ # List of (ipa, lazy ipa) pairs:
276
+ _lazy_ipa = [
277
+ (re.compile("%s" % x[0]), x[1])
278
+ for x in [
279
+ ("r", "ɹ"),
280
+ ("æ", "e"),
281
+ ("ɑ", "a"),
282
+ ("ɔ", "o"),
283
+ ("ð", "z"),
284
+ ("θ", "s"),
285
+ ("ɛ", "e"),
286
+ ("ɪ", "i"),
287
+ ("ʊ", "u"),
288
+ ("ʒ", "ʥ"),
289
+ ("ʤ", "ʥ"),
290
+ ("ˈ", "↓"),
291
+ ]
292
+ ]
293
+
294
+ # List of (ipa, lazy ipa2) pairs:
295
+ _lazy_ipa2 = [
296
+ (re.compile("%s" % x[0]), x[1])
297
+ for x in [
298
+ ("r", "ɹ"),
299
+ ("ð", "z"),
300
+ ("θ", "s"),
301
+ ("ʒ", "ʑ"),
302
+ ("ʤ", "dʑ"),
303
+ ("ˈ", "↓"),
304
+ ]
305
+ ]
306
+
307
+ # List of (ipa, ipa2) pairs
308
+ _ipa_to_ipa2 = [
309
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
310
+ ]
311
+
312
+
313
+ def _expand_dollars(m):
314
+ match = m.group(1)
315
+ parts = match.split(".")
316
+ if len(parts) > 2:
317
+ return match + " dollars" # Unexpected format
318
+ dollars = int(parts[0]) if parts[0] else 0
319
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
320
+ if dollars and cents:
321
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
322
+ cent_unit = "cent" if cents == 1 else "cents"
323
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
324
+ elif dollars:
325
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
326
+ return "%s %s" % (dollars, dollar_unit)
327
+ elif cents:
328
+ cent_unit = "cent" if cents == 1 else "cents"
329
+ return "%s %s" % (cents, cent_unit)
330
+ else:
331
+ return "zero dollars"
332
+
333
+
334
+ def _remove_commas(m):
335
+ return m.group(1).replace(",", "")
336
+
337
+
338
+ def _expand_ordinal(m):
339
+ return _inflect.number_to_words(m.group(0))
340
+
341
+
342
+ def _expand_number(m):
343
+ num = int(m.group(0))
344
+ if num > 1000 and num < 3000:
345
+ if num == 2000:
346
+ return "two thousand"
347
+ elif num > 2000 and num < 2010:
348
+ return "two thousand " + _inflect.number_to_words(num % 100)
349
+ elif num % 100 == 0:
350
+ return _inflect.number_to_words(num // 100) + " hundred"
351
+ else:
352
+ return _inflect.number_to_words(
353
+ num, andword="", zero="oh", group=2
354
+ ).replace(", ", " ")
355
+ else:
356
+ return _inflect.number_to_words(num, andword="")
357
+
358
+
359
+ def _expand_decimal_point(m):
360
+ return m.group(1).replace(".", " point ")
361
+
362
+
363
+ def normalize_numbers(text):
364
+ text = re.sub(_comma_number_re, _remove_commas, text)
365
+ text = re.sub(_pounds_re, r"\1 pounds", text)
366
+ text = re.sub(_dollars_re, _expand_dollars, text)
367
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
368
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
369
+ text = re.sub(_number_re, _expand_number, text)
370
+ return text
371
+
372
+
373
+ def text_normalize(text):
374
+ text = normalize_numbers(text)
375
+ text = replace_punctuation(text)
376
+ text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
377
+ return text
378
+
379
+
380
+ def distribute_phone(n_phone, n_word):
381
+ phones_per_word = [0] * n_word
382
+ for task in range(n_phone):
383
+ min_tasks = min(phones_per_word)
384
+ min_index = phones_per_word.index(min_tasks)
385
+ phones_per_word[min_index] += 1
386
+ return phones_per_word
387
+
388
+
389
+ def sep_text(text):
390
+ words = re.split(r"([,;.\?\!\s+])", text)
391
+ words = [word for word in words if word.strip() != ""]
392
+ return words
393
+
394
+
395
+ def text_to_words(text):
396
+ tokens = tokenizer.tokenize(text)
397
+ words = []
398
+ for idx, t in enumerate(tokens):
399
+ if t.startswith("▁"):
400
+ words.append([t[1:]])
401
+ else:
402
+ if t in punctuation:
403
+ if idx == len(tokens) - 1:
404
+ words.append([f"{t}"])
405
+ else:
406
+ if (
407
+ not tokens[idx + 1].startswith("▁")
408
+ and tokens[idx + 1] not in punctuation
409
+ ):
410
+ if idx == 0:
411
+ words.append([])
412
+ words[-1].append(f"{t}")
413
+ else:
414
+ words.append([f"{t}"])
415
+ else:
416
+ if idx == 0:
417
+ words.append([])
418
+ words[-1].append(f"{t}")
419
+ return words
420
+
421
+
422
+ def g2p(text):
423
+ phones = []
424
+ tones = []
425
+ phone_len = []
426
+ # words = sep_text(text)
427
+ # tokens = [tokenizer.tokenize(i) for i in words]
428
+ words = text_to_words(text)
429
+
430
+ for word in words:
431
+ temp_phones, temp_tones = [], []
432
+ if len(word) > 1:
433
+ if "'" in word:
434
+ word = ["".join(word)]
435
+ for w in word:
436
+ if w in punctuation:
437
+ temp_phones.append(w)
438
+ temp_tones.append(0)
439
+ continue
440
+ if w.upper() in eng_dict:
441
+ phns, tns = refine_syllables(eng_dict[w.upper()])
442
+ temp_phones += [post_replace_ph(i) for i in phns]
443
+ temp_tones += tns
444
+ # w2ph.append(len(phns))
445
+ else:
446
+ phone_list = list(filter(lambda p: p != " ", _g2p(w)))
447
+ phns = []
448
+ tns = []
449
+ for ph in phone_list:
450
+ if ph in arpa:
451
+ ph, tn = refine_ph(ph)
452
+ phns.append(ph)
453
+ tns.append(tn)
454
+ else:
455
+ phns.append(ph)
456
+ tns.append(0)
457
+ temp_phones += [post_replace_ph(i) for i in phns]
458
+ temp_tones += tns
459
+ phones += temp_phones
460
+ tones += temp_tones
461
+ phone_len.append(len(temp_phones))
462
+ # phones = [post_replace_ph(i) for i in phones]
463
+
464
+ word2ph = []
465
+ for token, pl in zip(words, phone_len):
466
+ word_len = len(token)
467
+
468
+ aaa = distribute_phone(pl, word_len)
469
+ word2ph += aaa
470
+
471
+ phones = ["_"] + phones + ["_"]
472
+ tones = [0] + tones + [0]
473
+ word2ph = [1] + word2ph + [1]
474
+ assert len(phones) == len(tones), text
475
+ assert len(phones) == sum(word2ph), text
476
+
477
+ return phones, tones, word2ph
478
+
479
+
480
+ def get_bert_feature(text, word2ph):
481
+ from text import english_bert_mock
482
+
483
+ return english_bert_mock.get_bert_feature(text, word2ph)
484
+
485
+
486
+ if __name__ == "__main__":
487
+ # print(get_dict())
488
+ # print(eng_word_to_phoneme("hello"))
489
+ print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
490
+ # all_phones = set()
491
+ # for k, syllables in eng_dict.items():
492
+ # for group in syllables:
493
+ # for ph in group:
494
+ # all_phones.add(ph)
495
+ # print(all_phones)
text/english_bert_mock.py ADDED
@@ -0,0 +1,63 @@
+ import sys
+
+ import torch
+ from transformers import DebertaV2Model, DebertaV2Tokenizer
+
+ from config import config
+
+
+ LOCAL_PATH = "./bert/deberta-v3-large"
+
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
+
+ models = dict()
+
+
+ def get_bert_feature(
+     text,
+     word2ph,
+     device=config.bert_gen_config.device,
+     assist_text=None,
+     assist_text_weight=0.7,
+ ):
+     if (
+         sys.platform == "darwin"
+         and torch.backends.mps.is_available()
+         and device == "cpu"
+     ):
+         device = "mps"
+     if not device:
+         device = "cuda"
+     if device == "cuda" and not torch.cuda.is_available():
+         device = "cpu"
+     if device not in models.keys():
+         models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt")
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)
+         res = models[device](**inputs, output_hidden_states=True)
+         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+         if assist_text:
+             style_inputs = tokenizer(assist_text, return_tensors="pt")
+             for i in style_inputs:
+                 style_inputs[i] = style_inputs[i].to(device)
+             style_res = models[device](**style_inputs, output_hidden_states=True)
+             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+             style_res_mean = style_res.mean(0)
+     assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
+     word2phone = word2ph
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         if assist_text:
+             repeat_feature = (
+                 res[i].repeat(word2phone[i], 1) * (1 - assist_text_weight)
+                 + style_res_mean.repeat(word2phone[i], 1) * assist_text_weight
+             )
+         else:
+             repeat_feature = res[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+
+     return phone_level_feature.T
text/japanese.py ADDED
@@ -0,0 +1,585 @@
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ import pyopenjtalk
7
+ from num2words import num2words
8
+ from transformers import AutoTokenizer
9
+
10
+ from common.log import logger
11
+ from text import punctuation
12
+ from text.japanese_mora_list import (
13
+ mora_kata_to_mora_phonemes,
14
+ mora_phonemes_to_mora_kata,
15
+ )
16
+
17
+ # 子音の集合
18
+ COSONANTS = set(
19
+ [
20
+ cosonant
21
+ for cosonant, _ in mora_kata_to_mora_phonemes.values()
22
+ if cosonant is not None
23
+ ]
24
+ )
25
+
26
+ # 母音の集合、便宜上「ん」を含める
27
+ VOWELS = {"a", "i", "u", "e", "o", "N"}
28
+
29
+
30
+ # 正規化で記号を変換するための辞書
31
+ rep_map = {
32
+ ":": ",",
33
+ ";": ",",
34
+ ",": ",",
35
+ "。": ".",
36
+ "!": "!",
37
+ "?": "?",
38
+ "\n": ".",
39
+ ".": ".",
40
+ "…": "...",
41
+ "···": "...",
42
+ "・・・": "...",
43
+ "·": ",",
44
+ "・": ",",
45
+ "、": ",",
46
+ "$": ".",
47
+ "“": "'",
48
+ "”": "'",
49
+ '"': "'",
50
+ "‘": "'",
51
+ "’": "'",
52
+ "(": "'",
53
+ ")": "'",
54
+ "(": "'",
55
+ ")": "'",
56
+ "《": "'",
57
+ "》": "'",
58
+ "【": "'",
59
+ "】": "'",
60
+ "[": "'",
61
+ "]": "'",
62
+ "—": "-",
63
+ "−": "-",
64
+ # "~": "-", # これは長音記号「ー」として扱うよう変更
65
+ # "~": "-", # これも長音記号「ー」として扱うよう変更
66
+ "「": "'",
67
+ "」": "'",
68
+ }
69
+
70
+
71
+ def text_normalize(text):
72
+ """
73
+ 日本語のテキストを正規化する。
74
+ 結果は、ちょうど次の文字のみからなる:
75
+ - ひらがな
76
+ - カタカナ(全角長音記号「ー」が入る!)
77
+ - 漢字
78
+ - 半角アルファベット(大文字と小文字)
79
+ - ギリシャ文字
80
+ - `.` (句点`。`や`…`の一部や改行等)
81
+ - `,` (読点`、`や`:`等)
82
+ - `?` (疑問符`?`)
83
+ - `!` (感嘆符`!`)
84
+ - `'` (`「`や`」`等)
85
+ - `-` (`―`(ダッシュ、長音記号ではない)や`-`等)
86
+
87
+ 注意点:
88
+ - 三点リーダー`…`は`...`に変換される(`なるほど…。` → `なるほど....`)
89
+ - 数字は漢字に変換される(`1,100円` → `千百円`、`52.34` → `五十二点三四`)
90
+ - 読点や疑問符等の位置・個数等は保持される(`??あ、、!!!` → `??あ,,!!!`)
91
+ """
92
+ res = unicodedata.normalize("NFKC", text) # ここでアルファベットは半角になる
93
+ res = japanese_convert_numbers_to_words(res) # 「100円」→「百円」等
94
+ # 「~」と「~」も長音記号として扱う
95
+ res = res.replace("~", "ー")
96
+ res = res.replace("~", "ー")
97
+
98
+ res = replace_punctuation(res) # 句読点等正規化、読めない文字を削除
99
+
100
+ # 結合文字の濁点・半濁点を削除
101
+ # 通常の「ば」等はそのままのこされる、「あ゛」は上で「あ゙」になりここで「あ」になる
102
+ res = res.replace("\u3099", "") # 結合文字の濁点を削除、る゙ → る
103
+ res = res.replace("\u309A", "") # 結合文字の半濁点を削除、な゚ → な
104
+ return res
105
+
106
+
107
+ def replace_punctuation(text: str) -> str:
108
+ """句読点等を「.」「,」「!」「?」「'」「-」に正規化し、OpenJTalkで読みが取得できるもののみ残す:
109
+ 漢字・平仮名・カタカナ、アルファベット、ギリシャ文字
110
+ """
111
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
112
+
113
+ # 句読点を辞書で置換
114
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
115
+
116
+ replaced_text = re.sub(
117
+ # ↓ ひらがな、カタカナ、漢字
118
+ r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
119
+ # ↓ 半角アルファベット(大文字と小文字)
120
+ + r"\u0041-\u005A\u0061-\u007A"
121
+ # ↓ 全角アルファベット(大文字と小文字)
122
+ + r"\uFF21-\uFF3A\uFF41-\uFF5A"
123
+ # ↓ ギリシャ文字
124
+ + r"\u0370-\u03FF\u1F00-\u1FFF"
125
+ # ↓ "!", "?", "…", ",", ".", "'", "-", 但し`…`はすでに`...`に変換されている
126
+ + "".join(punctuation) + r"]+",
127
+ # 上述以外の文字を削除
128
+ "",
129
+ replaced_text,
130
+ )
131
+
132
+ return replaced_text
133
+
134
+
135
+ _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
136
+ _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
137
+ _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
138
+ _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
139
+
140
+
141
+ def japanese_convert_numbers_to_words(text: str) -> str:
142
+ res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
143
+ res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
144
+ res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
145
+ return res
146
+
147
+
148
+ def g2p(
149
+ norm_text: str, use_jp_extra: bool = True
150
+ ) -> tuple[list[str], list[int], list[int]]:
151
+ """
152
+ 他で使われるメインの関数。`text_normalize()`で正規化された`norm_text`を受け取り、
153
+ - phones: 音素のリスト(ただし`!`や`,`や`.`等punctuationが含まれうる)
154
+ - tones: アクセントのリスト、0(低)と1(高)からなり、phonesと同じ長さ
155
+ - word2ph: 元のテキストの各文字に音素が何個割り当てられるかを表すリスト
156
+ のタプルを返す。
157
+ ただし`phones`と`tones`の最初と終わりに`_`が入り、応じて`word2ph`の最初と最後に1が追加される。
158
+ use_jp_extra: Falseの場合、「ん」の音素を「N」ではなく「n」とする。
159
+ """
160
+ # pyopenjtalkのフルコンテキストラベルを使ってアクセントを取り出すと、punctuationの位置が消えてしまい情報が失われてしまう:
161
+ # 「こんにちは、世界。」と「こんにちは!世界。」と「こんにちは!!!???世界……。」は全て同じになる。
162
+ # よって、まずpunctuation無しの音素とアクセントのリストを作り、
163
+ # それとは別にpyopenjtalk.run_frontend()で得られる音素リスト(こちらはpunctuationが保持される)を使い、
164
+ # アクセント割当をしなおすことによってpunctuationを含めた音素とアクセントのリストを作る。
165
+
166
+ # punctuationがすべて消えた、音素とアクセントのタプルのリスト(「ん」は「N」)
167
+ phone_tone_list_wo_punct = g2phone_tone_wo_punct(norm_text)
168
+
169
+ # sep_text: 単語単位の単語のリスト
170
+ # sep_kata: 単語単位の単語のカタカナ読みのリスト
171
+ sep_text, sep_kata = text2sep_kata(norm_text)
172
+
173
+ # sep_phonemes: 各単語ごとの音素のリストのリスト
174
+ sep_phonemes = handle_long([kata2phoneme_list(i) for i in sep_kata])
175
+
176
+ # phone_w_punct: sep_phonemesを結合した、punctuationを元のまま保持した音素列
177
+ phone_w_punct: list[str] = []
178
+ for i in sep_phonemes:
179
+ phone_w_punct += i
180
+
181
+ # punctuation無しのアクセント情報を使って、punctuationを含めたアクセント情報を作る
182
+ phone_tone_list = align_tones(phone_w_punct, phone_tone_list_wo_punct)
183
+ # logger.debug(f"phone_tone_list:\n{phone_tone_list}")
184
+ # word2phは厳密な解答は不可能なので(「今日」「眼鏡」等の熟字訓が存在)、
185
+ # Bert-VITS2では、単語単位の分割を使って、単語の文字ごとにだいたい均等に音素を分配する
186
+
187
+ # sep_textから、各単語を1文字1文字分割して、文字のリスト(のリスト)を作る
188
+ sep_tokenized: list[list[str]] = []
189
+ for i in sep_text:
190
+ if i not in punctuation:
191
+ sep_tokenized.append(
192
+ tokenizer.tokenize(i)
193
+ ) # ここでおそらく`i`が文字単位に分割される
194
+ else:
195
+ sep_tokenized.append([i])
196
+
197
+ # 各単語について、音素の数と文字の数を比較して、均等っぽく分配する
198
+ word2ph = []
199
+ for token, phoneme in zip(sep_tokenized, sep_phonemes):
200
+ phone_len = len(phoneme)
201
+ word_len = len(token)
202
+ word2ph += distribute_phone(phone_len, word_len)
203
+
204
+ # 最初と最後に`_`記号を追加、アクセントは0(低)、word2phもそれに合わせて追加
205
+ phone_tone_list = [("_", 0)] + phone_tone_list + [("_", 0)]
206
+ word2ph = [1] + word2ph + [1]
207
+
208
+ phones = [phone for phone, _ in phone_tone_list]
209
+ tones = [tone for _, tone in phone_tone_list]
210
+
211
+ assert len(phones) == sum(word2ph), f"{len(phones)} != {sum(word2ph)}"
212
+
213
+ # use_jp_extraでない場合は「N」を「n」に変換
214
+ if not use_jp_extra:
215
+ phones = [phone if phone != "N" else "n" for phone in phones]
216
+
217
+ return phones, tones, word2ph
218
+
219
+
220
+ def g2kata_tone(norm_text: str) -> list[tuple[str, int]]:
221
+ phones, tones, _ = g2p(norm_text, use_jp_extra=True)
222
+ return phone_tone2kata_tone(list(zip(phones, tones)))
223
+
224
+
225
+ def phone_tone2kata_tone(phone_tone: list[tuple[str, int]]) -> list[tuple[str, int]]:
226
+ """phone_toneをのphone部分をカタカナに変換する。ただし最初と最後の("_", 0)は無視"""
227
+ phone_tone = phone_tone[1:] # 最初の("_", 0)を無視
228
+ phones = [phone for phone, _ in phone_tone]
229
+ tones = [tone for _, tone in phone_tone]
230
+ result: list[tuple[str, int]] = []
231
+ current_mora = ""
232
+ for phone, next_phone, tone, next_tone in zip(phones, phones[1:], tones, tones[1:]):
233
+ # zipの関係で最後の("_", 0)は無視されている
234
+ if phone in punctuation:
235
+ result.append((phone, tone))
236
+ continue
237
+ if phone in COSONANTS: # n以外の子音の場合
238
+ assert current_mora == "", f"Unexpected {phone} after {current_mora}"
239
+ assert tone == next_tone, f"Unexpected {phone} tone {tone} != {next_tone}"
240
+ current_mora = phone
241
+ else:
242
+ # phoneが母音もしくは「N」
243
+ current_mora += phone
244
+ result.append((mora_phonemes_to_mora_kata[current_mora], tone))
245
+ current_mora = ""
246
+ return result
247
+
248
+
249
+ def kata_tone2phone_tone(kata_tone: list[tuple[str, int]]) -> list[tuple[str, int]]:
250
+ """`phone_tone2kata_tone()`の逆。"""
251
+ result: list[tuple[str, int]] = [("_", 0)]
252
+ for mora, tone in kata_tone:
253
+ if mora in punctuation:
254
+ result.append((mora, tone))
255
+ else:
256
+ cosonant, vowel = mora_kata_to_mora_phonemes[mora]
257
+ if cosonant is None:
258
+ result.append((vowel, tone))
259
+ else:
260
+ result.append((cosonant, tone))
261
+ result.append((vowel, tone))
262
+ result.append(("_", 0))
263
+ return result
264
+
265
+
266
+ def g2phone_tone_wo_punct(text: str) -> list[tuple[str, int]]:
267
+ """
268
+ テキストに対して、音素とアクセント(0か1)のペアのリストを返す。
269
+ ただし「!」「.」「?」等の非音素記号(punctuation)は全て消える(ポーズ記号も残さない)。
270
+ 非音素記号を含める処理は`align_tones()`で行われる。
271
+ また「っ」は「q」に、「ん」は「N」に変換される。
272
+ 例: "こんにちは、世界ー。。元気?!" →
273
+ [('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]
274
+ """
275
+ prosodies = pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True)
276
+ # logger.debug(f"prosodies: {prosodies}")
277
+ result: list[tuple[str, int]] = []
278
+ current_phrase: list[tuple[str, int]] = []
279
+ current_tone = 0
280
+ for i, letter in enumerate(prosodies):
281
+ # 特殊記号の処理
282
+
283
+ # 文頭記号、無視する
284
+ if letter == "^":
285
+ assert i == 0, "Unexpected ^"
286
+ # アクセント句の終わりに来る記号
287
+ elif letter in ("$", "?", "_", "#"):
288
+ # 保持しているフレーズを、アクセント数値を0-1に修正し結果に追加
289
+ result.extend(fix_phone_tone(current_phrase))
290
+ # 末尾に来る終了記号、無視(文中の疑問文は`_`になる)
291
+ if letter in ("$", "?"):
292
+ assert i == len(prosodies) - 1, f"Unexpected {letter}"
293
+ # あとは"_"(ポーズ)と"#"(アクセント句の境界)のみ
294
+ # これらは残さず、次のアクセント句に備える。
295
+ current_phrase = []
296
+ # 0を基準点にしてそこから上昇・下降する(負の場合は上の`fix_phone_tone`で直る)
297
+ current_tone = 0
298
+ # アクセント上昇記号
299
+ elif letter == "[":
300
+ current_tone = current_tone + 1
301
+ # アクセント下降記号
302
+ elif letter == "]":
303
+ current_tone = current_tone - 1
304
+ # それ以外は通常の音素
305
+ else:
306
+ if letter == "cl": # 「っ」の処理
307
+ letter = "q"
308
+ # elif letter == "N": # 「ん」の処理
309
+ # letter = "n"
310
+ current_phrase.append((letter, current_tone))
311
+ return result
312
+
313
+
314
+ def text2sep_kata(norm_text: str) -> tuple[list[str], list[str]]:
315
+ """
316
+ `text_normalize`で正規化済みの`norm_text`を受け取り、それを単語分割し、
317
+ 分割された単語リストとその読み(カタカナor記号1文字)のリストのタプルを返す。
318
+ 単語分割結果は、`g2p()`の`word2ph`で1文字あたりに割り振る音素記号の数を決めるために使う。
319
+ 例:
320
+ `私はそう思う!って感じ?` →
321
+ ["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]
322
+ """
323
+ # parsed: OpenJTalkの解析結果
324
+ parsed = pyopenjtalk.run_frontend(norm_text)
325
+ sep_text: list[str] = []
326
+ sep_kata: list[str] = []
327
+ for parts in parsed:
328
+ # word: 実際の単語の文字列
329
+ # yomi: その読み、但し無声化サインの`’`は除去
330
+ word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
331
+ "’", ""
332
+ )
333
+ """
334
+ ここで`yomi`の取りうる値は以下の通りのはず。
335
+ - `word`が通常単語 → 通常の読み(カタカナ)
336
+ (カタカナからなり、長音記号も含みうる、`アー` 等)
337
+ - `word`が`ー` から始まる → `ーラー` や `ーーー` など
338
+ - `word`が句読点や空白等 → `、`
339
+ - `word`が`?` → `?`(全角になる)
340
+ 他にも`word`が読めないキリル文字アラビア文字等が来ると`、`になるが、正規化でこの場合は起きないはず。
341
+ また元のコードでは`yomi`が空白の場合の処理があったが、これは起きないはず。
342
+ 処理すべきは`yomi`が`、`の場合のみのはず。
343
+ """
344
+ assert yomi != "", f"Empty yomi: {word}"
345
+ if yomi == "、":
346
+ # wordは正規化されているので、`.`, `,`, `!`, `'`, `-`, `--` のいずれか
347
+ if word not in (
348
+ ".",
349
+ ",",
350
+ "!",
351
+ "'",
352
+ "-",
353
+ "--",
354
+ ):
355
+ # ここはpyopenjtalkが読めない文字等のときに起こる
356
+ raise ValueError(f"Cannot read: {word} in:\n{norm_text}")
357
+ # yomiは元の記号のままに変更
358
+ yomi = word
359
+ elif yomi == "?":
360
+ assert word == "?", f"yomi `?` comes from: {word}"
361
+ yomi = "?"
362
+ sep_text.append(word)
363
+ sep_kata.append(yomi)
364
+ return sep_text, sep_kata
365
+
366
+
367
+ # ESPnetの実装から引用、変更点無し。「ん」は「N」なことに注意。
368
+ # https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
369
+ def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
370
+ """Extract phoneme + prosoody symbol sequence from input full-context labels.
371
+
372
+ The algorithm is based on `Prosodic features control by symbols as input of
373
+ sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
374
+
375
+ Args:
376
+ text (str): Input text.
377
+ drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
378
+
379
+ Returns:
380
+ List[str]: List of phoneme + prosody symbols.
381
+
382
+ Examples:
383
+ >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
384
+ >>> pyopenjtalk_g2p_prosody("こんにちは。")
385
+ ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
386
+
387
+ .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
388
+ modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
389
+
390
+ """
391
+ labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
392
+ N = len(labels)
393
+
394
+ phones = []
395
+ for n in range(N):
396
+ lab_curr = labels[n]
397
+
398
+ # current phoneme
399
+ p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
400
+ # deal unvoiced vowels as normal vowels
401
+ if drop_unvoiced_vowels and p3 in "AEIOU":
402
+ p3 = p3.lower()
403
+
404
+ # deal with sil at the beginning and the end of text
405
+ if p3 == "sil":
406
+ assert n == 0 or n == N - 1
407
+ if n == 0:
408
+ phones.append("^")
409
+ elif n == N - 1:
410
+ # check question form or not
411
+ e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
412
+ if e3 == 0:
413
+ phones.append("$")
414
+ elif e3 == 1:
415
+ phones.append("?")
416
+ continue
417
+ elif p3 == "pau":
418
+ phones.append("_")
419
+ continue
420
+ else:
421
+ phones.append(p3)
422
+
423
+ # accent type and position info (forward or backward)
424
+ a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
425
+ a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
426
+ a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
427
+
428
+ # number of mora in accent phrase
429
+ f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
430
+
431
+ a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
432
+ # accent phrase border
433
+ if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
434
+ phones.append("#")
435
+ # pitch falling
436
+ elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
437
+ phones.append("]")
438
+ # pitch rising
439
+ elif a2 == 1 and a2_next == 2:
440
+ phones.append("[")
441
+
442
+ return phones
443
+
444
+
445
+ def _numeric_feature_by_regex(regex, s):
446
+ match = re.search(regex, s)
447
+ if match is None:
448
+ return -50
449
+ return int(match.group(1))
450
+
451
+
452
+ def fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
453
+ """
454
+ `phone_tone_list`のtone(アクセントの値)を0か1の範囲に修正する。
455
+ 例: [(a, 0), (i, -1), (u, -1)] → [(a, 1), (i, 0), (u, 0)]
456
+ """
457
+ tone_values = set(tone for _, tone in phone_tone_list)
458
+ if len(tone_values) == 1:
459
+ assert tone_values == {0}, tone_values
460
+ return phone_tone_list
461
+ elif len(tone_values) == 2:
462
+ if tone_values == {0, 1}:
463
+ return phone_tone_list
464
+ elif tone_values == {-1, 0}:
465
+ return [
466
+ (letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list
467
+ ]
468
+ else:
469
+ raise ValueError(f"Unexpected tone values: {tone_values}")
470
+ else:
471
+ raise ValueError(f"Unexpected tone values: {tone_values}")
472
+
473
+
474
+ def distribute_phone(n_phone: int, n_word: int) -> list[int]:
475
+ """
476
+ 左から右に1ずつ振り分け、次にまた左から右に1ずつ増やし、というふうに、
477
+ 音素の数`n_phone`を単語の数`n_word`に分配する。
478
+ """
479
+ phones_per_word = [0] * n_word
480
+ for _ in range(n_phone):
481
+ min_tasks = min(phones_per_word)
482
+ min_index = phones_per_word.index(min_tasks)
483
+ phones_per_word[min_index] += 1
484
+ return phones_per_word
485
+
486
+
487
+ def handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
488
+ for i in range(len(sep_phonemes)):
489
+ if sep_phonemes[i][0] == "ー":
490
+ sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
491
+ if "ー" in sep_phonemes[i]:
492
+ for j in range(len(sep_phonemes[i])):
493
+ if sep_phonemes[i][j] == "ー":
494
+ sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
495
+ return sep_phonemes
496
+
497
+
498
+ tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
499
+
500
+
501
+ def align_tones(
502
+ phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
503
+ ) -> list[tuple[str, int]]:
504
+ """
505
+ 例:
506
+ …私は、、そう思う。
507
+ phones_with_punct:
508
+ [".", ".", ".", "w", "a", "t", "a", "sh", "i", "w", "a", ",", ",", "s", "o", "o", "o", "m", "o", "u", "."]
509
+ phone_tone_list:
510
+ [("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), ("_", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0))]
511
+ Return:
512
+ [(".", 0), (".", 0), (".", 0), ("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), (",", 0), (",", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0), (".", 0)]
513
+ """
514
+ result: list[tuple[str, int]] = []
515
+ tone_index = 0
516
+ for phone in phones_with_punct:
517
+ if tone_index >= len(phone_tone_list):
518
+ # 余ったpunctuationがある場合 → (punctuation, 0)を追加
519
+ result.append((phone, 0))
520
+ elif phone == phone_tone_list[tone_index][0]:
521
+ # phone_tone_listの現在の音素と一致する場合 → toneをそこから取得、(phone, tone)を追加
522
+ result.append((phone, phone_tone_list[tone_index][1]))
523
+ # 探すindexを1つ進める
524
+ tone_index += 1
525
+ elif phone in punctuation:
526
+ # phoneがpunctuationの場合 → (phone, 0)を追加
527
+ result.append((phone, 0))
528
+ else:
529
+ logger.debug(f"phones: {phones_with_punct}")
530
+ logger.debug(f"phone_tone_list: {phone_tone_list}")
531
+ logger.debug(f"result: {result}")
532
+ logger.debug(f"tone_index: {tone_index}")
533
+ logger.debug(f"phone: {phone}")
534
+ raise ValueError(f"Unexpected phone: {phone}")
535
+ return result
536
+
537
+
538
+ def kata2phoneme_list(text: str) -> list[str]:
539
+ """
540
+ 原則カタカナの`text`を受け取り、それをそのままいじらずに音素記号のリストに変換。
541
+ 注意点:
542
+ - punctuationが来た場合(punctuationが1文字の場合がありうる)、処理せず1文字のリストを返す
543
+ - 冒頭に続く「ー」はそのまま「ー」のままにする(`handle_long()`で処理される)
544
+ - 文中の「ー」は前の音素記号の最後の音素記号に変換される。
545
+ 例:
546
+ `ーーソーナノカーー` → ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
547
+ `?` → ["?"]
548
+ """
549
+ if text in punctuation:
550
+ return [text]
551
+ elif text == "--":
552
+ return ["-", "-"]
553
+ # `text`がカタカナ(`ー`含む)のみからなるかどうかをチェック
554
+ if re.fullmatch(r"[\u30A0-\u30FF]+", text) is None:
555
+ raise ValueError(f"Input must be katakana only: {text}")
556
+ sorted_keys = sorted(mora_kata_to_mora_phonemes.keys(), key=len, reverse=True)
557
+ pattern = "|".join(map(re.escape, sorted_keys))
558
+
559
+ def mora2phonemes(mora: str) -> str:
560
+ cosonant, vowel = mora_kata_to_mora_phonemes[mora]
561
+ if cosonant is None:
562
+ return f" {vowel}"
563
+ return f" {cosonant} {vowel}"
564
+
565
+ spaced_phonemes = re.sub(pattern, lambda m: mora2phonemes(m.group()), text)
566
+
567
+ # 長音記号「ー」の処理
568
+ long_pattern = r"(\w)(ー*)"
569
+ long_replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
570
+ spaced_phonemes = re.sub(long_pattern, long_replacement, spaced_phonemes)
571
+ return spaced_phonemes.strip().split(" ")
572
+
573
+
574
+ if __name__ == "__main__":
575
+ tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese")
576
+ text = "hello,こんにちは、世界ー!……"
577
+ from text.japanese_bert import get_bert_feature
578
+
579
+ text = text_normalize(text)
580
+ print(text)
581
+
582
+ phones, tones, word2ph = g2p(text)
583
+ bert = get_bert_feature(text, word2ph)
584
+
585
+ print(phones, tones, word2ph, bert.shape)
text/japanese_bert.py ADDED
@@ -0,0 +1,67 @@
+ import sys
+
+ import torch
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+ from config import config
+ from text.japanese import text2sep_kata
+
+ LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
+
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
+
+ models = dict()
+
+
+ def get_bert_feature(
+     text,
+     word2ph,
+     device=config.bert_gen_config.device,
+     assist_text=None,
+     assist_text_weight=0.7,
+ ):
+     text = "".join(text2sep_kata(text)[0])
+     if assist_text:
+         assist_text = "".join(text2sep_kata(assist_text)[0])
+     if (
+         sys.platform == "darwin"
+         and torch.backends.mps.is_available()
+         and device == "cpu"
+     ):
+         device = "mps"
+     if not device:
+         device = "cuda"
+     if device == "cuda" and not torch.cuda.is_available():
+         device = "cpu"
+     if device not in models.keys():
+         models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt")
+         for i in inputs:
+             inputs[i] = inputs[i].to(device)
+         res = models[device](**inputs, output_hidden_states=True)
+         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
+         if assist_text:
+             style_inputs = tokenizer(assist_text, return_tensors="pt")
+             for i in style_inputs:
+                 style_inputs[i] = style_inputs[i].to(device)
+             style_res = models[device](**style_inputs, output_hidden_states=True)
+             style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].cpu()
+             style_res_mean = style_res.mean(0)
+
+     assert len(word2ph) == len(text) + 2, text
+     word2phone = word2ph
+     phone_level_feature = []
+     for i in range(len(word2phone)):
+         if assist_text:
+             repeat_feature = (
+                 res[i].repeat(word2phone[i], 1) * (1 - assist_text_weight)
+                 + style_res_mean.repeat(word2phone[i], 1) * assist_text_weight
+             )
+         else:
+             repeat_feature = res[i].repeat(word2phone[i], 1)
+         phone_level_feature.append(repeat_feature)
+
+     phone_level_feature = torch.cat(phone_level_feature, dim=0)
+
+     return phone_level_feature.T
text/japanese_mora_list.py ADDED
@@ -0,0 +1,232 @@
1
+ """
2
+ VOICEVOXのソースコードからお借りして最低限に改造したコード。
3
+ https://github.com/VOICEVOX/voicevox_engine/blob/master/voicevox_engine/tts_pipeline/mora_list.py
4
+ """
5
+
6
+ """
7
+ 以下のモーラ対応表はOpenJTalkのソースコードから取得し、
8
+ カタカナ表記とモーラが一対一対応するように改造した。
9
+ ライセンス表記:
10
+ -----------------------------------------------------------------
11
+ The Japanese TTS System "Open JTalk"
12
+ developed by HTS Working Group
13
+ http://open-jtalk.sourceforge.net/
14
+ -----------------------------------------------------------------
15
+
16
+ Copyright (c) 2008-2014 Nagoya Institute of Technology
17
+ Department of Computer Science
18
+
19
+ All rights reserved.
20
+
21
+ Redistribution and use in source and binary forms, with or
22
+ without modification, are permitted provided that the following
23
+ conditions are met:
24
+
25
+ - Redistributions of source code must retain the above copyright
26
+ notice, this list of conditions and the following disclaimer.
27
+ - Redistributions in binary form must reproduce the above
28
+ copyright notice, this list of conditions and the following
29
+ disclaimer in the documentation and/or other materials provided
30
+ with the distribution.
31
+ - Neither the name of the HTS working group nor the names of its
32
+ contributors may be used to endorse or promote products derived
33
+ from this software without specific prior written permission.
34
+
35
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
36
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
37
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
38
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
40
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
41
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
42
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
43
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
44
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
46
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
47
+ POSSIBILITY OF SUCH DAMAGE.
48
+ """
49
+ from typing import Optional
50
+
51
+ # Entries are in (katakana, consonant, vowel) order; None is used when there is no consonant.
52
+ # However, 「ン」 and 「ッ」 are treated as vowel-only: 「ン」 becomes "N" and 「ッ」 becomes "q".
53
+ # (「ッ」 was originally "cl".)
54
+ # 「デェ = dy e」 was also removed because it does not match pyopenjtalk's output (de e).
55
+ _mora_list_minimum: list[tuple[str, Optional[str], str]] = [
56
+ ("ヴォ", "v", "o"),
57
+ ("ヴェ", "v", "e"),
58
+ ("ヴィ", "v", "i"),
59
+ ("ヴァ", "v", "a"),
60
+ ("ヴ", "v", "u"),
61
+ ("ン", None, "N"),
62
+ ("ワ", "w", "a"),
63
+ ("ロ", "r", "o"),
64
+ ("レ", "r", "e"),
65
+ ("ル", "r", "u"),
66
+ ("リョ", "ry", "o"),
67
+ ("リュ", "ry", "u"),
68
+ ("リャ", "ry", "a"),
69
+ ("リェ", "ry", "e"),
70
+ ("リ", "r", "i"),
71
+ ("ラ", "r", "a"),
72
+ ("ヨ", "y", "o"),
73
+ ("ユ", "y", "u"),
74
+ ("ヤ", "y", "a"),
75
+ ("モ", "m", "o"),
76
+ ("メ", "m", "e"),
77
+ ("ム", "m", "u"),
78
+ ("ミョ", "my", "o"),
79
+ ("ミュ", "my", "u"),
80
+ ("ミャ", "my", "a"),
81
+ ("ミェ", "my", "e"),
82
+ ("ミ", "m", "i"),
83
+ ("マ", "m", "a"),
84
+ ("ポ", "p", "o"),
85
+ ("ボ", "b", "o"),
86
+ ("ホ", "h", "o"),
87
+ ("ペ", "p", "e"),
88
+ ("ベ", "b", "e"),
89
+ ("ヘ", "h", "e"),
90
+ ("プ", "p", "u"),
91
+ ("ブ", "b", "u"),
92
+ ("フォ", "f", "o"),
93
+ ("フェ", "f", "e"),
94
+ ("フィ", "f", "i"),
95
+ ("ファ", "f", "a"),
96
+ ("フ", "f", "u"),
97
+ ("ピョ", "py", "o"),
98
+ ("ピュ", "py", "u"),
99
+ ("ピャ", "py", "a"),
100
+ ("ピェ", "py", "e"),
101
+ ("ピ", "p", "i"),
102
+ ("ビョ", "by", "o"),
103
+ ("ビュ", "by", "u"),
104
+ ("ビャ", "by", "a"),
105
+ ("ビェ", "by", "e"),
106
+ ("ビ", "b", "i"),
107
+ ("ヒョ", "hy", "o"),
108
+ ("ヒュ", "hy", "u"),
109
+ ("ヒャ", "hy", "a"),
110
+ ("ヒェ", "hy", "e"),
111
+ ("ヒ", "h", "i"),
112
+ ("パ", "p", "a"),
113
+ ("バ", "b", "a"),
114
+ ("ハ", "h", "a"),
115
+ ("ノ", "n", "o"),
116
+ ("ネ", "n", "e"),
117
+ ("ヌ", "n", "u"),
118
+ ("ニョ", "ny", "o"),
119
+ ("ニュ", "ny", "u"),
120
+ ("ニャ", "ny", "a"),
121
+ ("ニェ", "ny", "e"),
122
+ ("ニ", "n", "i"),
123
+ ("ナ", "n", "a"),
124
+ ("ドゥ", "d", "u"),
125
+ ("ド", "d", "o"),
126
+ ("トゥ", "t", "u"),
127
+ ("ト", "t", "o"),
128
+ ("デョ", "dy", "o"),
129
+ ("デュ", "dy", "u"),
130
+ ("デャ", "dy", "a"),
131
+ # ("デェ", "dy", "e"),
132
+ ("ディ", "d", "i"),
133
+ ("デ", "d", "e"),
134
+ ("テョ", "ty", "o"),
135
+ ("テュ", "ty", "u"),
136
+ ("テャ", "ty", "a"),
137
+ ("ティ", "t", "i"),
138
+ ("テ", "t", "e"),
139
+ ("ツォ", "ts", "o"),
140
+ ("ツェ", "ts", "e"),
141
+ ("ツィ", "ts", "i"),
142
+ ("ツァ", "ts", "a"),
143
+ ("ツ", "ts", "u"),
144
+ ("ッ", None, "q"), # changed from "cl" to "q"
145
+ ("チョ", "ch", "o"),
146
+ ("チュ", "ch", "u"),
147
+ ("チャ", "ch", "a"),
148
+ ("チェ", "ch", "e"),
149
+ ("チ", "ch", "i"),
150
+ ("ダ", "d", "a"),
151
+ ("タ", "t", "a"),
152
+ ("ゾ", "z", "o"),
153
+ ("ソ", "s", "o"),
154
+ ("ゼ", "z", "e"),
155
+ ("セ", "s", "e"),
156
+ ("ズィ", "z", "i"),
157
+ ("ズ", "z", "u"),
158
+ ("スィ", "s", "i"),
159
+ ("ス", "s", "u"),
160
+ ("ジョ", "j", "o"),
161
+ ("ジュ", "j", "u"),
162
+ ("ジャ", "j", "a"),
163
+ ("ジェ", "j", "e"),
164
+ ("ジ", "j", "i"),
165
+ ("ショ", "sh", "o"),
166
+ ("シュ", "sh", "u"),
167
+ ("シャ", "sh", "a"),
168
+ ("シェ", "sh", "e"),
169
+ ("シ", "sh", "i"),
170
+ ("ザ", "z", "a"),
171
+ ("サ", "s", "a"),
172
+ ("ゴ", "g", "o"),
173
+ ("コ", "k", "o"),
174
+ ("ゲ", "g", "e"),
175
+ ("ケ", "k", "e"),
176
+ ("グヮ", "gw", "a"),
177
+ ("グ", "g", "u"),
178
+ ("クヮ", "kw", "a"),
179
+ ("ク", "k", "u"),
180
+ ("ギョ", "gy", "o"),
181
+ ("ギュ", "gy", "u"),
182
+ ("ギャ", "gy", "a"),
183
+ ("ギェ", "gy", "e"),
184
+ ("ギ", "g", "i"),
185
+ ("キョ", "ky", "o"),
186
+ ("キュ", "ky", "u"),
187
+ ("キャ", "ky", "a"),
188
+ ("キェ", "ky", "e"),
189
+ ("キ", "k", "i"),
190
+ ("ガ", "g", "a"),
191
+ ("カ", "k", "a"),
192
+ ("オ", None, "o"),
193
+ ("エ", None, "e"),
194
+ ("ウォ", "w", "o"),
195
+ ("ウェ", "w", "e"),
196
+ ("ウィ", "w", "i"),
197
+ ("ウ", None, "u"),
198
+ ("イェ", "y", "e"),
199
+ ("イ", None, "i"),
200
+ ("ア", None, "a"),
201
+ ]
202
+ _mora_list_additional: list[tuple[str, Optional[str], str]] = [
203
+ ("ヴョ", "by", "o"),
204
+ ("ヴュ", "by", "u"),
205
+ ("ヴャ", "by", "a"),
206
+ ("ヲ", None, "o"),
207
+ ("ヱ", None, "e"),
208
+ ("ヰ", None, "i"),
209
+ ("ヮ", "w", "a"),
210
+ ("ョ", "y", "o"),
211
+ ("ュ", "y", "u"),
212
+ ("ヅ", "z", "u"),
213
+ ("ヂ", "j", "i"),
214
+ ("ヶ", "k", "e"),
215
+ ("ャ", "y", "a"),
216
+ ("ォ", None, "o"),
217
+ ("ェ", None, "e"),
218
+ ("ゥ", None, "u"),
219
+ ("ィ", None, "i"),
220
+ ("ァ", None, "a"),
221
+ ]
222
+
223
+ # e.g. "vo" -> "ヴォ", "a" -> "ア"
224
+ mora_phonemes_to_mora_kata: dict[str, str] = {
225
+ (consonant or "") + vowel: kana for [kana, consonant, vowel] in _mora_list_minimum
226
+ }
227
+
228
+ # e.g. "ヴォ" -> ("v", "o"), "ア" -> (None, "a")
229
+ mora_kata_to_mora_phonemes: dict[str, tuple[Optional[str], str]] = {
230
+ kana: (consonant, vowel)
231
+ for [kana, consonant, vowel] in _mora_list_minimum + _mora_list_additional
232
+ }
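Illustrative lookups against the two tables defined above (not part of the committed files). The `from text.japanese_mora_list import ...` path assumes the repository root is on sys.path.

from text.japanese_mora_list import (
    mora_kata_to_mora_phonemes,
    mora_phonemes_to_mora_kata,
)

print(mora_kata_to_mora_phonemes["キョ"])  # ('ky', 'o')
print(mora_kata_to_mora_phonemes["ン"])    # (None, 'N')
print(mora_phonemes_to_mora_kata["kyo"])   # 'キョ'
print(mora_phonemes_to_mora_kata["N"])     # 'ン'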
text/opencpop-strict.txt ADDED
@@ -0,0 +1,429 @@
1
+ a AA a
2
+ ai AA ai
3
+ an AA an
4
+ ang AA ang
5
+ ao AA ao
6
+ ba b a
7
+ bai b ai
8
+ ban b an
9
+ bang b ang
10
+ bao b ao
11
+ bei b ei
12
+ ben b en
13
+ beng b eng
14
+ bi b i
15
+ bian b ian
16
+ biao b iao
17
+ bie b ie
18
+ bin b in
19
+ bing b ing
20
+ bo b o
21
+ bu b u
22
+ ca c a
23
+ cai c ai
24
+ can c an
25
+ cang c ang
26
+ cao c ao
27
+ ce c e
28
+ cei c ei
29
+ cen c en
30
+ ceng c eng
31
+ cha ch a
32
+ chai ch ai
33
+ chan ch an
34
+ chang ch ang
35
+ chao ch ao
36
+ che ch e
37
+ chen ch en
38
+ cheng ch eng
39
+ chi ch ir
40
+ chong ch ong
41
+ chou ch ou
42
+ chu ch u
43
+ chua ch ua
44
+ chuai ch uai
45
+ chuan ch uan
46
+ chuang ch uang
47
+ chui ch ui
48
+ chun ch un
49
+ chuo ch uo
50
+ ci c i0
51
+ cong c ong
52
+ cou c ou
53
+ cu c u
54
+ cuan c uan
55
+ cui c ui
56
+ cun c un
57
+ cuo c uo
58
+ da d a
59
+ dai d ai
60
+ dan d an
61
+ dang d ang
62
+ dao d ao
63
+ de d e
64
+ dei d ei
65
+ den d en
66
+ deng d eng
67
+ di d i
68
+ dia d ia
69
+ dian d ian
70
+ diao d iao
71
+ die d ie
72
+ ding d ing
73
+ diu d iu
74
+ dong d ong
75
+ dou d ou
76
+ du d u
77
+ duan d uan
78
+ dui d ui
79
+ dun d un
80
+ duo d uo
81
+ e EE e
82
+ ei EE ei
83
+ en EE en
84
+ eng EE eng
85
+ er EE er
86
+ fa f a
87
+ fan f an
88
+ fang f ang
89
+ fei f ei
90
+ fen f en
91
+ feng f eng
92
+ fo f o
93
+ fou f ou
94
+ fu f u
95
+ ga g a
96
+ gai g ai
97
+ gan g an
98
+ gang g ang
99
+ gao g ao
100
+ ge g e
101
+ gei g ei
102
+ gen g en
103
+ geng g eng
104
+ gong g ong
105
+ gou g ou
106
+ gu g u
107
+ gua g ua
108
+ guai g uai
109
+ guan g uan
110
+ guang g uang
111
+ gui g ui
112
+ gun g un
113
+ guo g uo
114
+ ha h a
115
+ hai h ai
116
+ han h an
117
+ hang h ang
118
+ hao h ao
119
+ he h e
120
+ hei h ei
121
+ hen h en
122
+ heng h eng
123
+ hong h ong
124
+ hou h ou
125
+ hu h u
126
+ hua h ua
127
+ huai h uai
128
+ huan h uan
129
+ huang h uang
130
+ hui h ui
131
+ hun h un
132
+ huo h uo
133
+ ji j i
134
+ jia j ia
135
+ jian j ian
136
+ jiang j iang
137
+ jiao j iao
138
+ jie j ie
139
+ jin j in
140
+ jing j ing
141
+ jiong j iong
142
+ jiu j iu
143
+ ju j v
144
+ jv j v
145
+ juan j van
146
+ jvan j van
147
+ jue j ve
148
+ jve j ve
149
+ jun j vn
150
+ jvn j vn
151
+ ka k a
152
+ kai k ai
153
+ kan k an
154
+ kang k ang
155
+ kao k ao
156
+ ke k e
157
+ kei k ei
158
+ ken k en
159
+ keng k eng
160
+ kong k ong
161
+ kou k ou
162
+ ku k u
163
+ kua k ua
164
+ kuai k uai
165
+ kuan k uan
166
+ kuang k uang
167
+ kui k ui
168
+ kun k un
169
+ kuo k uo
170
+ la l a
171
+ lai l ai
172
+ lan l an
173
+ lang l ang
174
+ lao l ao
175
+ le l e
176
+ lei l ei
177
+ leng l eng
178
+ li l i
179
+ lia l ia
180
+ lian l ian
181
+ liang l iang
182
+ liao l iao
183
+ lie l ie
184
+ lin l in
185
+ ling l ing
186
+ liu l iu
187
+ lo l o
188
+ long l ong
189
+ lou l ou
190
+ lu l u
191
+ luan l uan
192
+ lun l un
193
+ luo l uo
194
+ lv l v
195
+ lve l ve
196
+ ma m a
197
+ mai m ai
198
+ man m an
199
+ mang m ang
200
+ mao m ao
201
+ me m e
202
+ mei m ei
203
+ men m en
204
+ meng m eng
205
+ mi m i
206
+ mian m ian
207
+ miao m iao
208
+ mie m ie
209
+ min m in
210
+ ming m ing
211
+ miu m iu
212
+ mo m o
213
+ mou m ou
214
+ mu m u
215
+ na n a
216
+ nai n ai
217
+ nan n an
218
+ nang n ang
219
+ nao n ao
220
+ ne n e
221
+ nei n ei
222
+ nen n en
223
+ neng n eng
224
+ ni n i
225
+ nian n ian
226
+ niang n iang
227
+ niao n iao
228
+ nie n ie
229
+ nin n in
230
+ ning n ing
231
+ niu n iu
232
+ nong n ong
233
+ nou n ou
234
+ nu n u
235
+ nuan n uan
236
+ nun n un
237
+ nuo n uo
238
+ nv n v
239
+ nve n ve
240
+ o OO o
241
+ ou OO ou
242
+ pa p a
243
+ pai p ai
244
+ pan p an
245
+ pang p ang
246
+ pao p ao
247
+ pei p ei
248
+ pen p en
249
+ peng p eng
250
+ pi p i
251
+ pian p ian
252
+ piao p iao
253
+ pie p ie
254
+ pin p in
255
+ ping p ing
256
+ po p o
257
+ pou p ou
258
+ pu p u
259
+ qi q i
260
+ qia q ia
261
+ qian q ian
262
+ qiang q iang
263
+ qiao q iao
264
+ qie q ie
265
+ qin q in
266
+ qing q ing
267
+ qiong q iong
268
+ qiu q iu
269
+ qu q v
270
+ qv q v
271
+ quan q van
272
+ qvan q van
273
+ que q ve
274
+ qve q ve
275
+ qun q vn
276
+ qvn q vn
277
+ ran r an
278
+ rang r ang
279
+ rao r ao
280
+ re r e
281
+ ren r en
282
+ reng r eng
283
+ ri r ir
284
+ rong r ong
285
+ rou r ou
286
+ ru r u
287
+ rua r ua
288
+ ruan r uan
289
+ rui r ui
290
+ run r un
291
+ ruo r uo
292
+ sa s a
293
+ sai s ai
294
+ san s an
295
+ sang s ang
296
+ sao s ao
297
+ se s e
298
+ sen s en
299
+ seng s eng
300
+ sha sh a
301
+ shai sh ai
302
+ shan sh an
303
+ shang sh ang
304
+ shao sh ao
305
+ she sh e
306
+ shei sh ei
307
+ shen sh en
308
+ sheng sh eng
309
+ shi sh ir
310
+ shou sh ou
311
+ shu sh u
312
+ shua sh ua
313
+ shuai sh uai
314
+ shuan sh uan
315
+ shuang sh uang
316
+ shui sh ui
317
+ shun sh un
318
+ shuo sh uo
319
+ si s i0
320
+ song s ong
321
+ sou s ou
322
+ su s u
323
+ suan s uan
324
+ sui s ui
325
+ sun s un
326
+ suo s uo
327
+ ta t a
328
+ tai t ai
329
+ tan t an
330
+ tang t ang
331
+ tao t ao
332
+ te t e
333
+ tei t ei
334
+ teng t eng
335
+ ti t i
336
+ tian t ian
337
+ tiao t iao
338
+ tie t ie
339
+ ting t ing
340
+ tong t ong
341
+ tou t ou
342
+ tu t u
343
+ tuan t uan
344
+ tui t ui
345
+ tun t un
346
+ tuo t uo
347
+ wa w a
348
+ wai w ai
349
+ wan w an
350
+ wang w ang
351
+ wei w ei
352
+ wen w en
353
+ weng w eng
354
+ wo w o
355
+ wu w u
356
+ xi x i
357
+ xia x ia
358
+ xian x ian
359
+ xiang x iang
360
+ xiao x iao
361
+ xie x ie
362
+ xin x in
363
+ xing x ing
364
+ xiong x iong
365
+ xiu x iu
366
+ xu x v
367
+ xv x v
368
+ xuan x van
369
+ xvan x van
370
+ xue x ve
371
+ xve x ve
372
+ xun x vn
373
+ xvn x vn
374
+ ya y a
375
+ yan y En
376
+ yang y ang
377
+ yao y ao
378
+ ye y E
379
+ yi y i
380
+ yin y in
381
+ ying y ing
382
+ yo y o
383
+ yong y ong
384
+ you y ou
385
+ yu y v
386
+ yv y v
387
+ yuan y van
388
+ yvan y van
389
+ yue y ve
390
+ yve y ve
391
+ yun y vn
392
+ yvn y vn
393
+ za z a
394
+ zai z ai
395
+ zan z an
396
+ zang z ang
397
+ zao z ao
398
+ ze z e
399
+ zei z ei
400
+ zen z en
401
+ zeng z eng
402
+ zha zh a
403
+ zhai zh ai
404
+ zhan zh an
405
+ zhang zh ang
406
+ zhao zh ao
407
+ zhe zh e
408
+ zhei zh ei
409
+ zhen zh en
410
+ zheng zh eng
411
+ zhi zh ir
412
+ zhong zh ong
413
+ zhou zh ou
414
+ zhu zh u
415
+ zhua zh ua
416
+ zhuai zh uai
417
+ zhuan zh uan
418
+ zhuang zh uang
419
+ zhui zh ui
420
+ zhun zh un
421
+ zhuo zh uo
422
+ zi z i0
423
+ zong z ong
424
+ zou z ou
425
+ zu z u
426
+ zuan z uan
427
+ zui z ui
428
+ zun z un
429
+ zuo z uo
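The table above maps each pinyin syllable to an initial and a final in the project's Chinese phoneme set (apical vowels are written "ir"/"i0", ü is written "v", and vowel-initial syllables such as a/e/o get the placeholder initials "AA"/"EE"/"OO"). A minimal loading sketch follows (not part of the committed files); splitting on whitespace is an assumption that works for either tab- or space-separated columns.

pinyin_to_symbol_map = {}
with open("text/opencpop-strict.txt", encoding="utf-8") as f:
    for line in f:
        syllable, *phones = line.split()          # e.g. "zhi" -> ["zh", "ir"]
        pinyin_to_symbol_map[syllable] = " ".join(phones)

print(pinyin_to_symbol_map["zhi"])  # "zh ir"
print(pinyin_to_symbol_map["lv"])   # "l v"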
text/symbols.py ADDED
@@ -0,0 +1,187 @@
1
+ punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
+ pu_symbols = punctuation + ["SP", "UNK"]
3
+ pad = "_"
4
+
5
+ # chinese
6
+ zh_symbols = [
7
+ "E",
8
+ "En",
9
+ "a",
10
+ "ai",
11
+ "an",
12
+ "ang",
13
+ "ao",
14
+ "b",
15
+ "c",
16
+ "ch",
17
+ "d",
18
+ "e",
19
+ "ei",
20
+ "en",
21
+ "eng",
22
+ "er",
23
+ "f",
24
+ "g",
25
+ "h",
26
+ "i",
27
+ "i0",
28
+ "ia",
29
+ "ian",
30
+ "iang",
31
+ "iao",
32
+ "ie",
33
+ "in",
34
+ "ing",
35
+ "iong",
36
+ "ir",
37
+ "iu",
38
+ "j",
39
+ "k",
40
+ "l",
41
+ "m",
42
+ "n",
43
+ "o",
44
+ "ong",
45
+ "ou",
46
+ "p",
47
+ "q",
48
+ "r",
49
+ "s",
50
+ "sh",
51
+ "t",
52
+ "u",
53
+ "ua",
54
+ "uai",
55
+ "uan",
56
+ "uang",
57
+ "ui",
58
+ "un",
59
+ "uo",
60
+ "v",
61
+ "van",
62
+ "ve",
63
+ "vn",
64
+ "w",
65
+ "x",
66
+ "y",
67
+ "z",
68
+ "zh",
69
+ "AA",
70
+ "EE",
71
+ "OO",
72
+ ]
73
+ num_zh_tones = 6
74
+
75
+ # japanese
76
+ ja_symbols = [
77
+ "N",
78
+ "a",
79
+ "a:",
80
+ "b",
81
+ "by",
82
+ "ch",
83
+ "d",
84
+ "dy",
85
+ "e",
86
+ "e:",
87
+ "f",
88
+ "g",
89
+ "gy",
90
+ "h",
91
+ "hy",
92
+ "i",
93
+ "i:",
94
+ "j",
95
+ "k",
96
+ "ky",
97
+ "m",
98
+ "my",
99
+ "n",
100
+ "ny",
101
+ "o",
102
+ "o:",
103
+ "p",
104
+ "py",
105
+ "q",
106
+ "r",
107
+ "ry",
108
+ "s",
109
+ "sh",
110
+ "t",
111
+ "ts",
112
+ "ty",
113
+ "u",
114
+ "u:",
115
+ "w",
116
+ "y",
117
+ "z",
118
+ "zy",
119
+ ]
120
+ num_ja_tones = 2
121
+
122
+ # English
123
+ en_symbols = [
124
+ "aa",
125
+ "ae",
126
+ "ah",
127
+ "ao",
128
+ "aw",
129
+ "ay",
130
+ "b",
131
+ "ch",
132
+ "d",
133
+ "dh",
134
+ "eh",
135
+ "er",
136
+ "ey",
137
+ "f",
138
+ "g",
139
+ "hh",
140
+ "ih",
141
+ "iy",
142
+ "jh",
143
+ "k",
144
+ "l",
145
+ "m",
146
+ "n",
147
+ "ng",
148
+ "ow",
149
+ "oy",
150
+ "p",
151
+ "r",
152
+ "s",
153
+ "sh",
154
+ "t",
155
+ "th",
156
+ "uh",
157
+ "uw",
158
+ "V",
159
+ "w",
160
+ "y",
161
+ "z",
162
+ "zh",
163
+ ]
164
+ num_en_tones = 4
165
+
166
+ # combine all symbols
167
+ normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
+ symbols = [pad] + normal_symbols + pu_symbols
169
+ sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
+
171
+ # combine all tones
172
+ num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
+
174
+ # language maps
175
+ language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
+ num_languages = len(language_id_map.keys())
177
+
178
+ language_tone_start_map = {
179
+ "ZH": 0,
180
+ "JP": num_zh_tones,
181
+ "EN": num_zh_tones + num_ja_tones,
182
+ }
183
+
184
+ if __name__ == "__main__":
185
+ a = set(zh_symbols)
186
+ b = set(en_symbols)
187
+ print(sorted(a & b))
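A small sketch (not part of the committed files) of how the tables above can be combined into ids: phones index into the merged symbol list, tones are offset per language via language_tone_start_map, and languages map to integer ids. The specific symbol and tone values here are arbitrary, and the import path assumes the repository root is on sys.path.

from text.symbols import symbols, language_id_map, language_tone_start_map

phone_id = symbols.index("sh")                  # index into the merged, padded symbol list
tone_id = language_tone_start_map["JP"] + 1     # Japanese tone 1, offset past the 6 Chinese tones
lang_id = language_id_map["EN"]                 # 2
print(phone_id, tone_id, lang_id)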
text/tone_sandhi.py ADDED
@@ -0,0 +1,773 @@
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List
15
+ from typing import Tuple
16
+
17
+ import jieba
18
+ from pypinyin import lazy_pinyin
19
+ from pypinyin import Style
20
+
21
+
22
+ class ToneSandhi:
23
+ def __init__(self):
24
+ self.must_neural_tone_words = {
25
+ "麻烦",
26
+ "麻利",
27
+ "鸳鸯",
28
+ "高粱",
29
+ "骨头",
30
+ "骆驼",
31
+ "马虎",
32
+ "首饰",
33
+ "馒头",
34
+ "馄饨",
35
+ "风筝",
36
+ "难为",
37
+ "队伍",
38
+ "阔气",
39
+ "闺女",
40
+ "门道",
41
+ "锄头",
42
+ "铺盖",
43
+ "铃铛",
44
+ "铁匠",
45
+ "钥匙",
46
+ "里脊",
47
+ "里头",
48
+ "部分",
49
+ "那么",
50
+ "道士",
51
+ "造化",
52
+ "迷糊",
53
+ "连累",
54
+ "这么",
55
+ "这个",
56
+ "运气",
57
+ "过去",
58
+ "软和",
59
+ "转悠",
60
+ "踏实",
61
+ "跳蚤",
62
+ "跟头",
63
+ "趔趄",
64
+ "财主",
65
+ "豆腐",
66
+ "讲究",
67
+ "记性",
68
+ "记号",
69
+ "认识",
70
+ "规矩",
71
+ "见识",
72
+ "裁缝",
73
+ "补丁",
74
+ "衣裳",
75
+ "衣服",
76
+ "衙门",
77
+ "街坊",
78
+ "行李",
79
+ "行当",
80
+ "蛤蟆",
81
+ "蘑菇",
82
+ "薄荷",
83
+ "葫芦",
84
+ "葡萄",
85
+ "萝卜",
86
+ "荸荠",
87
+ "苗条",
88
+ "苗头",
89
+ "苍蝇",
90
+ "芝麻",
91
+ "舒服",
92
+ "舒坦",
93
+ "舌头",
94
+ "自在",
95
+ "膏药",
96
+ "脾气",
97
+ "脑袋",
98
+ "脊梁",
99
+ "能耐",
100
+ "胳膊",
101
+ "胭脂",
102
+ "胡萝",
103
+ "胡琴",
104
+ "胡同",
105
+ "聪明",
106
+ "耽误",
107
+ "耽搁",
108
+ "耷拉",
109
+ "耳朵",
110
+ "老爷",
111
+ "老实",
112
+ "老婆",
113
+ "老头",
114
+ "老太",
115
+ "翻腾",
116
+ "罗嗦",
117
+ "罐头",
118
+ "编辑",
119
+ "结实",
120
+ "红火",
121
+ "累赘",
122
+ "糨糊",
123
+ "糊涂",
124
+ "精神",
125
+ "粮食",
126
+ "簸箕",
127
+ "篱笆",
128
+ "算计",
129
+ "算盘",
130
+ "答应",
131
+ "笤帚",
132
+ "笑语",
133
+ "笑话",
134
+ "窟窿",
135
+ "窝囊",
136
+ "窗户",
137
+ "稳当",
138
+ "稀罕",
139
+ "称呼",
140
+ "秧歌",
141
+ "秀气",
142
+ "秀才",
143
+ "福气",
144
+ "祖宗",
145
+ "砚台",
146
+ "码头",
147
+ "石榴",
148
+ "石头",
149
+ "石匠",
150
+ "知识",
151
+ "眼睛",
152
+ "眯缝",
153
+ "眨巴",
154
+ "眉毛",
155
+ "相声",
156
+ "盘算",
157
+ "白净",
158
+ "痢疾",
159
+ "痛快",
160
+ "疟疾",
161
+ "疙瘩",
162
+ "疏忽",
163
+ "畜生",
164
+ "生意",
165
+ "甘蔗",
166
+ "琵琶",
167
+ "琢磨",
168
+ "琉璃",
169
+ "玻璃",
170
+ "玫瑰",
171
+ "玄乎",
172
+ "狐狸",
173
+ "状元",
174
+ "特务",
175
+ "牲口",
176
+ "牙碜",
177
+ "牌楼",
178
+ "爽快",
179
+ "爱人",
180
+ "热闹",
181
+ "烧饼",
182
+ "烟筒",
183
+ "烂糊",
184
+ "点心",
185
+ "炊帚",
186
+ "灯笼",
187
+ "火候",
188
+ "漂亮",
189
+ "滑溜",
190
+ "溜达",
191
+ "温和",
192
+ "清楚",
193
+ "消息",
194
+ "浪头",
195
+ "活泼",
196
+ "比方",
197
+ "正经",
198
+ "欺负",
199
+ "模糊",
200
+ "槟榔",
201
+ "棺材",
202
+ "棒槌",
203
+ "棉花",
204
+ "核桃",
205
+ "栅栏",
206
+ "柴火",
207
+ "架势",
208
+ "枕头",
209
+ "枇杷",
210
+ "机灵",
211
+ "本事",
212
+ "木头",
213
+ "木匠",
214
+ "朋友",
215
+ "月饼",
216
+ "月亮",
217
+ "暖和",
218
+ "明白",
219
+ "时候",
220
+ "新鲜",
221
+ "故事",
222
+ "收拾",
223
+ "收成",
224
+ "提防",
225
+ "挖苦",
226
+ "挑剔",
227
+ "指甲",
228
+ "指头",
229
+ "拾掇",
230
+ "拳头",
231
+ "拨弄",
232
+ "招牌",
233
+ "招呼",
234
+ "抬举",
235
+ "护士",
236
+ "折腾",
237
+ "扫帚",
238
+ "打量",
239
+ "打算",
240
+ "打点",
241
+ "打扮",
242
+ "打听",
243
+ "打发",
244
+ "扎实",
245
+ "扁担",
246
+ "戒指",
247
+ "懒得",
248
+ "意识",
249
+ "意思",
250
+ "情形",
251
+ "悟性",
252
+ "怪物",
253
+ "思量",
254
+ "怎么",
255
+ "念头",
256
+ "念叨",
257
+ "快活",
258
+ "忙活",
259
+ "志气",
260
+ "心思",
261
+ "得罪",
262
+ "张罗",
263
+ "弟兄",
264
+ "开通",
265
+ "应酬",
266
+ "庄稼",
267
+ "干事",
268
+ "帮手",
269
+ "帐篷",
270
+ "希罕",
271
+ "师父",
272
+ "师傅",
273
+ "巴结",
274
+ "巴掌",
275
+ "差事",
276
+ "工夫",
277
+ "岁数",
278
+ "屁股",
279
+ "尾巴",
280
+ "少爷",
281
+ "小气",
282
+ "小伙",
283
+ "将就",
284
+ "对头",
285
+ "对付",
286
+ "寡妇",
287
+ "家伙",
288
+ "客气",
289
+ "实在",
290
+ "官司",
291
+ "学问",
292
+ "学生",
293
+ "字号",
294
+ "嫁妆",
295
+ "媳妇",
296
+ "媒人",
297
+ "婆家",
298
+ "娘家",
299
+ "委屈",
300
+ "姑娘",
301
+ "姐夫",
302
+ "妯娌",
303
+ "妥当",
304
+ "妖精",
305
+ "奴才",
306
+ "女婿",
307
+ "头发",
308
+ "太阳",
309
+ "大爷",
310
+ "大方",
311
+ "大意",
312
+ "大夫",
313
+ "多少",
314
+ "多么",
315
+ "外甥",
316
+ "壮实",
317
+ "地道",
318
+ "地方",
319
+ "在乎",
320
+ "困难",
321
+ "嘴巴",
322
+ "嘱咐",
323
+ "嘟囔",
324
+ "嘀咕",
325
+ "喜欢",
326
+ "喇嘛",
327
+ "喇叭",
328
+ "商量",
329
+ "唾沫",
330
+ "哑巴",
331
+ "哈欠",
332
+ "哆嗦",
333
+ "咳嗽",
334
+ "和尚",
335
+ "告诉",
336
+ "告示",
337
+ "含糊",
338
+ "吓唬",
339
+ "后头",
340
+ "名字",
341
+ "名堂",
342
+ "合同",
343
+ "吆喝",
344
+ "叫唤",
345
+ "口袋",
346
+ "厚道",
347
+ "厉害",
348
+ "千斤",
349
+ "包袱",
350
+ "包涵",
351
+ "匀称",
352
+ "勤快",
353
+ "动静",
354
+ "动弹",
355
+ "功夫",
356
+ "力气",
357
+ "前头",
358
+ "刺猬",
359
+ "刺激",
360
+ "别扭",
361
+ "利落",
362
+ "利索",
363
+ "利害",
364
+ "分析",
365
+ "出息",
366
+ "凑合",
367
+ "凉快",
368
+ "冷战",
369
+ "冤枉",
370
+ "冒失",
371
+ "养活",
372
+ "关系",
373
+ "先生",
374
+ "兄弟",
375
+ "便宜",
376
+ "使唤",
377
+ "佩服",
378
+ "作坊",
379
+ "体面",
380
+ "位置",
381
+ "似的",
382
+ "伙计",
383
+ "休息",
384
+ "什么",
385
+ "人家",
386
+ "亲戚",
387
+ "亲家",
388
+ "交情",
389
+ "云彩",
390
+ "事情",
391
+ "买卖",
392
+ "主意",
393
+ "丫头",
394
+ "丧气",
395
+ "两口",
396
+ "东西",
397
+ "东家",
398
+ "世故",
399
+ "不由",
400
+ "不在",
401
+ "下水",
402
+ "下巴",
403
+ "上头",
404
+ "上司",
405
+ "丈夫",
406
+ "丈人",
407
+ "一辈",
408
+ "那个",
409
+ "菩萨",
410
+ "父亲",
411
+ "母亲",
412
+ "咕噜",
413
+ "邋遢",
414
+ "费用",
415
+ "冤家",
416
+ "甜头",
417
+ "介绍",
418
+ "荒唐",
419
+ "大人",
420
+ "泥鳅",
421
+ "幸福",
422
+ "熟悉",
423
+ "计划",
424
+ "扑腾",
425
+ "蜡烛",
426
+ "姥爷",
427
+ "照顾",
428
+ "喉咙",
429
+ "吉他",
430
+ "弄堂",
431
+ "蚂蚱",
432
+ "凤凰",
433
+ "拖沓",
434
+ "寒碜",
435
+ "糟蹋",
436
+ "倒腾",
437
+ "报复",
438
+ "逻辑",
439
+ "盘缠",
440
+ "喽啰",
441
+ "牢骚",
442
+ "咖喱",
443
+ "扫把",
444
+ "惦记",
445
+ }
446
+ self.must_not_neural_tone_words = {
447
+ "男子",
448
+ "女子",
449
+ "分子",
450
+ "原子",
451
+ "量子",
452
+ "莲子",
453
+ "石子",
454
+ "瓜子",
455
+ "电子",
456
+ "人人",
457
+ "虎虎",
458
+ }
459
+ self.punc = ":,;。?!“”‘’':,;.?!"
460
+
461
+ # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
+ # e.g.
463
+ # word: "家里"
464
+ # pos: "s"
465
+ # finals: ['ia1', 'i3']
466
+ def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
467
+ # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
468
+ for j, item in enumerate(word):
469
+ if (
470
+ j - 1 >= 0
471
+ and item == word[j - 1]
472
+ and pos[0] in {"n", "v", "a"}
473
+ and word not in self.must_not_neural_tone_words
474
+ ):
475
+ finals[j] = finals[j][:-1] + "5"
476
+ ge_idx = word.find("个")
477
+ if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
478
+ finals[-1] = finals[-1][:-1] + "5"
479
+ elif len(word) >= 1 and word[-1] in "的地得":
480
+ finals[-1] = finals[-1][:-1] + "5"
481
+ # e.g. 走了, 看着, 去过
482
+ # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
483
+ # finals[-1] = finals[-1][:-1] + "5"
484
+ elif (
485
+ len(word) > 1
486
+ and word[-1] in "们子"
487
+ and pos in {"r", "n"}
488
+ and word not in self.must_not_neural_tone_words
489
+ ):
490
+ finals[-1] = finals[-1][:-1] + "5"
491
+ # e.g. 桌上, 地下, 家里
492
+ elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
493
+ finals[-1] = finals[-1][:-1] + "5"
494
+ # e.g. 上来, 下去
495
+ elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
496
+ finals[-1] = finals[-1][:-1] + "5"
497
+ # "个" used as a measure word
498
+ elif (
499
+ ge_idx >= 1
500
+ and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
501
+ ) or word == "个":
502
+ finals[ge_idx] = finals[ge_idx][:-1] + "5"
503
+ else:
504
+ if (
505
+ word in self.must_neural_tone_words
506
+ or word[-2:] in self.must_neural_tone_words
507
+ ):
508
+ finals[-1] = finals[-1][:-1] + "5"
509
+
510
+ word_list = self._split_word(word)
511
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
512
+ for i, word in enumerate(word_list):
513
+ # conventional neural in Chinese
514
+ if (
515
+ word in self.must_neural_tone_words
516
+ or word[-2:] in self.must_neural_tone_words
517
+ ):
518
+ finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
519
+ finals = sum(finals_list, [])
520
+ return finals
521
+
522
+ def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
+ # e.g. 看不懂
524
+ if len(word) == 3 and word[1] == "不":
525
+ finals[1] = finals[1][:-1] + "5"
526
+ else:
527
+ for i, char in enumerate(word):
528
+ # "不" before tone4 should be bu2, e.g. 不怕
529
+ if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
+ finals[i] = finals[i][:-1] + "2"
531
+ return finals
532
+
533
+ def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
+ # "一" in number sequences, e.g. 一零零, 二一零
535
+ if word.find("一") != -1 and all(
536
+ [item.isnumeric() for item in word if item != "一"]
537
+ ):
538
+ return finals
539
+ # "一" between reduplication words should be yi5, e.g. 看一看
540
+ elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
+ finals[1] = finals[1][:-1] + "5"
542
+ # when "一" is an ordinal word, it should be yi1
543
+ elif word.startswith("第一"):
544
+ finals[1] = finals[1][:-1] + "1"
545
+ else:
546
+ for i, char in enumerate(word):
547
+ if char == "一" and i + 1 < len(word):
548
+ # "一" before tone4 should be yi2, e.g. 一段
549
+ if finals[i + 1][-1] == "4":
550
+ finals[i] = finals[i][:-1] + "2"
551
+ # "一" before non-tone4 should be yi4, e.g. 一天
552
+ else:
553
+ # if "一" is followed by punctuation, it is still read with tone 1
554
+ if word[i + 1] not in self.punc:
555
+ finals[i] = finals[i][:-1] + "4"
556
+ return finals
557
+
558
+ def _split_word(self, word: str) -> List[str]:
559
+ word_list = jieba.cut_for_search(word)
560
+ word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
561
+ first_subword = word_list[0]
562
+ first_begin_idx = word.find(first_subword)
563
+ if first_begin_idx == 0:
564
+ second_subword = word[len(first_subword) :]
565
+ new_word_list = [first_subword, second_subword]
566
+ else:
567
+ second_subword = word[: -len(first_subword)]
568
+ new_word_list = [second_subword, first_subword]
569
+ return new_word_list
570
+
571
+ def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
572
+ if len(word) == 2 and self._all_tone_three(finals):
573
+ finals[0] = finals[0][:-1] + "2"
574
+ elif len(word) == 3:
575
+ word_list = self._split_word(word)
576
+ if self._all_tone_three(finals):
577
+ # disyllabic + monosyllabic, e.g. 蒙古/包
578
+ if len(word_list[0]) == 2:
579
+ finals[0] = finals[0][:-1] + "2"
580
+ finals[1] = finals[1][:-1] + "2"
581
+ # monosyllabic + disyllabic, e.g. 纸/老虎
582
+ elif len(word_list[0]) == 1:
583
+ finals[1] = finals[1][:-1] + "2"
584
+ else:
585
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
586
+ if len(finals_list) == 2:
587
+ for i, sub in enumerate(finals_list):
588
+ # e.g. 所有/人
589
+ if self._all_tone_three(sub) and len(sub) == 2:
590
+ finals_list[i][0] = finals_list[i][0][:-1] + "2"
591
+ # e.g. 好/喜欢
592
+ elif (
593
+ i == 1
594
+ and not self._all_tone_three(sub)
595
+ and finals_list[i][0][-1] == "3"
596
+ and finals_list[0][-1][-1] == "3"
597
+ ):
598
+ finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
599
+ finals = sum(finals_list, [])
600
+ # split the idiom into two words whose length is 2
601
+ elif len(word) == 4:
602
+ finals_list = [finals[:2], finals[2:]]
603
+ finals = []
604
+ for sub in finals_list:
605
+ if self._all_tone_three(sub):
606
+ sub[0] = sub[0][:-1] + "2"
607
+ finals += sub
608
+
609
+ return finals
610
+
611
+ def _all_tone_three(self, finals: List[str]) -> bool:
612
+ return all(x[-1] == "3" for x in finals)
613
+
614
+ # merge "不" and the word behind it
615
+ # if we don't merge, "不" sometimes appears alone according to jieba, which may cause sandhi errors
616
+ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
+ new_seg = []
618
+ last_word = ""
619
+ for word, pos in seg:
620
+ if last_word == "不":
621
+ word = last_word + word
622
+ if word != "不":
623
+ new_seg.append((word, pos))
624
+ last_word = word[:]
625
+ if last_word == "不":
626
+ new_seg.append((last_word, "d"))
627
+ last_word = ""
628
+ return new_seg
629
+
630
+ # function 1: merge "一" and the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
631
+ # function 2: merge single "一" and the word behind it
632
+ # if we don't merge, "一" sometimes appears alone according to jieba, which may cause sandhi errors
633
+ # e.g.
634
+ # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
+ # output seg: [['听一听', 'v']]
636
+ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
+ new_seg = [] * len(seg)
638
+ # function 1
639
+ i = 0
640
+ while i < len(seg):
641
+ word, pos = seg[i]
642
+ if (
643
+ i - 1 >= 0
644
+ and word == "一"
645
+ and i + 1 < len(seg)
646
+ and seg[i - 1][0] == seg[i + 1][0]
647
+ and seg[i - 1][1] == "v"
648
+ ):
649
+ new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
650
+ i += 2
651
+ else:
652
+ if (
653
+ i - 2 >= 0
654
+ and seg[i - 1][0] == "一"
655
+ and seg[i - 2][0] == word
656
+ and pos == "v"
657
+ ):
658
+ continue
659
+ else:
660
+ new_seg.append([word, pos])
661
+ i += 1
662
+ seg = [i for i in new_seg if len(i) > 0]
663
+ new_seg = []
664
+ # function 2
665
+ for i, (word, pos) in enumerate(seg):
666
+ if new_seg and new_seg[-1][0] == "一":
667
+ new_seg[-1][0] = new_seg[-1][0] + word
668
+ else:
669
+ new_seg.append([word, pos])
670
+ return new_seg
671
+
672
+ # the first and the second words are all_tone_three
673
+ def _merge_continuous_three_tones(
674
+ self, seg: List[Tuple[str, str]]
675
+ ) -> List[Tuple[str, str]]:
676
+ new_seg = []
677
+ sub_finals_list = [
678
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
679
+ for (word, pos) in seg
680
+ ]
681
+ assert len(sub_finals_list) == len(seg)
682
+ merge_last = [False] * len(seg)
683
+ for i, (word, pos) in enumerate(seg):
684
+ if (
685
+ i - 1 >= 0
686
+ and self._all_tone_three(sub_finals_list[i - 1])
687
+ and self._all_tone_three(sub_finals_list[i])
688
+ and not merge_last[i - 1]
689
+ ):
690
+ # if the last word is a reduplication, do not merge, because reduplications need to go through _neural_sandhi
691
+ if (
692
+ not self._is_reduplication(seg[i - 1][0])
693
+ and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
694
+ ):
695
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
696
+ merge_last[i] = True
697
+ else:
698
+ new_seg.append([word, pos])
699
+ else:
700
+ new_seg.append([word, pos])
701
+
702
+ return new_seg
703
+
704
+ def _is_reduplication(self, word: str) -> bool:
705
+ return len(word) == 2 and word[0] == word[1]
706
+
707
+ # the last char of the first word and the first char of the second word are tone_three
708
+ def _merge_continuous_three_tones_2(
709
+ self, seg: List[Tuple[str, str]]
710
+ ) -> List[Tuple[str, str]]:
711
+ new_seg = []
712
+ sub_finals_list = [
713
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
714
+ for (word, pos) in seg
715
+ ]
716
+ assert len(sub_finals_list) == len(seg)
717
+ merge_last = [False] * len(seg)
718
+ for i, (word, pos) in enumerate(seg):
719
+ if (
720
+ i - 1 >= 0
721
+ and sub_finals_list[i - 1][-1][-1] == "3"
722
+ and sub_finals_list[i][0][-1] == "3"
723
+ and not merge_last[i - 1]
724
+ ):
725
+ # if the last word is a reduplication, do not merge, because reduplications need to go through _neural_sandhi
726
+ if (
727
+ not self._is_reduplication(seg[i - 1][0])
728
+ and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
729
+ ):
730
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
731
+ merge_last[i] = True
732
+ else:
733
+ new_seg.append([word, pos])
734
+ else:
735
+ new_seg.append([word, pos])
736
+ return new_seg
737
+
738
+ def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
739
+ new_seg = []
740
+ for i, (word, pos) in enumerate(seg):
741
+ if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
742
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
743
+ else:
744
+ new_seg.append([word, pos])
745
+ return new_seg
746
+
747
+ def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
748
+ new_seg = []
749
+ for i, (word, pos) in enumerate(seg):
750
+ if new_seg and word == new_seg[-1][0]:
751
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
752
+ else:
753
+ new_seg.append([word, pos])
754
+ return new_seg
755
+
756
+ def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
757
+ seg = self._merge_bu(seg)
758
+ try:
759
+ seg = self._merge_yi(seg)
760
+ except:
761
+ print("_merge_yi failed")
762
+ seg = self._merge_reduplication(seg)
763
+ seg = self._merge_continuous_three_tones(seg)
764
+ seg = self._merge_continuous_three_tones_2(seg)
765
+ seg = self._merge_er(seg)
766
+ return seg
767
+
768
+ def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
769
+ finals = self._bu_sandhi(word, finals)
770
+ finals = self._yi_sandhi(word, finals)
771
+ finals = self._neural_sandhi(word, pos, finals)
772
+ finals = self._three_sandhi(word, finals)
773
+ return finals
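An illustrative driver (not part of the committed files) for the ToneSandhi class above, feeding it jieba part-of-speech tags and pypinyin finals in the form its methods expect. The import path and the sample sentence are assumptions.

import jieba.posseg as psg
from pypinyin import Style, lazy_pinyin
from text.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
text = "我们一起去看一看"
seg = [(p.word, p.flag) for p in psg.cut(text)]
seg = sandhi.pre_merge_for_modify(seg)           # merge 不 / 一 / reduplications, etc.
for word, pos in seg:
    finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    print(word, sandhi.modified_tone(word, pos, finals))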