Spaces:
Sleeping
Sleeping
# Convert Japanese text to phonemes that are
# compatible with Julius: https://github.com/julius-speech/segmentation-kit
import re | |
import unicodedata | |
from transformers import AutoTokenizer | |
from text import punctuation, symbols | |
try: | |
import MeCab | |
except ImportError as e: | |
raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e | |
from num2words import num2words | |
# Katakana -> phoneme conversion rules.
#
# Each entry has the form "<kana>/<phonemes>". `_makerulemap` splits an entry
# on "/" and files it by the length of <kana> (one or two characters).
# <phonemes> normally starts with a space followed by space-separated phoneme
# symbols; `kata2phoneme` discards everything before the first space.
#
# Some two-letter keys appear more than once (e.g. "ズゥ", "フゥ"). The rule
# maps are plain dicts, so the LAST entry for a key is the one that is used.
_CONVRULES = [
    # Conversion of 2 letters
    "アァ/ a a",
    "イィ/ i i",
    "イェ/ i e",
    "イャ/ y a",
    "ウゥ/ u:",
    "エェ/ e e",
    "オォ/ o:",
    "カァ/ k a:",
    "キィ/ k i:",
    "クゥ/ k u:",
    "クャ/ ky a",
    "クュ/ ky u",
    "クョ/ ky o",
    "ケェ/ k e:",
    "コォ/ k o:",
    "ガァ/ g a:",
    "ギィ/ g i:",
    "グゥ/ g u:",
    "グャ/ gy a",
    "グュ/ gy u",
    "グョ/ gy o",
    "ゲェ/ g e:",
    "ゴォ/ g o:",
    "サァ/ s a:",
    "シィ/ sh i:",
    "スゥ/ s u:",
    "スャ/ sh a",
    "スュ/ sh u",
    "スョ/ sh o",
    "セェ/ s e:",
    "ソォ/ s o:",
    "ザァ/ z a:",
    "ジィ/ j i:",
    "ズゥ/ z u:",
    "ズャ/ zy a",
    "ズュ/ zy u",
    "ズョ/ zy o",
    "ゼェ/ z e:",
    "ゾォ/ z o:",
    "タァ/ t a:",
    "チィ/ ch i:",
    "ツァ/ ts a",
    "ツィ/ ts i",
    "ツゥ/ ts u:",
    "ツャ/ ch a",
    "ツュ/ ch u",
    "ツョ/ ch o",
    "ツェ/ ts e",
    "ツォ/ ts o",
    "テェ/ t e:",
    "トォ/ t o:",
    "ダァ/ d a:",
    "ヂィ/ j i:",
    "ヅゥ/ d u:",
    "ヅャ/ zy a",
    "ヅュ/ zy u",
    "ヅョ/ zy o",
    "デェ/ d e:",
    "ドォ/ d o:",
    "ナァ/ n a:",
    "ニィ/ n i:",
    "ヌゥ/ n u:",
    "ヌャ/ ny a",
    "ヌュ/ ny u",
    "ヌョ/ ny o",
    "ネェ/ n e:",
    "ノォ/ n o:",
    "ハァ/ h a:",
    "ヒィ/ h i:",
    "フゥ/ f u:",
    "フャ/ hy a",
    "フュ/ hy u",
    "フョ/ hy o",
    "ヘェ/ h e:",
    "ホォ/ h o:",
    "バァ/ b a:",
    "ビィ/ b i:",
    "ブゥ/ b u:",
    # The バ row uses ブ + small ya/yu/yo, parallel to the グ and プ rows.
    "ブャ/ by a",
    "ブュ/ by u",
    "ブョ/ by o",
    "ベェ/ b e:",
    "ボォ/ b o:",
    "パァ/ p a:",
    "ピィ/ p i:",
    "プゥ/ p u:",
    "プャ/ py a",
    "プュ/ py u",
    "プョ/ py o",
    "ペェ/ p e:",
    "ポォ/ p o:",
    "マァ/ m a:",
    "ミィ/ m i:",
    "ムゥ/ m u:",
    "ムャ/ my a",
    "ムュ/ my u",
    "ムョ/ my o",
    "メェ/ m e:",
    "モォ/ m o:",
    "ヤァ/ y a:",
    "ユゥ/ y u:",
    "ユャ/ y a:",
    "ユュ/ y u:",
    "ユョ/ y o:",
    "ヨォ/ y o:",
    "ラァ/ r a:",
    "リィ/ r i:",
    "ルゥ/ r u:",
    "ルャ/ ry a",
    "ルュ/ ry u",
    "ルョ/ ry o",
    "レェ/ r e:",
    "ロォ/ r o:",
    "ワァ/ w a:",
    "ヲォ/ o:",
    "ディ/ d i",
    "デェ/ d e:",
    "デャ/ dy a",
    "デュ/ dy u",
    "デョ/ dy o",
    "ティ/ t i",
    "テェ/ t e:",
    "テャ/ ty a",
    "テュ/ ty u",
    "テョ/ ty o",
    "スィ/ s i",
    "ズァ/ z u a",
    "ズィ/ z i",
    "ズゥ/ z u",
    "ズャ/ zy a",
    "ズュ/ zy u",
    "ズョ/ zy o",
    "ズェ/ z e",
    "ズォ/ z o",
    "キャ/ ky a",
    "キュ/ ky u",
    "キョ/ ky o",
    "シャ/ sh a",
    "シュ/ sh u",
    "シェ/ sh e",
    "ショ/ sh o",
    "チャ/ ch a",
    "チュ/ ch u",
    "チェ/ ch e",
    "チョ/ ch o",
    "トゥ/ t u",
    "トャ/ ty a",
    "トュ/ ty u",
    "トョ/ ty o",
    "ドァ/ d o a",
    "ドゥ/ d u",
    "ドャ/ dy a",
    "ドュ/ dy u",
    "ドョ/ dy o",
    "ドォ/ d o:",
    "ニャ/ ny a",
    "ニュ/ ny u",
    "ニョ/ ny o",
    "ヒャ/ hy a",
    "ヒュ/ hy u",
    "ヒョ/ hy o",
    "ミャ/ my a",
    "ミュ/ my u",
    "ミョ/ my o",
    "リャ/ ry a",
    "リュ/ ry u",
    "リョ/ ry o",
    "ギャ/ gy a",
    "ギュ/ gy u",
    "ギョ/ gy o",
    "ヂェ/ j e",
    "ヂャ/ j a",
    "ヂュ/ j u",
    "ヂョ/ j o",
    "ジェ/ j e",
    "ジャ/ j a",
    "ジュ/ j u",
    "ジョ/ j o",
    "ビャ/ by a",
    "ビュ/ by u",
    "ビョ/ by o",
    "ピャ/ py a",
    "ピュ/ py u",
    "ピョ/ py o",
    "ウァ/ u a",
    "ウィ/ w i",
    "ウェ/ w e",
    "ウォ/ w o",
    "ファ/ f a",
    "フィ/ f i",
    "フゥ/ f u",
    "フャ/ hy a",
    "フュ/ hy u",
    "フョ/ hy o",
    "フェ/ f e",
    "フォ/ f o",
    "ヴァ/ b a",
    "ヴィ/ b i",
    "ヴェ/ b e",
    "ヴォ/ b o",
    "ヴュ/ by u",
    # Conversion of 1 letter
    "ア/ a",
    "イ/ i",
    "ウ/ u",
    "エ/ e",
    "オ/ o",
    "カ/ k a",
    "キ/ k i",
    "ク/ k u",
    "ケ/ k e",
    "コ/ k o",
    "サ/ s a",
    "シ/ sh i",
    "ス/ s u",
    "セ/ s e",
    "ソ/ s o",
    "タ/ t a",
    "チ/ ch i",
    "ツ/ ts u",
    "テ/ t e",
    "ト/ t o",
    "ナ/ n a",
    "ニ/ n i",
    "ヌ/ n u",
    "ネ/ n e",
    "ノ/ n o",
    "ハ/ h a",
    "ヒ/ h i",
    "フ/ f u",
    "ヘ/ h e",
    "ホ/ h o",
    "マ/ m a",
    "ミ/ m i",
    "ム/ m u",
    "メ/ m e",
    "モ/ m o",
    "ラ/ r a",
    "リ/ r i",
    "ル/ r u",
    "レ/ r e",
    "ロ/ r o",
    "ガ/ g a",
    "ギ/ g i",
    "グ/ g u",
    "ゲ/ g e",
    "ゴ/ g o",
    "ザ/ z a",
    "ジ/ j i",
    "ズ/ z u",
    "ゼ/ z e",
    "ゾ/ z o",
    "ダ/ d a",
    "ヂ/ j i",
    "ヅ/ z u",
    "デ/ d e",
    "ド/ d o",
    "バ/ b a",
    "ビ/ b i",
    "ブ/ b u",
    "ベ/ b e",
    "ボ/ b o",
    "パ/ p a",
    "ピ/ p i",
    "プ/ p u",
    "ペ/ p e",
    "ポ/ p o",
    "ヤ/ y a",
    "ユ/ y u",
    "ヨ/ y o",
    "ワ/ w a",
    "ヰ/ i",
    "ヱ/ e",
    "ヲ/ o",
    "ン/ N",
    "ッ/ q",
    "ヴ/ b u",
    # No space after "/": `kata2phoneme` emits nothing for this rule.
    "ー/:",
    # Try converting broken text
    "ァ/ a",
    "ィ/ i",
    "ゥ/ u",
    "ェ/ e",
    "ォ/ o",
    "ヮ/ w a",
    # Symbols
    "、/ ,",
    "。/ .",
    "!/ !",
    "?/ ?",
    "・/ ,",
]
# Regex helpers. In the visible part of this file neither is used by active
# code (`_COLON_RX` only appears in a commented-out line of `kata2phoneme`).
_COLON_RX = re.compile(":+")
_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")


def _makerulemap():
    """Build the kana lookup tables from ``_CONVRULES``.

    Every rule is split on "/" into a kana key and a phoneme string, then
    filed by key length. Keys that are neither one nor two characters long
    are dropped. When a key occurs more than once the last rule wins.

    Returns:
        A 2-tuple ``(one_char_rules, two_char_rules)`` of dicts mapping the
        kana key to the raw text that followed "/" in the rule.

    Raises:
        ValueError: if a rule does not contain exactly one "/".
    """
    tables = {1: {}, 2: {}}
    for rule in _CONVRULES:
        kana, phones = rule.split("/")
        table = tables.get(len(kana))
        if table is not None:
            table[kana] = phones
    return tables[1], tables[2]


_RULEMAP1, _RULEMAP2 = _makerulemap()
def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phoneme symbols.

    The text is scanned left to right. At each position a two-character rule
    from ``_RULEMAP2`` is tried first, then a one-character rule from
    ``_RULEMAP1``. A character that matches no rule is passed through
    unchanged as its own list element.

    Args:
        text: Katakana string. Leading and trailing whitespace is stripped.

    Returns:
        A list of strings (not a single string): the phoneme symbols of the
        matched rules, interleaved with any unmatched characters.
    """
    text = text.strip()
    phonemes = []
    pos = 0
    length = len(text)
    # Walk an index instead of re-slicing ``text`` each step, which would copy
    # the remaining string on every iteration.
    while pos < length:
        # A slice shorter than two characters can never be a key of _RULEMAP2,
        # so the final character falls through to the one-character lookup.
        rule = _RULEMAP2.get(text[pos : pos + 2])
        if rule is not None:
            # Rule text starts with a space, so the first split element is
            # empty and is discarded. A rule with no space at all (the "ー"
            # rule) therefore contributes no phonemes.
            phonemes += rule.split(" ")[1:]
            pos += 2
            continue
        rule = _RULEMAP1.get(text[pos])
        if rule is not None:
            phonemes += rule.split(" ")[1:]
            pos += 1
            continue
        # No rule: keep the raw character.
        phonemes.append(text[pos])
        pos += 1
    return phonemes
# Parallel code-point ranges: "ァ".."ン" and "ぁ".."ん" have the same length,
# so they can be zipped into a one-to-one translation table.
_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)


def hira2kata(text: str) -> str:
    """Convert hiragana in *text* to katakana.

    Characters in the range "ぁ".."ん" are mapped to their katakana
    counterparts; everything else is left unchanged. The sequence
    "u + voiced sound mark" is then folded into the single character "ヴ".

    Args:
        text: Input string, possibly mixing hiragana with other characters.

    Returns:
        The converted string.
    """
    text = text.translate(_HIRA2KATATRANS)
    # After translate() any "う" is already "ウ", so the voiced-mark
    # sequence has to be matched in its katakana form.
    return text.replace("ウ゛", "ヴ")
# Tokens copied to the output as-is when the tagger gives no reading.
_SYMBOL_TOKENS = set("・、。?!")
# Tokens dropped from the output when the tagger gives no reading.
_NO_YOMI_TOKENS = set("「」『』―()[][]")
_TAGGER = MeCab.Tagger()


def text2kata(text: str) -> str:
    """Convert text to katakana using the MeCab tagger.

    The tagger output is read line by line up to the "EOS" marker. Each line
    is split on tabs; the first field is taken as the surface word and the
    second as its reading.
    NOTE(review): which field holds the reading depends on the tagger's
    dictionary/output format, which is not visible here — confirm.

    For a token with an empty reading: a small tsu becomes "ッ", tokens in
    ``_NO_YOMI_TOKENS`` are dropped, and anything else (including the
    ``_SYMBOL_TOKENS``) is copied through unchanged.

    Args:
        text: Text to convert.

    Returns:
        The concatenated readings, passed through ``hira2kata``.
    """
    pieces = []
    for line in _TAGGER.parse(text).split("\n"):
        if line == "EOS":
            break
        fields = line.split("\t")
        word = fields[0]
        yomi = fields[1]
        if yomi:
            pieces.append(yomi)
            continue
        # No reading available: fall back on the surface form.
        if word in ("っ", "ッ"):
            pieces.append("ッ")
        elif word in _SYMBOL_TOKENS or word not in _NO_YOMI_TOKENS:
            pieces.append(word)
    return hira2kata("".join(pieces))
# Japanese readings for single characters, looked up per character by
# `japanese_convert_alpha_symbols_to_words` (which lowercases its input first,
# so only lowercase letters are listed).
_ALPHASYMBOL_YOMI = {
    # ASCII symbols
    "#": "シャープ",
    "%": "パーセント",
    "&": "アンド",
    "+": "プラス",
    "-": "マイナス",
    ":": "コロン",
    ";": "セミコロン",
    "<": "小なり",  # written with kanji, unlike the katakana entries
    "=": "イコール",
    ">": "大なり",  # written with kanji, unlike the katakana entries
    "@": "アット",
    # Latin letters
    "a": "エー",
    "b": "ビー",
    "c": "シー",
    "d": "ディー",
    "e": "イー",
    "f": "エフ",
    "g": "ジー",
    "h": "エイチ",
    "i": "アイ",
    "j": "ジェー",
    "k": "ケー",
    "l": "エル",
    "m": "エム",
    "n": "エヌ",
    "o": "オー",
    "p": "ピー",
    "q": "キュー",
    "r": "アール",
    "s": "エス",
    "t": "ティー",
    "u": "ユー",
    "v": "ブイ",
    "w": "ダブリュー",
    "x": "エックス",
    "y": "ワイ",
    "z": "ゼット",
    # Greek letters
    "α": "アルファ",
    "β": "ベータ",
    "γ": "ガンマ",
    "δ": "デルタ",
    "ε": "イプシロン",
    "ζ": "ゼータ",
    "η": "イータ",
    "θ": "シータ",
    "ι": "イオタ",
    "κ": "カッパ",
    "λ": "ラムダ",
    "μ": "ミュー",
    "ν": "ニュー",
    "ξ": "クサイ",
    "ο": "オミクロン",
    "π": "パイ",
    "ρ": "ロー",
    "σ": "シグマ",
    "τ": "タウ",
    "υ": "ウプシロン",
    "φ": "ファイ",
    "χ": "カイ",
    "ψ": "プサイ",
    "ω": "オメガ",
}
# Digits grouped by thousands separators, e.g. "1,234,567".
_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
# Currency sign -> Japanese unit word.
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
# A currency sign immediately followed by an amount ending in a digit.
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
# An integer with an optional decimal part.
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")


def japanese_convert_numbers_to_words(text: str) -> str:
    """Spell out the numbers in *text* as Japanese words.

    Three passes are applied in order:
      1. thousands separators are removed from grouped numbers;
      2. a leading currency sign is moved behind its amount and replaced by
         the unit word from ``_CURRENCY_MAP``;
      3. every remaining number is converted with ``num2words(..., lang="ja")``.

    Args:
        text: Input text.

    Returns:
        The text with numbers replaced by words.
    """

    def strip_separators(match):
        return match.group(0).replace(",", "")

    def unit_after_amount(match):
        sign = match.group(1)
        amount = match.group(2)
        return amount + _CURRENCY_MAP.get(sign, sign)

    def spell_out(match):
        return num2words(match.group(0), lang="ja")

    without_separators = _NUMBER_WITH_SEPARATOR_RX.sub(strip_separators, text)
    with_units = _CURRENCY_RX.sub(unit_after_amount, without_separators)
    return _NUMBER_RX.sub(spell_out, with_units)
def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    """Replace letters and symbols with their Japanese readings.

    The text is lowercased, then each character found in
    ``_ALPHASYMBOL_YOMI`` is replaced by its reading. Other characters are
    kept as they are (in lowercased form).

    Args:
        text: Input text.

    Returns:
        The converted text.
    """
    converted = []
    for ch in text.lower():
        converted.append(_ALPHASYMBOL_YOMI.get(ch, ch))
    return "".join(converted)
def japanese_text_to_phonemes(text: str) -> list:
    """Convert Japanese text to phonemes.

    Pipeline: NFKC normalization -> numbers spelled out as words ->
    katakana reading (``text2kata``) -> phonemes (``kata2phoneme``).

    NOTE: ``japanese_convert_alpha_symbols_to_words`` is not applied in this
    pipeline.

    Args:
        text: Raw Japanese text.

    Returns:
        The list produced by ``kata2phoneme`` (a list of strings, not a
        single string).
    """
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    res = text2kata(res)
    return kata2phoneme(res)
def is_japanese_character(char):
    """Return True if *char* lies in one of the Japanese code-point ranges.

    Args:
        char: A single-character string (it is passed to ``ord``).

    Returns:
        True when the code point falls inside any range listed below,
        otherwise False.
    """
    # Unicode ranges of the Japanese writing system (bounds are inclusive).
    ranges = (
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # Kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # Kanji, Extension A
        (0x20000, 0x2A6DF),  # Kanji, Extension B
        # Further kanji extension ranges can be added here if needed.
    )
    code_point = ord(char)
    return any(low <= code_point <= high for low, high in ranges)
# Punctuation substitutions applied by `replace_punctuation`.
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
}


def replace_punctuation(text):
    """Normalize punctuation and strip unsupported characters.

    First every occurrence of a ``rep_map`` key is replaced by its value.
    Then every character that is neither in the hiragana, katakana or kanji
    ranges below nor one of the ``punctuation`` characters is removed.

    Args:
        text: Input text.

    Returns:
        The cleaned text.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map))
    replaced_text = pattern.sub(lambda m: rep_map[m.group()], text)

    # Escape each punctuation mark before placing it inside the character
    # class; otherwise "-", "]", "^" or "\" would be read as regex syntax.
    kept = r"\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF" + "".join(
        re.escape(p) for p in punctuation
    )
    return re.sub("[^" + kept + "]+", "", replaced_text)
def text_normalize(text):
    """Normalize raw text.

    Applies NFKC normalization, spells out numbers with
    ``japanese_convert_numbers_to_words`` and finally cleans punctuation
    with ``replace_punctuation``.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    normalized = unicodedata.normalize("NFKC", text)
    spelled_out = japanese_convert_numbers_to_words(normalized)
    return replace_punctuation(spelled_out)
def distribute_phone(n_phone, n_word):
    """Split *n_phone* phones as evenly as possible across *n_word* slots.

    Every slot receives ``n_phone // n_word`` phones and the first
    ``n_phone % n_word`` slots receive one extra. This matches handing out
    phones one at a time to the first slot holding the fewest.

    Args:
        n_phone: Number of phones to distribute.
        n_word: Number of slots.

    Returns:
        A list of length ``n_word`` whose entries sum to ``n_phone``.

    Raises:
        ValueError: if there are phones to distribute but no slot for them.
    """
    if n_phone <= 0:
        # Nothing to hand out: every slot stays at zero.
        return [0] * n_word
    if n_word <= 0:
        raise ValueError(
            f"cannot distribute {n_phone} phones over {n_word} words"
        )
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")


def g2p(norm_text):
    """Convert normalized text to phones, tones and a token-to-phone count.

    The text is tokenized with the module-level ``tokenizer``. A token that
    starts with "#" is attached to the preceding group (with every "#"
    removed); any other token opens a new group. Each group is converted to
    phonemes, and the phoneme count is spread over the group's tokens with
    ``distribute_phone``.

    NOTE(review): a first token starting with "#" would hit an empty group
    list and raise IndexError — confirm this cannot happen for real input.

    Args:
        norm_text: Text to convert; the name suggests it has already been
            normalized.

    Returns:
        A tuple ``(phones, tones, word2ph)``:
          * phones: the phoneme list wrapped in a leading and trailing "_";
          * tones: a list of zeros, one per entry of ``phones``;
          * word2ph: phones per token, wrapped in a leading and trailing 1.

    Raises:
        AssertionError: if a produced phoneme is not in ``symbols``.
    """
    tokenized = tokenizer.tokenize(norm_text)

    ph_groups = []
    for token in tokenized:
        if token.startswith("#"):
            ph_groups[-1].append(token.replace("#", ""))
        else:
            ph_groups.append([token])

    phs = []
    per_token_counts = []
    for group in ph_groups:
        phonemes = kata2phoneme(text2kata("".join(group)))
        for phoneme in phonemes:
            assert phoneme in symbols, (group, norm_text, tokenized)
        per_token_counts.extend(distribute_phone(len(phonemes), len(group)))
        phs.extend(phonemes)

    phones = ["_", *phs, "_"]
    tones = [0] * len(phones)
    word2ph = [1, *per_token_counts, 1]
    return phones, tones, word2ph
if __name__ == "__main__":
    # Manual smoke test: normalize a sample sentence, run g2p on it and
    # fetch the matching BERT features.
    from text.japanese_bert import get_bert_feature

    # The module-level ``tokenizer`` is already loaded when the module is
    # imported, so it is not loaded a second time here.
    text = "hello,こんにちは、世界!……"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)
    print(phones, tones, word2ph, bert.shape)