from typing import List | |
import jieba | |
import pypinyin | |
from .pinyinToPhonemes import PINYIN_DICT | |
def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Convert ``text`` to a flat list of pinyin tokens.

    ``pypinyin.pinyin`` is called with the TONE3 style, ``heteronym=False``
    and ``neutral_tone_with_five=True``. It returns a list of lists, which is
    flattened here into a single list of strings in the original order.

    Args:
        text: Text to convert.

    Returns:
        The flattened list of strings produced by ``pypinyin.pinyin``.
    """
    nested_readings = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    flat_tokens: List[str] = []
    for readings in nested_readings:
        flat_tokens.extend(readings)
    return flat_tokens
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one tone-suffixed pinyin token to its phoneme string plus tone.

    The last character of ``pinyin`` is taken as the tone marker and the rest
    as the syllable. The syllable is looked up in ``PINYIN_DICT`` and the
    first entry of the matching value is used; a syllable that is not in the
    dictionary contributes an empty string, so only the tone is returned.

    Args:
        pinyin: A non-empty token such as a syllable followed by a tone digit.

    Returns:
        The phoneme string for the syllable followed by the tone character.

    Raises:
        IndexError: If ``pinyin`` is an empty string.
    """
    tone_marker = pinyin[-1]
    # Fall back to a one-element list holding "" so the [0] lookup below
    # still works for syllables missing from the dictionary.
    candidates = PINYIN_DICT.get(pinyin[:-1], [""])
    return candidates[0] + tone_marker
def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str:
    """Convert Chinese text to a separator-joined string of phoneme symbols.

    The text is segmented with ``jieba.cut`` (``HMM=False``), the segments are
    re-joined with single spaces, and the result is converted to pinyin tokens
    by ``_chinese_character_to_pinyin``. Each token is then handled as follows:

    * a token that ends in a tone digit (``1``-``5``) and whose remaining part
      is a key of ``PINYIN_DICT`` is converted with
      ``_chinese_pinyin_to_phoneme``;
    * any other token (punctuation, spaces, numbers, Latin text, ...) is
      passed through unchanged.

    In both cases the resulting string is split into single characters, and
    all characters are joined with ``seperator``.

    Args:
        text: Input text.
        seperator: String inserted between output characters. The misspelled
            name is kept so existing keyword callers continue to work.

    Returns:
        The phoneme/character sequence joined by ``seperator``; an empty
        string when no tokens are produced.
    """
    segments = jieba.cut(text, HMM=False)
    pinyin_tokens: List[str] = _chinese_character_to_pinyin(" ".join(segments))

    symbols: List[str] = []
    for token in pinyin_tokens:
        if not token:
            # Nothing to emit; also avoids indexing into an empty string.
            continue
        # Checking only the trailing digit is not enough: a pass-through token
        # such as " 2023" or "mp3" also ends in 1-5 and would be reduced to
        # that last digit. Requiring a known syllable keeps such tokens whole.
        is_pinyin = token[-1] in "12345" and token[:-1] in PINYIN_DICT
        if is_pinyin:
            symbols.extend(_chinese_pinyin_to_phoneme(token))
        else:  # punctuation or any other non-pinyin token
            symbols.extend(token)

    return seperator.join(symbols)