import os
import re
from pathlib import Path

import jieba
from omegaconf import OmegaConf

from ipa.convert_digits import parse_num
from ipa.proc_text import (
    apply_v2f,
    normalize_text,
    prep_regex,
    run_jieba,
    update_jieba_dict,
)
# Load the IPA configuration; words in preserved_list are exempt from v2f
# substitution, so drop them from v2f_dict before building the regexes.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)

delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
)


def get_ipa(raw_text, dialect):
    """Convert raw text into word, IPA, and pinyin sequences for the given dialect."""
    lexicon = ipa_configs["lexicon"][dialect]
    # Rebuild jieba's dictionary from the dialect lexicon so segmentation
    # prefers in-lexicon words.
    update_jieba_dict(
        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    )
    text = normalize_text(raw_text, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    # Collapse every delimiter into a single comma.
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = ",".join(text_parts)

    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment after v2f substitution, since replacements may shift word boundaries.
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not word.strip():
            continue
        if word == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
        elif word not in lexicon:
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(word)
            # NOTE: only the first ipa (and pinyin) entry of lexicon[word] is considered.
            final_pinyin.append(lexicon[word]['pinyin'][0])
            final_ipa.append(lexicon[word]['ipa'][0].replace(" ", "-"))
    # Bail out early when nothing was converted or some words are missing,
    # returning the lists unjoined so the caller can inspect them.
    if len(final_ipa) == 0 or len(missing_words) > 0:
        return final_words, final_ipa, final_pinyin, missing_words

    final_words = " ".join(final_words).replace(" , ", ",")
    final_ipa = " ".join(final_ipa).replace(" , ", ",")
    final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
    return final_words, final_ipa, final_pinyin, missing_words
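
# A minimal usage sketch for get_ipa (illustrative, not actual output). The
# dialect key must be one of the lexicons defined in configs/ipa.yaml; the
# string "taiwanese" below is a placeholder, not a key confirmed by this repo.
#
#   words, ipa, pinyin, missing = get_ipa("你好嗎", "taiwanese")
#   if missing:
#       print("Words not in lexicon:", missing)  # caller should handle these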


def parse_ipa(ipa: str):
    """Split an IPA string into single-character phoneme tokens, with each tone as its own token."""
    text = []
    # Split at every boundary between delimiter characters (",", " ", "-") and
    # phoneme characters, keeping the delimiters as separate tokens.
    ipa_list = re.split(r"(?<![, -])(?=[, -])|(?<=[, -])(?![, -])", ipa)
    for phoneme_with_tone in ipa_list:
        if phoneme_with_tone == " ":
            text.append(phoneme_with_tone)
            continue
        elif phoneme_with_tone == ",":
            text.extend(" , ")
            continue
        elif phoneme_with_tone == "-":
            # "-" joins characters within a word (words themselves are joined
            # by " "), so it is dropped here.
            continue
        split_phoneme_and_tone = phoneme_with_tone.split("_")
        if len(split_phoneme_and_tone) == 2:
            phoneme, tone = split_phoneme_and_tone
            text.extend(phoneme)  # one token per phoneme character
            text.append(tone)  # the tone is a single token
        else:
            text.extend(split_phoneme_and_tone[0])
    return text
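

# A minimal demo of parse_ipa, runnable as a script. The input string below is
# hypothetical; it only assumes get_ipa's output convention ("_" precedes the
# tone, "-" joins characters within a word, " " separates words).
if __name__ == "__main__":
    tokens = parse_ipa("ni_3-hau_3 ma_5")
    # Each phoneme character and each tone becomes its own token:
    # ['n', 'i', '3', 'h', 'a', 'u', '3', ' ', 'm', 'a', '5']
    print(tokens)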