"""Convert raw hanzi text (optionally mixed with numeric-tone romanization
such as "ni3") into IPA / pinyin transcriptions, driven by configs/ipa.yaml.

NOTE(review): this module was recovered from a whitespace-mangled copy; the
token-splitting regex in ``parse_ipa`` was lost in transit and has been
reconstructed as a plain whitespace split — confirm against the original.
"""

import os
import re
from pathlib import Path

import jieba
from omegaconf import OmegaConf

from ipa.convert_digits import parse_num
from ipa.proc_text import (
    apply_v2f,
    normalize_text,
    prep_regex,
    run_jieba,
    update_jieba_dict,
)

# Load the IPA configuration once at import time.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))

# Words on the preserved list must never be rewritten by the v2f
# (variant-to-formal) substitution, so drop them from the mapping.
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)

delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
)


def get_ipa(raw_text: str, dialect: str) -> tuple[str, str, str, list[str]]:
    """Transcribe *raw_text* into words / IPA / pinyin for *dialect*.

    *raw_text* may interleave hanzi with inline numeric-tone romanization
    (e.g. "ni3"); romanized syllables are looked up directly, hanzi runs go
    through :func:`parse_hanzi_to_ipa`.

    Returns ``(words, ipa, pinyin, missing_words)``.  On full success the
    first three values are space-joined strings (with `` , `` collapsed to
    ``,``); when nothing was transcribed or any word is missing from the
    lexicon they are returned as the partial lists instead, so callers
    should check ``missing_words`` before using them.
    """
    # Keep romanized syllables ("ni3") as their own tokens in the split.
    pinyin_split = re.split(r"([a-z]+\d+)", raw_text)
    final_words = []
    final_pinyin = []
    final_ipa = []
    final_missing_words = []
    for hanzi_or_pinyin in pinyin_split:
        if len(hanzi_or_pinyin.strip()) == 0:
            continue
        if re.search(r"[a-z]+\d+", hanzi_or_pinyin):
            # Romanized syllable: split into letters + optional tone digits.
            final_words.append(hanzi_or_pinyin)
            final_pinyin.append(hanzi_or_pinyin)
            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
            tone = f"_{tone}" if tone else ""
            ipa = parse_pinyin_to_ipa(pinyin)
            if ipa is None:
                # Unknown syllable: recorded as missing; note the word and
                # pinyin entries above were already appended without an ipa.
                final_missing_words.append(pinyin)
                continue
            final_ipa.append(ipa + tone)
        else:
            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(
                hanzi_or_pinyin, dialect
            )
            final_words.extend(words)
            final_ipa.extend(ipa)
            final_pinyin.extend(pinyin)
            final_missing_words.extend(missing_words)

    # Early exit with the raw lists so the caller can inspect what failed.
    if len(final_ipa) == 0 or len(final_missing_words) > 0:
        return final_words, final_ipa, final_pinyin, final_missing_words

    final_words = " ".join(final_words).replace(" , ", ",")
    final_ipa = " ".join(final_ipa).replace(" , ", ",")
    final_pinyin = " ".join(final_pinyin).replace(" , ", ",")
    return final_words, final_ipa, final_pinyin, final_missing_words


def parse_ipa(
    ipa: str, delete_chars: str = r"\+\-\|\_", as_space: str = ""
) -> list[str]:
    """Explode an IPA string into a flat list of single characters.

    Characters matching the *as_space* character class are replaced with
    spaces, characters matching *delete_chars* (regex character-class
    syntax; raw string — same value as the previous non-raw default, minus
    the invalid-escape warning) are removed, and each ``","`` is padded to
    `` , `` before every token is extended character by character.

    NOTE(review): the original split pattern was garbled in the source this
    was recovered from; splitting on runs of whitespace is a reconstruction
    — TODO confirm against the upstream file.
    """
    text = []
    ipa_list = re.split(r"\s+", ipa)
    for word in ipa_list:
        if len(as_space) > 0:
            word = re.sub(r"[{}]".format(as_space), " ", word)
        if len(delete_chars) > 0:
            word = re.sub(r"[{}]".format(delete_chars), "", word)
        word = word.replace(",", " , ")
        # str is iterable: extend() appends one character at a time.
        text.extend(word)
    return text


def parse_pinyin_to_ipa(pinyin: str) -> str | None:
    """Look up a toneless *pinyin* syllable in the pinyin→IPA table.

    Returns the IPA segments joined with ``+`` (spaces within a segment
    become ``-``), or ``None`` when the syllable is not in the table.
    """
    if pinyin not in ipa_configs["pinyin_to_ipa_dict"]:
        return None
    ipa_dict_result = ipa_configs["pinyin_to_ipa_dict"][pinyin]
    ipa = "+".join(ipa_dict_result).replace(" ", "-")
    return ipa


def parse_hanzi_to_ipa(
    hanzi: str, dialect: str
) -> tuple[list[str], list[str], list[str], list[str]]:
    """Segment *hanzi* with jieba and transcribe it via the *dialect* lexicon.

    Returns ``(words, ipa, pinyin, missing_words)``.  Words absent from the
    lexicon are appended to ``words`` and ``missing_words`` but get no
    ipa/pinyin entry; ``","`` delimiters are carried through all three lists.
    """
    lexicon = ipa_configs["lexicon"][dialect]
    # Point jieba's dictionary at the lexicon so segmentation can only
    # produce words we know how to transcribe.
    update_jieba_dict(
        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    )

    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    # Collapse configured delimiters into a single "," between segments.
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = ",".join(text_parts)

    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment: the v2f substitution may have changed the character
    # sequence, invalidating the previous segmentation.
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not bool(word.strip()):
            continue
        if word == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
        elif word not in lexicon:
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(f"{word}")
            final_pinyin.append(lexicon[word]["pinyin"][0])
            # NOTE: only the first ipa entry of lexicon[word] is considered.
            final_ipa.append(lexicon[word]["ipa"][0])

    return final_words, final_ipa, final_pinyin, missing_words