# Copyright 2024 Hung-Shin Lee (hungshinlee@gmail.com)
# Apache 2.0
"""Text normalisation and jieba word-segmentation helpers."""

import re
from pathlib import Path
from typing import Optional
from unicodedata import normalize

import jieba
import opencc

# 20 is the numeric value of logging.INFO.
jieba.setLogLevel(20)
# Override jieba's default "han" block pattern. The backslashes before "." and
# "-" are written as "\\" so the pattern text is unchanged while avoiding
# invalid string-escape warnings.
jieba.re_han_default = re.compile(
    "([\u2e80-\U000e01efa-zA-Z0-9+#&\\._%\\-']+)", re.U
)

s2tw_converter = opencc.OpenCC("s2tw.json")


def update_jieba_dict(
    lexicon: list,
    jieba_dict_path: Path,
    high_freq_words: Optional[list] = None,
    high_freq_words_weight: int = 10,
) -> list:
    """Rewrite the jieba dictionary file from ``lexicon``.

    Each unique word is written as ``"<word> <frequency>"`` where the
    frequency is ``len(word)``, multiplied by ``high_freq_words_weight`` when
    the word appears in ``high_freq_words``.

    Args:
        lexicon: Words to write; duplicates are removed.
        jieba_dict_path: Destination dictionary file (overwritten).
        high_freq_words: Words whose frequency should be boosted.
        high_freq_words_weight: Multiplier applied to boosted words.

    Returns:
        The sorted, de-duplicated lexicon.
    """
    lexicon = sorted(set(lexicon))
    # Build the lookup set once instead of scanning a list for every word.
    boosted = frozenset(high_freq_words or ())

    jieba_dict_path.unlink(missing_ok=True)
    # Presumably jieba's on-disk cache; removed so stale data is not reused.
    Path("/tmp/jieba.cache").unlink(missing_ok=True)

    with jieba_dict_path.open("w", encoding="utf-8") as file:
        for word in lexicon:
            weight = high_freq_words_weight if word in boosted else 1
            file.write(f"{word} {len(word) * weight}\n")

    # Mark the default tokenizer as uninitialised (presumably so that jieba
    # reloads its dictionary on next use).
    jieba.dt.initialized = False

    return lexicon


def run_jieba(line: str) -> list:
    """Segment ``line`` with jieba (accurate mode, HMM disabled).

    NOTE: when jieba processes multi-line text, the result loses the original
    line structure, so pass one line at a time.
    """
    return list(jieba.cut(line, cut_all=False, HMM=False))


def normalize_text(text: str, replace_dict: dict, replace_regex: str) -> str:
    """Clean, NFKC-normalise, substitute, and upper-case ``text``.

    Steps, in order: strip backspace (U+0008), BOM (U+FEFF) and U+0010;
    apply NFKC normalisation; replace every match of ``replace_regex`` with
    ``replace_dict[match]``; collapse whitespace runs to single spaces and
    upper-case the result.

    Args:
        text: Input string.
        replace_dict: Mapping from matched text to its replacement.
        replace_regex: Alternation pattern matching the keys of
            ``replace_dict``. An empty pattern means "no replacements".

    Returns:
        The normalised string.
    """
    text = re.sub("[\x08\ufeff\u0010]", "", text)
    text = normalize("NFKC", text)
    # An empty pattern matches the empty string at every position, which
    # would look up replace_dict[""] and raise KeyError; skip it instead.
    if replace_regex:
        text = re.sub(replace_regex, lambda m: replace_dict[m.group(0)], text)
    return " ".join(text.split()).upper()


def apply_v2f(word_list: list, v2f_dict: dict, v2f_regex: str) -> list:
    """Replace every ``v2f_regex`` match in each word via ``v2f_dict``.

    Args:
        word_list: Words to process.
        v2f_dict: Mapping from matched text to its replacement.
        v2f_regex: Alternation pattern matching the keys of ``v2f_dict``.
            An empty pattern means "no replacements".

    Returns:
        A new list with the substitutions applied to each word.
    """
    # Same empty-pattern hazard as in normalize_text: nothing to substitute.
    if not v2f_regex:
        return list(word_list)

    pattern = re.compile(v2f_regex)
    return [pattern.sub(lambda m: v2f_dict[m.group(0)], word) for word in word_list]


def prep_regex(
    delimiter_list: list,
    replace_dict: Optional[dict] = None,
    v2f_dict: Optional[dict] = None,
) -> tuple[str, str, str]:
    """Build the alternation patterns used by the helpers in this module.

    Args:
        delimiter_list: Literal delimiter strings.
        replace_dict: Replacement mapping; its keys are escaped and ordered
            longest-first so longer keys win over their prefixes.
        v2f_dict: Mapping whose keys are escaped and joined in dict order.

    Returns:
        ``(delimiter_regex, replace_regex, v2f_regex)``. A pattern is the
        empty string when its source mapping is empty or omitted.
    """
    delimiter_regex = "|".join(map(re.escape, delimiter_list))

    replace_regex = ""
    if replace_dict:
        longest_first = sorted(replace_dict.keys(), key=len, reverse=True)
        replace_regex = "|".join(map(re.escape, longest_first))

    v2f_regex = ""
    if v2f_dict:
        v2f_regex = "|".join(map(re.escape, v2f_dict.keys()))

    return delimiter_regex, replace_regex, v2f_regex