from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")


def get_oov():
    """Write every word from the frequency list that the tokenizer encodes as
    more than one token (i.e. not covered as a single unit) to oov.txt."""
    f_out = open("oov.txt", "w", encoding="utf-8")
    all_words = open("../../vocab.freq.zh.txt", "r", encoding="utf-8")
    for line in all_words:
        word, count = line.strip().split("\t")
        # Skip mis-encoded entries and a small manual blacklist of noisy words.
        if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
            continue

        encoding = tokenizer.encode(word)
        # More than one token id means the word is out-of-vocabulary as a single unit.
        if len(encoding.ids) > 1:
            f_out.write(line)

    f_out.close()
    all_words.close()


def build_vocab():
    # Placeholder: not implemented yet.
    pass

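
# A possible sketch (an assumption, not the original implementation) of what
# build_vocab might eventually do: register the characters/words collected in
# oov.add.txt (written by convert_oov_to_merges() below) as new tokens and save
# an extended tokenizer. Tokenizer.add_tokens and Tokenizer.save are standard
# `tokenizers` APIs; the output file name here is made up.
def build_vocab_sketch():
    new_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8") if line.strip()]
    added = tokenizer.add_tokens(new_tokens)
    print("added", added, "tokens")
    tokenizer.save("20B_tokenizer_chinese.extended.json")

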
def convert_oov_to_merges():
    """Split each word into merge groups, where each merge pairs exactly two parts,
    for example:
        承担 -> 承 担
        天津市 -> 天津 市
        社会保障 -> 社会 保障
        的一部分 -> 的 一部分 -> 一 部分
    """
    # Keep tokens seen more than twice and bucket them by character length.
    all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))

    # Only single characters and the first 20,000 two-character words are written out.
    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
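

# A minimal sketch (an assumption, not the author's method) of the pairing that
# the docstring of convert_oov_to_merges describes: find a cut point where both
# halves already encode to a single token, so the word can be expressed as one
# binary merge. The helper name split_into_merge_pair is made up.
def split_into_merge_pair(word):
    """Return (left, right) with both halves encoding to a single token if possible;
    otherwise fall back to splitting off the first character."""
    for cut in range(1, len(word)):
        left, right = word[:cut], word[cut:]
        if len(tokenizer.encode(left).ids) == 1 and len(tokenizer.encode(right).ids) == 1:
            return left, right
    return word[:1], word[1:]

# For example, "天津市" might come back as ("天津", "市") if both halves are
# single tokens in the current vocabulary.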


if __name__ == "__main__":
    get_oov()
    convert_oov_to_merges()