from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
def get_oov():
    """Collect words that the tokenizer splits into more than one token."""
    with open("oov.txt", "w", encoding="utf-8") as f_out, \
            open("../../vocab.freq.zh.txt", "r", encoding="utf-8") as all_words:
        for line in all_words:
            word, count = line.strip().split("\t")
            # Skip entries containing the replacement character or known noisy segments.
            if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
                continue
            encoding = tokenizer.encode(word)
            if len(encoding.ids) > 1:  # word is not a single token in the vocab
                f_out.write(line)
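
# Illustrative sketch (not part of the original pipeline): how the check above
# behaves for a single word. "天津" is only an example input; whether it gets
# split depends on the contents of 20B_tokenizer_chinese.json.
def _demo_oov_check(word="天津"):
    encoding = tokenizer.encode(word)
    status = "OOV" if len(encoding.ids) > 1 else "in-vocab"
    print(word, encoding.tokens, status)
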
def build_vocab():
    pass
def convert_oov_to_merges():
    """Split each word into merge groups; every merge must combine exactly two parts.
    For example:
        承担 -> 承 担
        天津市 -> 天津 市
        社会保障 -> 社会 保障
        的一部分 -> 的 一部分 -> 一 部分
    """
    all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
    # Keep tokens that occur at least 3 times in the frequency list.
    all_tokens = [token for token, count in all_tokens_and_counts if int(count) > 2]
    len1 = [token for token in all_tokens if len(token) == 1]
    len2 = [token for token in all_tokens if len(token) == 2]
    len3 = [token for token in all_tokens if len(token) == 3]
    len4 = [token for token in all_tokens if len(token) == 4]
    print(len(len1), len(len2), len(len3), len(len4))
    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个"] + len2)
    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
        for token in len1:
            f_out.write(token + "\n")
        for token in len2[:20000]:
            f_out.write(token + "\n")
            # f_out.write(token[0] + " " + token[1] + "\n")
        # for token in len3:
        #     idx = -1
        #     for part in len2:
        #         if part in token:
        #             idx = token.find(part)
        #             break
        #     if idx == -1:
        #         print("not found", token)
        #     elif idx == 0:
        #         f_out.write(token[0] + " " + token[1:] + "\n")
        #     else:
        #         f_out.write(token[:2] + " " + token[2] + "\n")
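
# Hypothetical helper (an assumption, not in the original script): split a
# three-character token around a known two-character piece so the result is a
# merge of two existing units, following the docstring examples (天津市 -> "天津 市").
def _split_three_char(token, known_two_char_tokens):
    for part in known_two_char_tokens:
        idx = token.find(part)
        if idx == 0:
            return token[:2] + " " + token[2]   # known piece is the prefix
        if idx == 1:
            return token[0] + " " + token[1:]   # known piece is the suffix
    return None  # no usable two-character piece found

# Example: _split_three_char("天津市", {"天津"}) would return "天津 市".
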
if __name__ == "__main__":
    get_oov()
    convert_oov_to_merges()