"""
merge 是干嘛的?
## 结果
共merge 4357 个 token
"""
import json
from tokenizers import Tokenizer
from data_sample.oov_base import jd_vocab_tokens
from zhon.hanzi import punctuation as zh_punc
def load_base_tokenizer():
    old_vocab_path = "../gpt_nexo_20b/20B_tokenizer.json"
    data = json.load(open(old_vocab_path, "r", encoding="utf-8"))
    tokenizer = Tokenizer.from_file(old_vocab_path)
    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
    return data, tokenizer
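
# data is the raw tokenizer.json content: model.vocab maps token string -> id,
# and model.merges stores the BPE merge rules as "A B" strings (the format the
# appends below rely on), so both can be extended in place and written back out.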
data, base_tokenizer = load_base_tokenizer()
vocab = data["model"]["vocab"]
merges = data["model"]["merges"]
vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
"""
方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
以下采用方式一。
"""
new_added_tokens = set()
for word in jd_vocab_tokens + list(zh_punc):
    # Only handle single characters that have not been processed yet.
    if len(word) > 1 or word in new_added_tokens:
        continue
    encoding = base_tokenizer.encode(word)
    # if len(encoding.ids) > 1:
    if len(encoding.ids) == 2:  # how should characters that split into 3 ids be handled?
        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
        print("merging", vocab_size, word, json.dumps(tokens))
        # Register the merged token in the vocab and append the matching merge rule.
        vocab["".join(tokens)] = vocab_size
        vocab_size += 1
        merges.append(" ".join(tokens))
        new_added_tokens.add(word)
print("共merge %d 个 token" % (len(new_added_tokens)))
with open("20B_tokenizer_chinese_2.json", "w", encoding="utf-8") as f_out:
    json.dump(data, f_out, indent=2)
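
# Sanity check (a sketch, assuming the appended merge rules are picked up when
# the modified file is re-loaded): every character collected in new_added_tokens
# should now encode to a single id.
new_tokenizer = Tokenizer.from_file("20B_tokenizer_chinese_2.json")
still_split = [w for w in new_added_tokens if len(new_tokenizer.encode(w).ids) != 1]
print("characters still split into multiple tokens:", len(still_split))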