"""
A minimal tokenizer demo.
"""

import json

from vocab.gpt_nexo_20b.tokenizer.tokenizer import HFTokenizer

tokenizer = HFTokenizer("20B_tokenizer.json")

print("vocab_size with added_tokens:", tokenizer.vocab_size)

vocab = tokenizer.vocab
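
# `vocab` maps token string -> id (a plain dict in the HF `tokenizers` API).
# A quick, illustrative peek at the first few entries by id; the slice size
# is arbitrary. json.dumps makes byte-level markers in token strings visible.
for _token, _token_id in sorted(vocab.items(), key=lambda kv: kv[1])[:5]:
    print(_token_id, json.dumps(_token))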


def test_single_token():
    """
    Encode single characters (a single character may be encoded into multiple ids).
    """
    for word in "中国解决方法黑白侗鸩,。!?;":
        encoding = tokenizer.tokenize(word)
        for token_id in encoding:
            # Decoding one id in isolation may yield partial UTF-8 bytes,
            # printed as the replacement character "�".
            decode_str = tokenizer.detokenize([token_id])
            print(word, token_id, decode_str, json.dumps(decode_str))
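

# Why one character can become several ids: the 20B tokenizer is a byte-level
# BPE, so a multi-byte UTF-8 character may be split across tokens. Decoding
# the full id sequence restores the text even when no single id decodes
# cleanly. A minimal round-trip sketch, assuming the byte-level round trip is
# lossless for this text; call it alongside the other tests if desired.
def test_round_trip():
    text = "中国"
    ids = tokenizer.tokenize(text)
    print(ids, [tokenizer.detokenize([i]) for i in ids])
    assert tokenizer.detokenize(ids) == text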


def test_encode():
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个"
    encoding = tokenizer.tokenize(text)
    for token_id in encoding:
        decode_str = tokenizer.detokenize([token_id])
        # id_to_token returns the raw byte-level BPE token string, which can
        # differ from the decoded text (e.g. "Ġ" marks a leading space).
        token = tokenizer.tokenizer.id_to_token(token_id)
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
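

# The "Ġ" space marker in action: in a GPT-2-style byte-level BPE, a token
# that begins a new word carries the leading space inside the token string.
# A sketch; the sample text is arbitrary and assumes it tokenizes with at
# least one space-prefixed token. Call alongside the other tests if desired.
def test_space_marker():
    for i in tokenizer.tokenize(" 一 个"):
        print(tokenizer.tokenizer.id_to_token(i), "->", json.dumps(tokenizer.detokenize([i])))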


if __name__ == "__main__":
    test_single_token()
    test_encode()