"""
Explore the vocabulary and Chinese tokenization behaviour of the MOSS (moss-moon-003-sft) tokenizer.

vocab size: 106029
Chinese characters: 54230, Chinese punctuation marks: 549

The MOSS tokenizer behaves strangely.
"""
|
|
|
import json

from transformers import AutoTokenizer
|
|
|
|
|
|
|
# Load the MOSS tokenizer; trust_remote_code is required for its custom tokenizer class.
tokenizer = AutoTokenizer.from_pretrained("moss-moon-003-sft", trust_remote_code=True)

print("vocab size:", tokenizer.vocab_size)

# Encode a Chinese character followed by the <eoc> special token, then decode the
# sequence as a whole and token by token to see what each id maps back to.
tokens = tokenizer.encode("中<eoc>")
decode_line = tokenizer.decode(tokens)
print("decoded:", decode_line)
for token in tokens:
    print(token, tokenizer.decode([token]))
|
|
|
|
|
def test1():
    # Encode a single Chinese character and inspect the raw token strings.
    word = "中"
    token_ids = tokenizer.encode(word)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print(tokens)
    # Code points of the characters in the first token string, to see whether the
    # token is the character itself or byte-level placeholder characters.
    print([ord(k) for k in tokens[0]])
    # Reassemble the token strings back into readable text.
    decode_str = tokenizer.convert_tokens_to_string(tokens)
    print(decode_str)
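

# A sketch, not part of the original script: it assumes the MOSS tokenizer uses
# GPT-2-style byte-level BPE, where each token string is built from placeholder
# unicode characters that stand for raw UTF-8 bytes. If that assumption holds,
# inverting GPT-2's bytes_to_unicode() table should recover the original text
# from the token strings printed by test1().
def test_byte_roundtrip():
    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

    unicode_to_byte = {ch: b for b, ch in bytes_to_unicode().items()}
    token_strings = tokenizer.convert_ids_to_tokens(tokenizer.encode("中"))
    joined = "".join(token_strings)
    if all(ch in unicode_to_byte for ch in joined):
        raw = bytes(unicode_to_byte[ch] for ch in joined)
        # Expect b'\xe4\xb8\xad' and "中" if the byte-level assumption is correct.
        print(raw, raw.decode("utf-8", errors="replace"))
    else:
        print("token strings are not plain byte-level placeholders:", token_strings)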
|
|
|
def test_token():
    # For a few Chinese words, a rare character, and punctuation marks, show how each
    # one splits into token ids and what every id decodes to, both as plain text and
    # as a raw token string (json.dumps makes invisible/escaped characters explicit).
    for word in "中国解决方法黑白侗,。!?;":
        encoding = tokenizer.encode(word)
        for token_id in encoding:
            decode_str = tokenizer.decode([token_id])
            token = tokenizer.convert_ids_to_tokens([token_id])
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
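

# A sketch of how counts like the ones quoted in the module docstring might be
# reproduced. The exact counting rule the author used is an assumption here: this
# version decodes every id in the base vocab and counts tokens that consist only of
# CJK ideographs, and tokens that consist only of common Chinese punctuation marks.
def count_chinese_vocab():
    chinese_punct = set(",。!?;:、“”‘’《》()【】…")
    n_han, n_punct = 0, 0
    for token_id in range(tokenizer.vocab_size):
        text = tokenizer.decode([token_id])
        if text and all("\u4e00" <= ch <= "\u9fff" for ch in text):
            n_han += 1
        elif text and all(ch in chinese_punct for ch in text):
            n_punct += 1
    print("CJK-only tokens:", n_han, "Chinese-punctuation tokens:", n_punct)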
|
|
|
|
|
|
|
|
|
test1()