""" |
|
The simplest tokenizer.
"""

import json

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("20B_tokenizer.json")

print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))

vocab = tokenizer.get_vocab()
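# Note: get_vocab() returns a dict mapping token strings to ids; `vocab` is not used
# below, but it is handy for interactive inspection.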


def to_unicode(text):
    # Render each character of `text` as an escaped \uXXXX code point.
    return ''.join(r'\u{:04X}'.format(ord(ch)) for ch in text)
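# Example: to_unicode("中") returns the escaped string r"\u4E2D"
# (U+4E2D is the code point of "中").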


def is_UTF_8(token):
    # Treat each character's code point as a byte value and check whether the whole
    # string forms a valid UTF-8 byte sequence.
    remain = 0  # number of continuation bytes still expected
    for ch in token:
        byte = ord(ch)
        if remain == 0:
            if (byte & 0x80) == 0x00:    # 0xxxxxxx: single-byte (ASCII) character
                remain = 0
            elif (byte & 0xE0) == 0xC0:  # 110xxxxx: leading byte of a 2-byte sequence
                remain = 1
            elif (byte & 0xF0) == 0xE0:  # 1110xxxx: leading byte of a 3-byte sequence
                remain = 2
            elif (byte & 0xF8) == 0xF0:  # 11110xxx: leading byte of a 4-byte sequence
                remain = 3
            else:
                return False
        else:
            if (byte & 0xC0) != 0x80:    # continuation bytes must match 10xxxxxx
                return False
            remain -= 1
    return remain == 0
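# Expected behaviour when each character's code point is read as a byte value, e.g.:
#   is_UTF_8("\xe4\xb8\xad")  -> True   (complete 3-byte sequence: UTF-8 for "中")
#   is_UTF_8("\xe4\xb8")      -> False  (truncated in the middle of a multi-byte character)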


def test_reverse():
    # Re-encode every token string and log the ones that do not round-trip to their own id.
    with open("reverse.jsonl", "w", encoding="utf-8") as f_out:
        for token_id in range(tokenizer.get_vocab_size(with_added_tokens=False)):
            token = tokenizer.id_to_token(token_id)
            print(token_id, is_UTF_8(token))
            if "Ġ" in token:  # skip tokens that contain the byte-level space marker
                continue

            encoding = tokenizer.encode(token)
            if len(encoding.ids) > 1 or encoding.ids[0] != token_id:
                f_out.write(json.dumps({"id": token_id, "token": token, "encoding": encoding.ids, "is_utf8": is_UTF_8(token), "isalpha": token.isalpha()}) + "\n")


def test_single_token():
    """
    Encode single characters (one character may be encoded into multiple ids).
    """
    for word in "发大厦三分赛中国解决方法黑白侗鸩,。!?;ĠABC":
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token), token.encode("utf-8"), bytes(token, "utf-8"), to_unicode(token))


def test_long_token():
    """
    Encode long tokens, e.g. separator/comment lines made of repeated dashes.
    """
    words = [
        "//----------------------------------------------------------------",
        "--------------------------",
        "-------------------------",
        "-----------------------",
    ]
    for word in words:
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))


def test_encode():
    # Encode a longer mixed text and print each token id with its decoded string.
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个"
    encoding = tokenizer.encode(text)
    for token_id in encoding.ids:
        decode_str = tokenizer.decode([token_id])
        token = tokenizer.id_to_token(token_id)
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))


test_reverse()
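# The other checks defined above can be run the same way when needed:
# test_single_token()
# test_long_token()
# test_encode()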