import json

from tokenizers import Tokenizer

# Load the serialized tokenizer and report its vocabulary size,
# with and without the added (special) tokens.
tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")

print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))

def test_token():
    """
    Encode a sample string character by character and print, for each token id,
    its decoded form and the raw vocab token.
    """
    text = " \t\n中国解决方法黑白侗鸩玥,。!"
    # text = open("../../data_sample/EBKE20150806001_epub_30198917_30198917.txt", "r", encoding="utf-8").readline()
    encoding = tokenizer.encode(text)
    decoding = tokenizer.decode(encoding.ids)
    print(decoding)
    for word in text:
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))

def test_encode():
    """Encode a sentence containing <|endoftext|> and rare characters, then inspect each token."""
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉"
    encoding = tokenizer.encode(text)
    print(tokenizer.decode(encoding.ids))
    for token_id in encoding.ids:
        decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
        token = tokenizer.id_to_token(token_id)
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))

def test_decode():
    """Decode a fixed list of token ids, keeping special tokens in the output."""
    encoding = [30903, 20287, 20005, 52300, 25949, 30329, 50039, 31949, 25538,
                34698, 18764, 5225, 53915, 163, 223]
    decode_str = tokenizer.decode(encoding, skip_special_tokens=False)
    print(decode_str)
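
# Hypothetical helper, not part of the original script: a minimal round-trip
# sketch assuming the same module-level `tokenizer`. Chinese text should survive
# encode -> decode unchanged when byte-level coverage is complete; lossy
# characters typically surface as the replacement character "\ufffd".
def test_roundtrip(text="今天天气怎么样?"):
    ids = tokenizer.encode(text).ids
    restored = tokenizer.decode(ids)
    print("round-trip ok:", restored == text, json.dumps(restored))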

if __name__ == "__main__":
    # test_token()
    test_encode()
    # test_decode()