Spaces:
Running
Running
File size: 689 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import json
from tokenization import make_tokenizer

# Number of extra sentinel tokens to add to the vocabulary (0 = none).
add_sentinel_token = 0

# Build the GLM Chinese sentencepiece tokenizer from the local model file.
# NOTE(review): "50048" is passed as a string — presumably the vocab size;
# confirm whether make_tokenizer expects an int here.
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)

# Dump every token id as "<id>\t<decoded string>" lines.  `with` guarantees
# the output file is flushed and closed even if decoding raises part-way.
with open("glm_chinese.vocab.txt", "w", encoding="utf-8") as f_out:
    for idx in range(tokenizer.num_tokens):
        try:
            # Only the decode call is expected to fail (e.g. on special or
            # command tokens), so keep the try body minimal.
            decode_str = tokenizer.DecodeIds([idx])
        except Exception as e:
            # Report undecodable ids and keep going rather than abort the dump.
            print(idx, e)
        else:
            f_out.write("%d\t%s\n" % (idx, decode_str))
|