```python
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/llama_tokenizer_ext_zhtw',
    pad_token='<unk>',
    add_bos_token=True,
    add_eos_token=False
)
# vocab size: 36128

print(tokenizer.tokenize('今天天氣真好!'))
# ['▁', '今', '天', '天', '氣', '真', '好', '!']

print(tokenizer.encode('今天天氣真好!'))
# [1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]

print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
# <s>今天天氣真好!
```
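As a quick sanity check, the sketch below (an illustrative example, not part of the original card) verifies the reported vocabulary size via `len(tokenizer)` and decodes with `skip_special_tokens=True` so the `<s>` prefix added by `add_bos_token=True` is stripped from the output.

```python
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/llama_tokenizer_ext_zhtw',
    pad_token='<unk>',
    add_bos_token=True,
    add_eos_token=False
)

# Should match the vocab size noted above (36128).
print(len(tokenizer))

# Decoding with skip_special_tokens=True drops the leading <s> token.
ids = tokenizer.encode('今天天氣真好!')
print(tokenizer.decode(ids, skip_special_tokens=True))  # 今天天氣真好!
```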