File size: 330 Bytes
751936e
 
 
 
 
 
 
 
428b731
 
 
1
2
3
4
5
6
7
8
9
10
11
12
from transformers import AutoTokenizer
from vocab import TokenizerType

# Module-level tokenizer for the "baichuan-inc/Baichuan-7B" checkpoint.
# trust_remote_code=True allows transformers to execute tokenizer code hosted
# in the model repository (presumably required by this checkpoint -- confirm
# before removing, and consider pinning a revision when doing so).
tokenizer = AutoTokenizer.from_pretrained(
    "baichuan-inc/Baichuan-7B",
    trust_remote_code=True,
)

# Metadata attached to the tokenizer object after loading.
# Original note: byte-bpe, sentencepiece.
tokenizer.type = TokenizerType.ByteBPE

# English gloss of the description string below:
# "Uses Byte-Pair Encoding (BPE) from SentencePiece as the tokenization algorithm."
tokenizer.comments = "使用 SentencePiece 中的 Byte-Pair Encoding (BPE) 作为分词算法"