Spaces:
Running
Running
File size: 486 Bytes
f4973d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm
# Location of the serialized SentencePiece model this script inspects.
MODEL_PATH = "Baichuan2-7B-Chat/tokenizer.model"

# Parse the tokenizer file into a ModelProto so its pieces can be examined
# one by one.
baichuan_spm = sp_pb2_model.ModelProto()
# Read through a context manager so the file handle is closed as soon as the
# bytes are loaded instead of lingering until garbage collection.
with open(MODEL_PATH, "rb") as model_file:
    baichuan_spm.ParseFromString(model_file.read())

# Report every piece whose text contains "reser", together with its index in
# the piece list.
# NOTE(review): presumably this targets reserved placeholder tokens — confirm
# the substring is as broad/narrow as intended.
for i, piece in enumerate(baichuan_spm.pieces):
    if "reser" in piece.piece:
        # Flatten the piece's string form onto one line for compact output.
        print(i, str(piece).strip().replace("\n", ", "))