# Spaces: Running
import os

# Select the pure-Python protobuf backend. The variable is assigned before the
# sentencepiece imports below so that it is already in effect when the
# protobuf-generated module gets loaded.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model  # noqa: E402

# Alternative kept for reference: parse a model file directly into a
# ModelProto message instead of going through SentencePieceProcessor.
# sp = sp_pb2_model.ModelProto()
# # sp.ParseFromString(open("zh_corpus.unigram.model", "rb").read())
# sp.ParseFromString(open("zh_corpus.bpe.user_defined_symbols.model", "rb").read())

import sentencepiece as spm  # noqa: E402

# Load the tokenizer model from a path relative to the working directory.
sp = spm.SentencePieceProcessor(model_file="Baichuan2-7B-Chat/tokenizer.model")

# Encode each sample twice -- once as string pieces, once as integer ids --
# and print both results side by side for manual inspection.
for text in ["汉堡王", "汉", "堡", "sfds<|USER|>ss</s><Rhino>", "<reserved_87254>", "<reserved_928>"]:
    result_str = sp.encode(text, out_type=str)
    result_int = sp.encode(text, out_type=int)
    print(result_str, result_int)