Spaces:
Running
Running
File size: 598 Bytes
0ce6477 f4973d4 0ce6477 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
"""Compare raw SentencePiece tokenization with HuggingFace's LlamaTokenizer.

Encodes the same text with both tokenizers and prints the resulting ids.
The HF wrapper output differs from the raw SentencePiece output by a
leading special-token id (1 — presumably BOS; confirm against the
tokenizer config), as the two example id lists below show.
"""
import sentencepiece as spm
from transformers import LlamaTokenizer

# NOTE(review): an earlier revision used text = "nice job 华为手机"; the
# example id lists in the comments below were produced from that string,
# not from the current one — regenerate them if exact values matter.
text = "世界上最高的山是哪座山?"  # "Which is the highest mountain in the world?"

# Raw SentencePiece: encodes the text with no special tokens added.
# Example (for "nice job 华为手机"): [7575, 4982, 29871, 31266, 30573, 30880, 31429]
tokenizer = spm.SentencePieceProcessor(model_file="../tokenizer/tokenizer.model")
tokens = tokenizer.encode(text)
print(tokens)

# HuggingFace LlamaTokenizer over the same underlying model: same ids,
# prefixed with id 1.
# Example (for "nice job 华为手机"): [1, 7575, 4982, 29871, 31266, 30573, 30880, 31429]
tokenizer = LlamaTokenizer.from_pretrained("tokenizer")
tokens = tokenizer.encode(text)
print(tokens)
|