Spaces:
Running
Running
File size: 1,177 Bytes
751936e 9495a4f 751936e 9495a4f 751936e d10ecd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
"""
## Instruction special tokens
{"token_id": 29961, "decode_str": "[", "token": "["}
{"token_id": 25580, "decode_str": "INST", "token": "INST"}
{"token_id": 29962, "decode_str": "]", "token": "]"}
{"token_id": 3532, "decode_str": "<<", "token": "▁<<"}
{"token_id": 14816, "decode_str": "SY", "token": "SY"}
{"token_id": 29903, "decode_str": "S", "token": "S"}
{"token_id": 6778, "decode_str": ">>", "token": ">>"}
{"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}
Open question: why are <<SYS>>, <</SYS>>, [INST] and [/INST] not each mapped to a single token id?
"""
import os
from transformers import LlamaTokenizer
from vocab import TokenizerType, TokenizerImpl
# Resolve the bundled tokenizer directory relative to this module's own
# location rather than the process's current working directory.
CURRENT_DIR = os.path.dirname(
    os.path.abspath(__file__)
)
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")

tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

# Attach descriptive metadata to the tokenizer object as plain attributes.
# Implementation reference:
# https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
for _attr, _value in (
    ("parent", ""),
    ("type", TokenizerType.ByteBPE.name),
    ("implementation", TokenizerImpl.SentencePiece.name),
    (
        "comments",
        "split all numbers into individual digits, "
        "and fallback to bytes to decompose unknown UTF-8 characters",
    ),
):
    setattr(tokenizer, _attr, _value)
# Drop the loop temporaries so the module namespace stays as it was.
del _attr, _value
|