|
|
|
""" |
|
|
|
|
|
""" |
|
|
|
import os |
|
from transformers import LlamaTokenizer |
|
from vocab import TokenizerType, TokenizerImpl |
|
|
|
# Absolute directory of this module; the tokenizer files are resolved
# relative to it so loading does not depend on the working directory.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")

# Load the tokenizer from the local "tokenizer" directory next to this file.
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

# Metadata attributes set on the loaded instance.
# NOTE(review): the consumers of these attributes are outside this file.
tokenizer.parent = ""
tokenizer.type = TokenizerType.ByteBPE.name
tokenizer.implementation = TokenizerImpl.SentencePiece.name
# Parenthesised implicit concatenation keeps both halves of the sentence in
# one expression; a backslash continuation silently breaks if anything
# (even a blank line) ends up between the two literals.
tokenizer.comments = (
    "split all numbers into individual digits, "
    "and fallback to bytes to decompose unknown UTF-8 characters"
)
|
|