# Spaces:
# Running
# Running
"""Load the LLaMA tokenizer stored next to this module and tag it with metadata.

The module-level ``tokenizer`` object is created at import time from the
``tokenizer`` directory that sits beside this file, then annotated with
descriptive attributes (``parent``, ``type``, ``implementation``, ``comments``).
"""

import os

from transformers import LlamaTokenizer

from vocab import TokenizerType, TokenizerImpl

# Resolve paths relative to this file so loading does not depend on the
# process's current working directory.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")

tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

# Descriptive metadata attached directly to the tokenizer instance.
tokenizer.parent = ""
tokenizer.type = TokenizerType.ByteBPE.name
# Reference: https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
tokenizer.implementation = TokenizerImpl.SentencePiece.name
# Parenthesised implicit concatenation instead of a backslash continuation:
# trailing characters after a backslash are a syntax error.
tokenizer.comments = (
    "split all numbers into individual digits, "
    "and fallback to bytes to decompose unknown UTF-8 characters"
)