File size: 636 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

"""


"""

import os
from transformers import LlamaTokenizer
from vocab import TokenizerType, TokenizerImpl

# Resolve the tokenizer assets relative to this module's own location, so the
# lookup does not depend on the process's current working directory.
CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")

# Load the tokenizer from the sibling "tokenizer" directory.
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

# Descriptive metadata attached directly to the loaded tokenizer object.
tokenizer.parent = ""
tokenizer.type = TokenizerType.ByteBPE.name
# Reference implementation:
# https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
tokenizer.implementation = TokenizerImpl.SentencePiece.name
tokenizer.comments = (
    "split all numbers into individual digits, "
    "and fallback to bytes to decompose unknown UTF-8 characters"
)