File size: 2,170 Bytes
751936e 8e0e4e9 751936e 9495a4f 751936e d10ecd7 f4973d4 751936e f4973d4 428b731 9495a4f d10ecd7 8e0e4e9 d10ecd7 f4973d4 d10ecd7 f4973d4 9495a4f f4973d4 9495a4f 8e0e4e9 9495a4f 8e0e4e9 d10ecd7 8e0e4e9 d10ecd7 751936e 9495a4f 751936e 428b731 d10ecd7 751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import tiktoken
from tiktoken import Encoding
from utils.log_util import logger
# Encoding that tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Extra attributes attached to this instance; get_vocab() reads vocab_size.
tokenizer.reversible = True  # reversible and lossless: tokens can be converted back into the original text
tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
tokenizer.vocab_size = tokenizer.n_vocab  # alias of the encoding's own n_vocab attribute
def decode(self, tokens, errors="replace", skip_special_tokens=False):
    """Decode token ids to text, returning the string ``"null"`` on failure.

    Replacement for the default ``decode``, which may raise for some ids
    (see decode_test.py).

    :param tokens: token ids passed to ``self._core_bpe.decode_bytes``.
    :param errors: error policy used when the resulting bytes are decoded
        as UTF-8.
    :param skip_special_tokens: not used; accepted only so the signature is
        compatible with hf_tokenizer.
    :return: the decoded text, or the literal ``"null"`` if decoding failed.
    """
    try:
        return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
    except (KeyboardInterrupt, SystemExit):
        # An interrupt or interpreter shutdown must never be reported as "null".
        raise
    except BaseException:
        # NOTE(review): BaseException (not Exception) is caught on purpose.
        # The original bare ``except`` may have relied on swallowing
        # non-Exception errors raised by the native decoder for unknown ids
        # (e.g. a Rust panic surfaced by the extension) -- confirm against
        # the tiktoken version in use before narrowing further.
        return "null"
def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
    """Return the raw bytes of each token id, using this encoding instance.

    tiktoken does not ship a method with this name; it is added here as a
    thin wrapper over ``self.decode_tokens_bytes``.

    :param tokens: iterable of token ids.
    :param skip_special_tokens: not used; accepted only for signature
        compatibility.
    :return: the list produced by ``self.decode_tokens_bytes(tokens)``, or a
        list holding one ``None`` per input id when that call fails.
    """
    try:
        # Use ``self`` rather than the module-level ``tokenizer``: this
        # function is installed on the Encoding class, so it must answer for
        # whichever encoding it is called on.
        return self.decode_tokens_bytes(tokens)
    except Exception:
        # Why None? See zh_util.py. get_vocab() also skips None entries.
        # 16 ids are unassigned: 100256 and 100261-100275.
        return [None for _ in tokens]
def get_vocab(self, token_type="str"):
    """Return the vocabulary as a ``{token_bytes: token_id}`` dict.

    Every id in ``range(self.vocab_size)`` is looked up through
    ``self.convert_ids_to_tokens``. Ids that come back as ``None`` are left
    out of the result and reported in the log.

    :param token_type: ["str", "byte"]. Currently not consulted: keys are
        always the raw token bytes, whatever value is passed.
    :return: dict mapping each token's bytes to its id.
    """
    vocab = {}
    key_error_list = []  # ids with no token (lookup returned None)
    unicode_decode_error_list = []  # (id, repr of bytes) for non-UTF-8 tokens
    for token_id in range(self.vocab_size):
        token_byte = self.convert_ids_to_tokens([token_id])[0]
        if token_byte is None:
            # Record the miss so the KeyError log line below is accurate.
            key_error_list.append(token_id)
            continue
        try:
            # Validation only: the decoded text is discarded and the key
            # stays bytes. This keeps the UnicodeDecodeError report below
            # meaningful without changing the returned mapping.
            token_byte.decode("utf-8")
        except UnicodeDecodeError:
            unicode_decode_error_list.append((token_id, str(token_byte)))
        vocab[token_byte] = token_id
    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
    return vocab
# tiktoken patch: install the functions defined in this module as methods on
# the Encoding class. Note that ``decode`` replaces the library's own method.
Encoding.get_vocab = get_vocab
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
Encoding.decode = decode
|