File size: 940 Bytes
614012d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9495a4f
 
 
 
 
614012d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
gpt_35_turbo decode UnicodeDecodeError 99413 b' \xe6\xb5'
gpt_35_turbo decode UnicodeDecodeError 99742 b'\x8c\xa8'
gpt_35_turbo decode UnicodeDecodeError 99834 b'\xad\x90'
gpt_35_turbo decode UnicodeDecodeError 100112 b'\xe0\xae\xbf\xe0\xae'
gpt_35_turbo decode KeyError 100256
gpt_35_turbo decode KeyError 100261
gpt_35_turbo decode KeyError 100262
gpt_35_turbo decode KeyError 100263
"""



import json
import tiktoken


tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Probe ids taken from the log in the module docstring: 100263 is listed
# there as a KeyError and 99834 as a UnicodeDecodeError.
tokens = [100263, 99834]


def _probe(label, call):
    """Run ``call`` and print either its result or the exception it raised.

    Args:
        label: Text printed in front of the outcome to identify the probe.
        call: Zero-argument callable wrapping the expression under test.

    Returns:
        The value returned by ``call``, or ``None`` when it raised.
    """
    try:
        result = call()
    except Exception as exc:
        # Report the failure and carry on so the remaining probes still
        # run; KeyboardInterrupt and SystemExit are left to propagate.
        print(f"{label}: {type(exc).__name__}: {exc}")
        return None
    print(f"{label}: {result!r}")
    return result


# Decode the whole list at once through the public API.
_probe(f"decode({tokens})", lambda: tokenizer.decode(tokens))

# Same ids through the private core BPE object, substituting U+FFFD for
# bytes that are not valid UTF-8 instead of raising.
_probe(
    f"_core_bpe.decode_bytes({tokens}) as utf-8/replace",
    lambda: tokenizer._core_bpe.decode_bytes(tokens).decode(
        "utf-8", errors="replace"
    ),
)

# special_tokens: 100257-100260 100276
for token_id in tokens:
    _probe(
        f"decode_tokens_bytes([{token_id}])",
        lambda: tokenizer.decode_tokens_bytes([token_id]),
    )
    _probe(
        f"decode_single_token_bytes({token_id})",
        lambda: tokenizer.decode_single_token_bytes(token_id),
    )
    _probe(
        f"decode_bytes([{token_id}])",
        lambda: tokenizer.decode_bytes([token_id]),
    )