File size: 476 Bytes
751936e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from icetk import icetk

# Demo of the icetk tokenizer: split one sentence into string pieces, encode
# another into integer ids, then show the vocabulary piece behind each id.

# Offset subtracted from an id before indexing the text vocabulary below.
# NOTE(review): presumably the number of ids reserved ahead of the text
# vocabulary (icetk's image tokens) — confirm against the icetk docs.
TEXT_ID_OFFSET = 20000

# String pieces for the English sample.
# NOTE(review): nothing below reads `tokens`; kept so the name stays defined.
tokens = icetk.tokenize('Hello World! I am icetk.')

ids = icetk.encode('你好世界!这里是 icetk。')
print(ids)
# ids == [20005, 94874, 84097, 20035, 94947, 22881, 35955, 83823]

# Walk the integer ids, not `tokens`: per the icetk README, tokenize() yields
# string pieces, so `token - TEXT_ID_OFFSET` would raise TypeError on them.
for token_id in ids:
    piece = icetk.text_tokenizer.proto.pieces[token_id - TEXT_ID_OFFSET].piece
    print(token_id, piece)