|
|
|
import json |
|
import os |
|
from transformers import LlamaTokenizer |
|
|
|
# Resolve all paths relative to this script's location so the tokenizer
# directory is found regardless of the current working directory.
_this_file = os.path.abspath(__file__)
CURRENT_DIR = os.path.dirname(_this_file)

# Local directory holding the tokenizer files (vocab/model/config).
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
|
|
|
|
|
|
|
# Load the Llama tokenizer from the local "tokenizer" directory next to this
# script (expects the usual tokenizer files — e.g. tokenizer.model — to be
# present there; raises if the directory or files are missing).
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)
|
|
|
|
|
# Hard-coded token IDs for a Llama-2 chat prompt. Decoding them (below)
# reconstructs an "[INST] <<SYS>> ... <</SYS>> ... [/INST]" formatted
# conversation — presumably the standard Llama-2 default system prompt
# ("You are a helpful, respectful and honest assistant...") followed by a
# short user question; confirm by running the script and reading the
# decoded output. NOTE(review): ID 1 looks like the BOS token and 13 a
# newline for this tokenizer — verify against the tokenizer's vocab.
tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,

526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,

29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,

9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,

1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,

391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,

29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,

5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,

263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,

451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,

310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,

29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,

29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,

29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,

29962]
|
|
|
# Decode the full ID sequence and show the reconstructed prompt text.
text = tokenizer.decode(tokens)
print(text)

# Emit one JSON object per token: the raw id, the detokenized string, and
# the underlying vocabulary token (which may differ, e.g. SentencePiece
# pieces carry a leading "▁" marker that decode() strips).
# ensure_ascii=False keeps any non-ASCII piece characters readable.
for token_id in tokens:
    record = {
        "token_id": token_id,
        "decode_str": tokenizer.decode([token_id]),
        # Fix: the original wrote convert_ids_to_tokens([token_id][0]) —
        # misplaced bracket that builds a one-element list and immediately
        # indexes it back to the bare int. Passing the int directly is the
        # documented single-id form and returns the same token string as
        # the intended convert_ids_to_tokens([token_id])[0].
        "token": tokenizer.convert_ids_to_tokens(token_id),
    }
    print(json.dumps(record, ensure_ascii=False))