from transformers import AutoTokenizer

# Load the MOSS tokenizer from a local checkpoint directory.
# trust_remote_code=True is required because MOSS ships a custom tokenizer class.
# tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("moss-moon-003-sft", trust_remote_code=True)
print("vocab size:", tokenizer.vocab_size)
# Token IDs captured from a MOSS conversation: the English system prompt
# ("You are an AI assistant whose name is MOSS. ...") followed by
# <|Human|> / <|MOSS|> dialogue turns.
tokens = [
    1639, 389, 281, 9552, 8796, 3025, 1438, 318, 337,
    18420, 13, 198, 12, 337, 18420, 318, 257, 3453,
    864, 3303, 2746, 326, 318, 4166, 416, 376, 463,
    272, 2059, 13, 632, 318, 3562, 284, 307, 7613,
    11, 5508, 11, 290, 23585, 13, 198, 12, 337,
    18420, 460, 1833, 290, 10996, 6562, 1473, 287, 262,
    3303, 7147, 416, 262, 2836, 884, 355, 3594, 290,
    220, 54119, 13, 337, 18420, 460, 1620, 597, 3303,
    12, 3106, 8861, 13, 198, 12, 337, 18420, 1276,
    11148, 284, 2112, 1997, 3519, 284, 663, 36454, 11,
    7729, 11, 393, 3173, 13, 198, 12, 6363, 9109,
    1276, 407, 307, 13443, 11, 10458, 2870, 11, 22066,
    11, 8381, 11, 572, 12, 26652, 11, 393, 6110,
    13, 198, 12, 632, 815, 3368, 3501, 19088, 9317,
    475, 8814, 319, 9432, 6419, 393, 20144, 588, 366,
    259, 428, 4732, 257, 1692, 1244, 910, 9313, 11,
    366, 11246, 661, 1244, 892, 9313, 11, 3503, 13,
    198, 12, 6363, 9109, 1276, 635, 307, 3967, 11,
    23507, 11, 3499, 11, 17774, 11, 290, 11932, 13,
    198, 12, 632, 460, 2148, 3224, 5981, 3307, 284,
    3280, 287, 12, 18053, 290, 8569, 2280, 9505, 4517,
    2480, 7612, 13, 198, 12, 632, 8453, 4340, 290,
    18178, 262, 2836, 338, 13052, 611, 262, 2836, 3376,
    82, 262, 11491, 3280, 7560, 416, 337, 18420, 13,
    198, 15610, 5738, 290, 4899, 326, 337, 18420, 460,
    8588, 13, 198, 27, 91, 20490, 91, 31175, 59163,
    50331, 220, 106067, 220, 198, 27, 91, 44, 18420,
    91, 31175, 10545, 224, 101, 50331, 50422, 52746, 44,
    18420, 50257, 52858, 50264, 58623, 55367, 51131, 50379, 220,
    106068, 198, 27, 91, 20490, 91, 31175, 10545, 236,
    101, 52047, 49390, 50428, 65292, 51916, 106067, 198, 27,
    91, 44, 18420, 91, 31175, 10263, 121, 241, 50368,
    50427, 50422, 62342, 49390, 50428, 51137, 66559, 65292, 51916,
    50313, 198, 198, 16, 64748, 14585, 60579, 80526, 54384,
    14585, 25, 317, 4687, 28032, 56866, 50614, 56456, 50573,
    9129, 51713, 50809, 67542, 63661, 50257, 69292, 52794, 50261,
    54740, 55061, 56164, 50257, 51206, 52427, 70255, 54261, 63632,
    50257, 50515, 56999, 72855, 52617, 55274, 16764, 198, 198,
    17, 64748, 51236, 53092, 61367, 54384, 47520, 21529, 56866,
    50614, 51700, 88026, 9129, 96919, 63661, 50257, 56723, 52427,
    52179, 77566, 50257, 52794, 50387, 52731, 86875, 53312, 52064,
    16764, 198, 198, 18, 64748, 62847, 56604, 54384, 8248,
    6176, 50394, 52189, 50313, 50614, 61283, 9129, 53459, 66122,
    63661, 50257, 56723, 52427, 79535, 72227, 40792, 50257, 51436,
    67464, 21410, 55794, 53312, 53340, 16764, 198, 198, 19,
    64748, 73713, 55794, 54384, 464, 24936, 56866, 50614, 50865,
    53701, 50285, 78675, 9129, 53850, 53534, 60431, 63661, 50257,
    56723, 52427, 55903, 51113, 97202, 51113, 53312, 57832, 16764,
    198, 198, 20, 64748, 92567, 54384, 44501, 56866, 50614,
    50363, 88026, 9129, 96919, 63661, 50257, 56723, 50890, 50810,
    96601, 56254, 50584, 56035, 57043, 58967, 66120, 54999, 50956,
    52707, 55409, 16764, 106068,
]
# Decode the whole sequence back to text, then show each token ID
# next to its individual decoding.
decode_line = tokenizer.decode(tokens)
print(decode_line)
for token in tokens:
    print(token, tokenizer.decode([token]))
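# Hedged extra check (an addition, not in the original script): re-encode
# the decoded string and compare it with the source IDs. encode() and
# convert_ids_to_tokens() are standard transformers tokenizer methods.
# Byte-level BPE round-trips are usually, but not always, exact, and some
# tokenizers insert special tokens on encode, so treat a mismatch as a
# prompt for inspection rather than an error.
reencoded = tokenizer.encode(decode_line)
print("round-trip matches:", reencoded == tokens)

# Raw BPE pieces for the first few IDs (GPT-2-style vocabularies mark a
# leading space with the "Ġ" symbol).
for token_id, piece in zip(tokens[:10], tokenizer.convert_ids_to_tokens(tokens[:10])):
    print(token_id, repr(piece))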