{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id": 0,
      "content": "UNK",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 1,
      "content": "PAD",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 2,
      "content": "BOS",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    },
    {
      "id": 3,
      "content": "EOS",
      "single_word": false,
      "lstrip": false,
      "rstrip": false,
      "normalized": false,
      "special": true
    }
  ],
  "normalizer": {
    "type": "Sequence",
    "normalizers": [
      {
        "type": "Replace",
        "pattern": {
          "String": "\n"
        },
        "content": " UTT_BOUNDARY"
      },
      {
        "type": "Strip",
        "strip_left": true,
        "strip_right": true
      }
    ]
  },
  "pre_tokenizer": {
    "type": "Whitespace"
  },
  "post_processor": null,
  "decoder": null,
  "model": {
    "type": "WordLevel",
    "vocab": {
      "UNK": 0,
      "PAD": 1,
      "BOS": 2,
      "EOS": 3,
      "WORD_BOUNDARY": 4,
      "UTT_BOUNDARY": 5,
      "aː": 6,
      "˧": 7,
      "t": 8,
      "ɐ": 9,
      "k": 10,
      "˥": 11,
      "l": 12,
      "j": 13,
      "ʊ": 14,
      "˨˩": 15,
      "ɛː": 16,
      "n": 17,
      "ei": 18,
      "˩˧": 19,
      "w": 20,
      "˨": 21,
      "ɐi": 22,
      "˧˥": 23,
      "m̩": 24,
      "m": 25,
      "ou": 26,
      "iː": 27,
      "ts": 28,
      "ɔː": 29,
      "tʰ": 30,
      "f": 31,
      "aːĭ": 32,
      "p": 33,
      "h": 34,
      "ɵy": 35,
      "uː": 36,
      "ŋ": 37,
      "s": 38,
      "ɔːĭ": 39,
      "ɐu": 40,
      "ɪ": 41,
      "iːŭ": 42,
      "ɵ": 43,
      "tsʰ": 44,
      "kʰ": 45,
      "aːŭ": 46,
      "pʰ": 47,
      "yː": 48,
      "œː": 49,
      "uːĭ": 50,
      "u": 51
    },
    "unk_token": "UNK"
  }
}
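
For reference, a minimal sketch of loading and using this tokenizer with the Hugging Face tokenizers library. The local file path and the sample phoneme string are illustrative assumptions, not part of the config: the Sequence normalizer rewrites each newline as " UTT_BOUNDARY" and strips the ends of the input, the Whitespace pre-tokenizer splits the result into symbols, and the WordLevel model maps anything outside the vocab to UNK.

    from tokenizers import Tokenizer

    # Assumption: the JSON above has been saved locally as "tokenizer.json".
    tokenizer = Tokenizer.from_file("tokenizer.json")

    # The Replace normalizer turns the newline into " UTT_BOUNDARY"; the space
    # after "\n" in the sample keeps the marker from fusing with the next symbol.
    encoding = tokenizer.encode("t aː ˧ k\n l ei ˩˧")
    print(encoding.tokens)  # ['t', 'aː', '˧', 'k', 'UTT_BOUNDARY', 'l', 'ei', '˩˧']
    print(encoding.ids)     # [8, 6, 7, 10, 5, 12, 18, 19]

    # Symbols absent from the vocab fall back to the WordLevel unk_token.
    print(tokenizer.encode("z").tokens)  # ['UNK'] (id 0)

Because "post_processor" is null, encode adds no BOS/EOS markers automatically; callers who want them must prepend or append those tokens themselves.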