unigram-tokenizer-wikitext / tokenizer.json
sayakpaul's picture
sayakpaul HF staff
add tokenizer
99b7b7b
raw
history blame
3.24 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Replace",
"pattern": {
"String": "``"
},
"content": "\""
},
{
"type": "Replace",
"pattern": {
"String": "''"
},
"content": "\""
}
]
},
"pre_tokenizer": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
0
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
1
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "Metaspace",
"replacement": "▁",
"add_prefix_space": true
},
"model": {
"type": "Unigram",
"unk_id": 2,
"vocab": [
[
"[CLS]",
0.0
],
[
"[SEP]",
0.0
],
[
"<unk>",
0.0
],
[
"<pad>",
0.0
],
[
"[MASK]",
0.0
],
[
"t",
-0.9163739781069129
],
[
"x",
-1.6096599126533642
],
[
"e",
-1.6096599126533642
],
[
"▁",
-1.6096599126533642
]
]
}
}