asude55 committed
Commit
83a37d9
1 Parent(s): 9a6237a

Create tokenizer.json

Files changed (1)
  1. tokenizer.json +96 -0
tokenizer.json ADDED
@@ -0,0 +1,96 @@
+ {
+   "version": "0.1",
+   "truncation": {
+     "max_length": 512,
+     "stride": 0,
+     "strategy": "longest_first"
+   },
+   "padding": {
+     "strategy": "BatchLongest",
+     "direction": "Right",
+     "pad_to_multiple_of": null,
+     "pad_id": 0,
+     "pad_type_id": 0,
+     "pad_token": "[PAD]"
+   },
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "[PAD]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "[CLS]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "[SEP]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "[MASK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "BertNormalizer",
+     "clean_text": true,
+     "handle_chinese_chars": true,
+     "strip_accents": null,
+     "lowercase": false
+   },
+   "pre_tokenizer": {
+     "type": "BertPreTokenizer"
+   },
+   "post_processor": {
+     "type": "BertProcessing",
+     "sep": ["[SEP]", 3],
+     "cls": ["[CLS]", 2]
+   },
+   "decoder": {
+     "type": "WordPiece",
+     "cleanup": true
+   },
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "[UNK]",
+     "vocab": {
+       "[PAD]": 0,
+       "[UNK]": 1,
+       "[CLS]": 2,
+       "[SEP]": 3,
+       "[MASK]": 4,
+       "hello": 5,
+       "world": 6
+     },
+     "max_input_chars_per_word": 100
+   }
+ }
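
For a quick sanity check, the file above can be loaded directly with the Hugging Face tokenizers library. A minimal sketch, assuming the JSON is saved locally as tokenizer.json and deserializes as written:

from tokenizers import Tokenizer

# Load the serialized tokenizer defined in this commit.
tokenizer = Tokenizer.from_file("tokenizer.json")

# The BertProcessing post-processor wraps a single sequence in [CLS] ... [SEP].
encoding = tokenizer.encode("hello world")
print(encoding.tokens)  # expected: ['[CLS]', 'hello', 'world', '[SEP]']
print(encoding.ids)     # expected: [2, 5, 6, 3]

# Anything outside the seven-entry vocab falls back to [UNK].
print(tokenizer.encode("hello there").tokens)  # expected: ['[CLS]', 'hello', '[UNK]', '[SEP]']

Note that the BertNormalizer has "lowercase": false, so an input like "Hello" would not match the "hello" vocab entry and would also map to [UNK].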