idsedykh committed on
Commit
1f2906f
·
1 Parent(s): 2df8abf

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +1 -0
  2. tokenizer_config.json +1 -0
  3. vocab.txt +38 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": "../models/special_tokens_map.json", "tokenizer_file": null, "name_or_path": "/home/idsedykh/thesis/models/bert-7-big/checkpoint-88000", "tokenizer_class": "CharBertTokenizer"}
vocab.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ э
7
+ т
8
+ о
9
+ н
10
+ а
11
+ и
12
+ с
13
+ к
14
+ з
15
+ ь
16
+ р
17
+ ы
18
+ й
19
+ ч
20
+ е
21
+ л
22
+ в
23
+ д
24
+ б
25
+ я
26
+ у
27
+ ж
28
+ г
29
+ м
30
+ ш
31
+ п
32
+ х
33
+ ц
34
+ щ
35
+ ю
36
+ ф
37
+ ъ
38
+ -