conviette committed
Commit 93cbd94 · Parent: c748f8d

Upload KorBertTokenizer.py

Files changed (1)
  1. KorBertTokenizer.py +89 -0
KorBertTokenizer.py ADDED
@@ -0,0 +1,89 @@
+from transformers import BertTokenizer, WordpieceTokenizer
+from unicodedata import normalize
+
+
+def whitespace_tokenize(text):
+    """Split a piece of text on whitespace."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class KorWordpieceTokenizer(WordpieceTokenizer):
+    def tokenize(self, text):
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            # Normalize to NFC so composed Hangul syllables match vocab entries.
+            chars = list(normalize('NFC', token))
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            # Greedy longest-match-first segmentation against the vocabulary.
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+class KorBertTokenizer(BertTokenizer):
+
+    def __init__(self,
+                 vocab_file,
+                 do_lower_case=True,
+                 do_basic_tokenize=True,
+                 never_split=None,
+                 unk_token="[UNK]",
+                 sep_token="[SEP]",
+                 pad_token="[PAD]",
+                 cls_token="[CLS]",
+                 mask_token="[MASK]",
+                 tokenize_chinese_chars=True,
+                 strip_accents=None,
+                 **kwargs):
+        # Forward the caller's arguments instead of re-hardcoding the defaults,
+        # so options such as do_lower_case=False actually take effect.
+        super().__init__(vocab_file,
+                         do_lower_case=do_lower_case,
+                         do_basic_tokenize=do_basic_tokenize,
+                         never_split=never_split,
+                         unk_token=unk_token,
+                         sep_token=sep_token,
+                         pad_token=pad_token,
+                         cls_token=cls_token,
+                         mask_token=mask_token,
+                         tokenize_chinese_chars=tokenize_chinese_chars,
+                         strip_accents=strip_accents,
+                         **kwargs)
+        # Swap in the Korean-aware wordpiece tokenizer defined above.
+        self.wordpiece_tokenizer = KorWordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+    def _tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                # Append '_' so tokens follow the KorBERT vocabulary's
+                # word-boundary convention before wordpiece lookup.
+                token += '_'
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
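
A minimal usage sketch, not part of the commit: it assumes the uploaded file is importable from the working directory and that a KorBERT wordpiece vocabulary file is available locally; the "vocab.txt" path and the sample sentence are placeholders.

from KorBertTokenizer import KorBertTokenizer

# "vocab.txt" is a placeholder; point it at the vocabulary file that ships
# with the KorBERT checkpoint you are using.
tokenizer = KorBertTokenizer(vocab_file="vocab.txt")

# Each whitespace-delimited token gets a trailing '_' before wordpiece lookup.
tokens = tokenizer.tokenize("안녕하세요 한국어 토크나이저입니다")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(ids)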