Upload tokenizer

#2
by chtan - opened
special_tokens_map.json CHANGED
@@ -1 +1,7 @@
- {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
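
The change to special_tokens_map.json is purely cosmetic: the same five special tokens, now sorted and pretty-printed. A quick sanity check (illustrative snippet, not part of the PR):

```python
import json

old = '{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}'
new = """
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
"""

# Both versions parse to the same mapping; only the on-disk formatting differs.
assert json.loads(old) == json.loads(new)
```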
tokenization_ponet.py ADDED
@@ -0,0 +1,590 @@
+ # coding=utf-8
+ # Copyright 2023 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for PoNet."""
+
+
+ import collections
+ import os
+ import unicodedata
+ from typing import Dict, List, Optional, Tuple, Union
+
+ from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+ from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
+ from transformers.utils import PaddingStrategy, logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "chtan/ponet-base-uncased": "https://huggingface.co/chtan/ponet-base-uncased/resolve/main/vocab.txt",
+     }
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "chtan/ponet-base-uncased": 512,
+ }
+
+ PRETRAINED_INIT_CONFIGURATION = {
+     "chtan/ponet-base-uncased": {"do_lower_case": True},
+ }
+
+
+ def load_vocab(vocab_file):
+     """Loads a vocabulary file into a dictionary."""
+     vocab = collections.OrderedDict()
+     with open(vocab_file, "r", encoding="utf-8") as reader:
+         tokens = reader.readlines()
+     for index, token in enumerate(tokens):
+         token = token.rstrip("\n")
+         vocab[token] = index
+     return vocab
+
+
+ def whitespace_tokenize(text):
+     """Runs basic whitespace cleaning and splitting on a piece of text."""
+     text = text.strip()
+     if not text:
+         return []
+     tokens = text.split()
+     return tokens
+
+
+ class PoNetTokenizer(PreTrainedTokenizer):
+     r"""
+     Construct a PONET tokenizer. Based on WordPiece.
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             File containing the vocabulary.
+         do_lower_case (`bool`, *optional*, defaults to `True`):
+             Whether or not to lowercase the input when tokenizing.
+         do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+             Whether or not to do basic tokenization before WordPiece.
+         never_split (`Iterable`, *optional*):
+             Collection of tokens which will never be split during tokenization. Only has an effect when
+             `do_basic_tokenize=True`
+         unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead.
+         sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+             sequence classification or for a text and a question for question answering. It is also used as the last
+             token of a sequence built with special tokens.
+         pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+             The classifier token which is used when doing sequence classification (classification of the whole sequence
+             instead of per-token classification). It is the first token of the sequence when built with special tokens.
+         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+             The token used for masking values. This is the token used when training this model with masked language
+             modeling. This is the token which the model will try to predict.
+         tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+             Whether or not to tokenize Chinese characters.
+
+             This should likely be deactivated for Japanese (see this
+             [issue](https://github.com/huggingface/transformers/issues/328)).
+         strip_accents (`bool`, *optional*):
+             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+             value for `lowercase` (as in the original PONET).
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+     def __init__(
+         self,
+         vocab_file,
+         do_lower_case=True,
+         do_basic_tokenize=True,
+         never_split=None,
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         tokenize_chinese_chars=True,
+         strip_accents=None,
+         **kwargs,
+     ):
+         super().__init__(
+             do_lower_case=do_lower_case,
+             do_basic_tokenize=do_basic_tokenize,
+             never_split=never_split,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             tokenize_chinese_chars=tokenize_chinese_chars,
+             strip_accents=strip_accents,
+             **kwargs,
+         )
+
+         if not os.path.isfile(vocab_file):
+             raise ValueError(
+                 f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                 " model use `tokenizer = PoNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+             )
+         self.vocab = load_vocab(vocab_file)
+         self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+         self.do_basic_tokenize = do_basic_tokenize
+         if do_basic_tokenize:
+             self.basic_tokenizer = BasicTokenizer(
+                 do_lower_case=do_lower_case,
+                 never_split=never_split,
+                 tokenize_chinese_chars=tokenize_chinese_chars,
+                 strip_accents=strip_accents,
+             )
+         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+     def _pad(
+         self,
+         encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+         max_length: Optional[int] = None,
+         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+         pad_to_multiple_of: Optional[int] = None,
+         return_attention_mask: Optional[bool] = None,
+     ) -> dict:
+         """
+         Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+         Args:
+             encoded_inputs:
+                 Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
+             max_length: maximum length of the returned list and optionally padding length (see below).
+                 Will truncate by taking into account the special tokens.
+             padding_strategy: PaddingStrategy to use for padding.
+
+                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                 - PaddingStrategy.DO_NOT_PAD: Do not pad
+                 The tokenizer padding sides are defined in self.padding_side:
+
+                     - 'left': pads on the left of the sequences
+                     - 'right': pads on the right of the sequences
+             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                 `>= 7.5` (Volta).
+             return_attention_mask:
+                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+         """
+         # Load from model defaults
+         if return_attention_mask is None:
+             return_attention_mask = "attention_mask" in self.model_input_names
+
+         required_input = encoded_inputs[self.model_input_names[0]]
+
+         if padding_strategy == PaddingStrategy.LONGEST:
+             max_length = len(required_input)
+
+         if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+             max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+         needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+         # Initialize attention mask if not present.
+         if return_attention_mask and "attention_mask" not in encoded_inputs:
+             encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+         if needs_to_be_padded:
+             difference = max_length - len(required_input)
+
+             if self.padding_side == "right":
+                 if return_attention_mask:
+                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                 if "token_type_ids" in encoded_inputs:
+                     encoded_inputs["token_type_ids"] = (
+                         encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                     )
+                 if "segment_ids" in encoded_inputs:
+                     encoded_inputs["segment_ids"] = (
+                         encoded_inputs["segment_ids"] + [encoded_inputs["segment_ids"][-1] + 1] * difference
+                     )
+                 if "special_tokens_mask" in encoded_inputs:
+                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+             elif self.padding_side == "left":
+                 if return_attention_mask:
+                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                 if "token_type_ids" in encoded_inputs:
+                     encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                         "token_type_ids"
+                     ]
+                 if "segment_ids" in encoded_inputs:
+                     encoded_inputs["segment_ids"] = [
+                         encoded_inputs["segment_ids"][-1] + 1
+                     ] * difference + encoded_inputs["segment_ids"]
+                 if "special_tokens_mask" in encoded_inputs:
+                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+             else:
+                 raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+
+         return encoded_inputs
+
+     @property
+     def do_lower_case(self):
+         return self.basic_tokenizer.do_lower_case
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return dict(self.vocab, **self.added_tokens_encoder)
+
+     def _tokenize(self, text):
+         split_tokens = []
+         if self.do_basic_tokenize:
+             for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                 # If the token is part of the never_split set
+                 if token in self.basic_tokenizer.never_split:
+                     split_tokens.append(token)
+                 else:
+                     split_tokens += self.wordpiece_tokenizer.tokenize(token)
+         else:
+             split_tokens = self.wordpiece_tokenizer.tokenize(text)
+         return split_tokens
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) to a single string."""
+         out_string = " ".join(tokens).replace(" ##", "").strip()
+         return out_string
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. A PONET sequence has the following format:
+
+         - single sequence: `[CLS] X [SEP]`
+         - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A PONET sequence
+         pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         index = 0
+         if os.path.isdir(save_directory):
+             vocab_file = os.path.join(
+                 save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+             )
+         else:
+             vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+         with open(vocab_file, "w", encoding="utf-8") as writer:
+             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                 if index != token_index:
+                     logger.warning(
+                         f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                         " Please check that the vocabulary is not corrupted!"
+                     )
+                     index = token_index
+                 writer.write(token + "\n")
+                 index += 1
+         return (vocab_file,)
+
+
+ # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer with Bert->PoNet
+ class BasicTokenizer(object):
+     """
+     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+     Args:
+         do_lower_case (`bool`, *optional*, defaults to `True`):
+             Whether or not to lowercase the input when tokenizing.
+         never_split (`Iterable`, *optional*):
+             Collection of tokens which will never be split during tokenization. Only has an effect when
+             `do_basic_tokenize=True`
+         tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+             Whether or not to tokenize Chinese characters.
+
+             This should likely be deactivated for Japanese (see this
+             [issue](https://github.com/huggingface/transformers/issues/328)).
+         strip_accents (`bool`, *optional*):
+             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+             value for `lowercase` (as in the original BERT).
+     """
+
+     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+         if never_split is None:
+             never_split = []
+         self.do_lower_case = do_lower_case
+         self.never_split = set(never_split)
+         self.tokenize_chinese_chars = tokenize_chinese_chars
+         self.strip_accents = strip_accents
+
+     def tokenize(self, text, never_split=None):
+         """
+         Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
+         WordPieceTokenizer.
+
+         Args:
+             never_split (`List[str]`, *optional*)
+                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                 [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+         """
+         # union() returns a new set by concatenating the two sets.
+         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+         text = self._clean_text(text)
+
+         # This was added on November 1st, 2018 for the multilingual and Chinese
+         # models. This is also applied to the English models now, but it doesn't
+         # matter since the English models were not trained on any Chinese data
+         # and generally don't have any Chinese data in them (there are Chinese
+         # characters in the vocabulary because Wikipedia does have some Chinese
+         # words in the English Wikipedia.).
+         if self.tokenize_chinese_chars:
+             text = self._tokenize_chinese_chars(text)
+         orig_tokens = whitespace_tokenize(text)
+         split_tokens = []
+         for token in orig_tokens:
+             if token not in never_split:
+                 if self.do_lower_case:
+                     token = token.lower()
+                     if self.strip_accents is not False:
+                         token = self._run_strip_accents(token)
+                 elif self.strip_accents:
+                     token = self._run_strip_accents(token)
+             split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+         output_tokens = whitespace_tokenize(" ".join(split_tokens))
+         return output_tokens
+
+     def _run_strip_accents(self, text):
+         """Strips accents from a piece of text."""
+         text = unicodedata.normalize("NFD", text)
+         output = []
+         for char in text:
+             cat = unicodedata.category(char)
+             if cat == "Mn":
+                 continue
+             output.append(char)
+         return "".join(output)
+
+     def _run_split_on_punc(self, text, never_split=None):
+         """Splits punctuation on a piece of text."""
+         if never_split is not None and text in never_split:
+             return [text]
+         chars = list(text)
+         i = 0
+         start_new_word = True
+         output = []
+         while i < len(chars):
+             char = chars[i]
+             if _is_punctuation(char):
+                 output.append([char])
+                 start_new_word = True
+             else:
+                 if start_new_word:
+                     output.append([])
+                 start_new_word = False
+                 output[-1].append(char)
+             i += 1
+
+         return ["".join(x) for x in output]
+
+     def _tokenize_chinese_chars(self, text):
+         """Adds whitespace around any CJK character."""
+         output = []
+         for char in text:
+             cp = ord(char)
+             if self._is_chinese_char(cp):
+                 output.append(" ")
+                 output.append(char)
+                 output.append(" ")
+             else:
+                 output.append(char)
+         return "".join(output)
+
+     def _is_chinese_char(self, cp):
+         """Checks whether CP is the codepoint of a CJK character."""
+         # This defines a "chinese character" as anything in the CJK Unicode block:
+         #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+         #
+         # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+         # despite its name. The modern Korean Hangul alphabet is a different block,
+         # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+         # space-separated words, so they are not treated specially and handled
+         # like all of the other languages.
+         if (
+             (cp >= 0x4E00 and cp <= 0x9FFF)
+             or (cp >= 0x3400 and cp <= 0x4DBF)  #
+             or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+             or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+             or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+             or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+             or (cp >= 0xF900 and cp <= 0xFAFF)
+             or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+         ):  #
+             return True
+
+         return False
+
+     def _clean_text(self, text):
+         """Performs invalid character removal and whitespace cleanup on text."""
+         output = []
+         for char in text:
+             cp = ord(char)
+             if cp == 0 or cp == 0xFFFD or _is_control(char):
+                 continue
+             if _is_whitespace(char):
+                 output.append(" ")
+             else:
+                 output.append(char)
+         return "".join(output)
+
+
+ # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer with Bert->PoNet
+ class WordpieceTokenizer(object):
+     """Runs WordPiece tokenization."""
+
+     def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+         self.vocab = vocab
+         self.unk_token = unk_token
+         self.max_input_chars_per_word = max_input_chars_per_word
+
+     def tokenize(self, text):
+         """
+         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+         tokenization using the given vocabulary.
+
+         For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+         Args:
+             text: A single token or whitespace separated tokens. This should have
+                 already been passed through *BasicTokenizer*.
+
+         Returns:
+             A list of wordpiece tokens.
+         """
+
+         output_tokens = []
+         for token in whitespace_tokenize(text):
+             chars = list(token)
+             if len(chars) > self.max_input_chars_per_word:
+                 output_tokens.append(self.unk_token)
+                 continue
+
+             is_bad = False
+             start = 0
+             sub_tokens = []
+             while start < len(chars):
+                 end = len(chars)
+                 cur_substr = None
+                 while start < end:
+                     substr = "".join(chars[start:end])
+                     if start > 0:
+                         substr = "##" + substr
+                     if substr in self.vocab:
+                         cur_substr = substr
+                         break
+                     end -= 1
+                 if cur_substr is None:
+                     is_bad = True
+                     break
+                 sub_tokens.append(cur_substr)
+                 start = end
+
+             if is_bad:
+                 output_tokens.append(self.unk_token)
+             else:
+                 output_tokens.extend(sub_tokens)
+         return output_tokens
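
Apart from the `Bert -> PoNet` renames, the only substantive addition in tokenization_ponet.py is the `_pad` override: besides `input_ids`, `attention_mask`, `token_type_ids` and `special_tokens_mask`, it also pads an optional `segment_ids` feature, and the padding positions receive a fresh segment index (last segment id plus one) rather than a constant. A minimal sketch of the right-padding rule, assuming the caller supplies `segment_ids` (the helper name below is made up for illustration):

```python
# Illustrative re-statement of the "segment_ids" branch of PoNetTokenizer._pad
# for right-side padding. `pad_segment_ids` is a hypothetical helper, not part of the PR.
def pad_segment_ids(segment_ids, target_length):
    difference = target_length - len(segment_ids)
    # Padding tokens are assigned a segment of their own: last id + 1.
    return segment_ids + [segment_ids[-1] + 1] * difference

print(pad_segment_ids([0, 0, 0, 1, 1], 8))  # -> [0, 0, 0, 1, 1, 2, 2, 2]
```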
tokenizer_config.json CHANGED
@@ -1 +1,27 @@
- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "tokenizer_class": "PoNetTokenizer", "model_input_names": ["input_ids", "token_type_ids", "attention_mask", "segment_ids"]}
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_ponet.PoNetTokenizer",
+       null
+     ]
+   },
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "token_type_ids",
+     "attention_mask",
+     "segment_ids"
+   ],
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "PoNetTokenizer",
+   "unk_token": "[UNK]"
+ }
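
The new `auto_map` entry is what lets `AutoTokenizer` pick up the custom class shipped in this repo. A usage sketch, assuming the files land in the `chtan/ponet-base-uncased` repo referenced in tokenization_ponet.py (remote code execution must be opted into explicitly):

```python
from transformers import AutoTokenizer

# trust_remote_code=True is needed because PoNetTokenizer lives in
# tokenization_ponet.py inside the model repo, not in the transformers package.
tokenizer = AutoTokenizer.from_pretrained("chtan/ponet-base-uncased", trust_remote_code=True)

enc = tokenizer("Hello world!", "How are you?")
print(enc["input_ids"])                               # [CLS] A [SEP] B [SEP] ids
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```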