fukugawa committed on
Commit
01799ff
·
verified ·
1 Parent(s): d80883e

Upload tokenizer

Browse files
README.md CHANGED
@@ -1,9 +1,7 @@
1
  ---
2
- license: apache-2.0
3
- datasets:
4
- - wiki40b
5
  language:
6
  - ja
 
7
  tags:
8
  - ja
9
  - japanese
@@ -12,6 +10,8 @@ tags:
12
  - jax
13
  - flax
14
  - lm1b
 
 
15
  ---
16
  # transformer-lm-japanese-0.1b
17
 
 
1
  ---
 
 
 
2
  language:
3
  - ja
4
+ license: apache-2.0
5
  tags:
6
  - ja
7
  - japanese
 
10
  - jax
11
  - flax
12
  - lm1b
13
+ datasets:
14
+ - wiki40b
15
  ---
16
  # transformer-lm-japanese-0.1b
17
 
added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 30002,
3
+ "[CLS]": 30000,
4
+ "[MASK]": 30003,
5
+ "[SEP]": 30001
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": {
6
+ "content": "[MASK]",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "[SEP]",
14
+ "unk_token": "<unk>"
15
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fd0e5d0f09e4e7c267e06e5da939a68e9fe4d9e3708109a5da478daef16e782
3
+ size 761433
tokenization_transformerlm.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for ALBERT model."""
16
+
17
+
18
+ import os
19
+ import unicodedata
20
+ from shutil import copyfile
21
+ from typing import Any, Dict, List, Optional, Tuple
22
+
23
+ import sentencepiece as spm
24
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
25
+
26
+
27
# File name under which the SentencePiece model is stored next to the tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# Checkpoint identifiers inherited from the ALBERT tokenizer this module was
# derived from; both lookup tables below are keyed by these names.
_ALBERT_CHECKPOINT_NAMES = (
    "albert/albert-base-v1",
    "albert/albert-large-v1",
    "albert/albert-xlarge-v1",
    "albert/albert-xxlarge-v1",
    "albert/albert-base-v2",
    "albert/albert-large-v2",
    "albert/albert-xlarge-v2",
    "albert/albert-xxlarge-v2",
)

# Checkpoint name -> URL of its SentencePiece model file.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        checkpoint: "https://huggingface.co/" + checkpoint + "/resolve/main/spiece.model"
        for checkpoint in _ALBERT_CHECKPOINT_NAMES
    }
}

# Checkpoint name -> positional-embedding size (512 for every listed checkpoint).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {checkpoint: 512 for checkpoint in _ALBERT_CHECKPOINT_NAMES}

# Marker character that SentencePiece puts at the start of a word-initial piece.
SPIECE_UNDERLINE = "▁"
54
+
55
+
56
+ class TransformerLMTokenizer(PreTrainedTokenizer):
57
+ """
58
+ Construct a TransformerLM tokenizer (adapted from the ALBERT tokenizer). Based on [SentencePiece](https://github.com/google/sentencepiece).
59
+
60
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
61
+ this superclass for more information regarding those methods.
62
+
63
+ Args:
64
+ vocab_file (`str`):
65
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
66
+ contains the vocabulary necessary to instantiate a tokenizer.
67
+ do_lower_case (`bool`, *optional*, defaults to `True`):
68
+ Whether or not to lowercase the input when tokenizing.
69
+ remove_space (`bool`, *optional*, defaults to `True`):
70
+ Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
71
+ keep_accents (`bool`, *optional*, defaults to `False`):
72
+ Whether or not to keep accents when tokenizing.
73
+ bos_token (`str`, *optional*, defaults to `"[CLS]"`):
74
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
75
+
76
+ <Tip>
77
+
78
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
79
+ sequence. The token used is the `cls_token`.
80
+
81
+ </Tip>
82
+
83
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
84
+ The end of sequence token.
85
+
86
+ <Tip>
87
+
88
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
89
+ The token used is the `sep_token`.
90
+
91
+ </Tip>
92
+
93
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
94
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
95
+ token instead.
96
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
97
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
98
+ sequence classification or for a text and a question for question answering. It is also used as the last
99
+ token of a sequence built with special tokens.
100
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
101
+ The token used for padding, for example when batching sequences of different lengths.
102
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
103
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
104
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
105
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
106
+ The token used for masking values. This is the token used when training this model with masked language
107
+ modeling. This is the token which the model will try to predict.
108
+ sp_model_kwargs (`dict`, *optional*):
109
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
110
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
111
+ to set:
112
+
113
+ - `enable_sampling`: Enable subword regularization.
114
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
115
+
116
+ - `nbest_size = {0,1}`: No sampling is performed.
117
+ - `nbest_size > 1`: samples from the nbest_size results.
118
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
119
+ using forward-filtering-and-backward-sampling algorithm.
120
+
121
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
122
+ BPE-dropout.
123
+
124
+ Attributes:
125
+ sp_model (`SentencePieceProcessor`):
126
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
127
+ """
128
+
129
+ vocab_files_names = VOCAB_FILES_NAMES
130
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
131
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
132
+
133
def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    remove_space=True,
    keep_accents=False,
    bos_token="[CLS]",
    eos_token="[SEP]",
    unk_token="<unk>",
    sep_token="[SEP]",
    pad_token="<pad>",
    cls_token="[CLS]",
    mask_token="[MASK]",
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Load the SentencePiece model and register the special tokens.

    Args:
        vocab_file: Path of the SentencePiece model file to load.
        do_lower_case: Stored and later used by `preprocess_text` to lowercase input.
        remove_space: Stored and later used by `preprocess_text` to collapse whitespace.
        keep_accents: Stored and later used by `preprocess_text`; when false,
            combining characters are stripped.
        bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token:
            Special tokens forwarded to the base-class constructor.
        sp_model_kwargs: Keyword arguments for `SentencePieceProcessor`; `None`
            means no extra arguments.
        **kwargs: Forwarded unchanged to the base-class constructor.
    """
    # A mask token given as a plain string is wrapped so that it absorbs the
    # whitespace on its left (lstrip) and is matched against non-normalized text.
    if isinstance(mask_token, str):
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)

    if sp_model_kwargs is None:
        sp_model_kwargs = {}
    self.sp_model_kwargs = sp_model_kwargs

    self.do_lower_case = do_lower_case
    self.remove_space = remove_space
    self.keep_accents = keep_accents
    self.vocab_file = vocab_file

    # The SentencePiece model is loaded before the base-class constructor is
    # called — presumably because the base class may query the vocabulary
    # while initialising; keep this ordering.
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    super().__init__(
        do_lower_case=do_lower_case,
        remove_space=remove_space,
        keep_accents=keep_accents,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )
181
+
182
@property
def vocab_size(self) -> int:
    """Number of pieces in the loaded SentencePiece model (added tokens excluded)."""
    piece_count = len(self.sp_model)
    return piece_count
185
+
186
def get_vocab(self) -> Dict[str, int]:
    """Return a token -> id mapping covering the base vocabulary and the added tokens.

    Entries from `added_tokens_encoder` are applied last, so they override a
    base-vocabulary entry with the same token string.
    """
    token_to_id = {}
    for index in range(self.vocab_size):
        token_to_id[self.convert_ids_to_tokens(index)] = index
    token_to_id.update(self.added_tokens_encoder)
    return token_to_id
190
+
191
def __getstate__(self):
    """Return a shallow copy of the instance state with `sp_model` blanked out.

    The processor entry is replaced by `None`; `__setstate__` recreates it
    from `vocab_file`.
    """
    snapshot = dict(self.__dict__)
    snapshot["sp_model"] = None
    return snapshot
195
+
196
def __setstate__(self, d):
    """Restore the instance from state `d` and reload the SentencePiece model.

    Args:
        d: State dictionary; it becomes the instance `__dict__` directly.
    """
    self.__dict__ = d

    # State saved without `sp_model_kwargs` (backward compatibility) falls
    # back to an empty set of processor arguments.
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    # Build a fresh processor and load the model from the stored path.
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
205
+
206
def preprocess_text(self, inputs):
    """Normalise raw text before it is passed to SentencePiece.

    Steps, in order: optional whitespace collapsing (`remove_space`),
    replacement of doubled backticks and doubled apostrophes with a double
    quote, optional NFKD decomposition with removal of combining characters
    (when `keep_accents` is false), and optional lowercasing (`do_lower_case`).

    Args:
        inputs: The text to normalise.

    Returns:
        The normalised text.
    """
    text = " ".join(inputs.strip().split()) if self.remove_space else inputs

    for doubled_quote in ("``", "''"):
        text = text.replace(doubled_quote, '"')

    if not self.keep_accents:
        # NOTE(review): NFKD also decomposes voiced kana (e.g. U+304C becomes
        # U+304B + U+3099), so this filter strips dakuten/handakuten from
        # Japanese text. Confirm this matches how the model's training text
        # was prepared before relying on keep_accents=False.
        decomposed = unicodedata.normalize("NFKD", text)
        text = "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

    return text.lower() if self.do_lower_case else text
220
+
221
def _tokenize(self, text: str) -> List[str]:
    """Split `text` into SentencePiece pieces.

    The text is first run through `preprocess_text`. Any piece that ends in a
    comma directly preceded by a digit is re-encoded without the comma and the
    comma is emitted as its own piece, e.g. `9,9` -> ['▁9', ',', '9'] rather
    than ['▁9,', '9'] (see
    https://github.com/google-research/bert/blob/master/README.md#tokenization).
    """
    normalized = self.preprocess_text(text)
    result = []
    for piece in self.sp_model.encode(normalized, out_type=str):
        digit_then_comma = len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit()
        if not digit_then_comma:
            result.append(piece)
            continue

        # Re-encode the part before the comma with its word marker removed.
        sub_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
        # The re-encoding may start with a word marker that the original piece
        # did not have; drop it so no extra word boundary is introduced.
        if piece[0] != SPIECE_UNDERLINE and sub_pieces[0][0] == SPIECE_UNDERLINE:
            if len(sub_pieces[0]) == 1:
                sub_pieces = sub_pieces[1:]
            else:
                sub_pieces[0] = sub_pieces[0][1:]
        sub_pieces.append(piece[-1])
        result.extend(sub_pieces)

    return result
242
+
243
def _convert_token_to_id(self, token):
    """Look up the id of `token` (str) in the SentencePiece model."""
    piece_id = self.sp_model.PieceToId(token)
    return piece_id
246
+
247
def _convert_id_to_token(self, index):
    """Look up the piece (str) for the integer id `index` in the SentencePiece model."""
    piece = self.sp_model.IdToPiece(index)
    return piece
250
+
251
def convert_tokens_to_string(self, tokens):
    """Join a sequence of tokens (str) back into a single string.

    Runs of ordinary pieces are decoded with the SentencePiece model; special
    tokens are copied through verbatim so the model never tries to decode
    them. A single space is inserted before a special token unless the
    previous token was also special. The result is stripped of leading and
    trailing whitespace.

    Args:
        tokens: Iterable of token strings.

    Returns:
        The reconstructed text.
    """
    # Read the special-token list once: it does not change during the loop,
    # and a set gives constant-time membership tests.
    special_tokens = set(self.all_special_tokens)

    fragments = []
    pending_pieces = []
    prev_is_special = False
    for token in tokens:
        if token in special_tokens:
            if not prev_is_special:
                fragments.append(" ")
            # Flush the ordinary pieces collected so far, then emit the
            # special token as-is.
            fragments.append(self.sp_model.decode(pending_pieces))
            fragments.append(token)
            prev_is_special = True
            pending_pieces = []
        else:
            pending_pieces.append(token)
            prev_is_special = False
    fragments.append(self.sp_model.decode(pending_pieces))
    return "".join(fragments).strip()
269
+
270
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """Wrap one sequence, or a pair of sequences, in classifier/separator ids.

    Layout produced:

    - single sequence: `[CLS] X [SEP]`
    - pair of sequences: `[CLS] A [SEP] B [SEP]`

    Args:
        token_ids_0 (`List[int]`):
            Ids of the first (or only) sequence.
        token_ids_1 (`List[int]`, *optional*):
            Ids of the second sequence, if any.

    Returns:
        `List[int]`: The ids with the special-token ids inserted.
    """
    separator = [self.sep_token_id]
    wrapped = [self.cls_token_id] + token_ids_0 + separator
    if token_ids_1 is not None:
        wrapped = wrapped + token_ids_1 + separator
    return wrapped
294
+
295
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """Mark which positions of the final input hold special tokens.

    For sequences that do not yet contain special tokens, the mask mirrors the
    `[CLS] A [SEP]` / `[CLS] A [SEP] B [SEP]` layout. When
    `already_has_special_tokens` is true the request is delegated to the base
    class.

    Args:
        token_ids_0 (`List[int]`):
            Ids of the first (or only) sequence.
        token_ids_1 (`List[int]`, *optional*):
            Ids of the second sequence, if any.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether the ids already include the special tokens.

    Returns:
        `List[int]`: 1 at special-token positions, 0 at sequence-token positions.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is None:
        return mask
    return mask + [0] * len(token_ids_1) + [1]
322
+
323
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """Build segment ids matching the `[CLS] A [SEP] B [SEP]` layout.

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    The first segment (classifier token, first sequence, separator) is all
    0s; the second sequence and its trailing separator are all 1s. With no
    second sequence only the 0s are returned.

    Args:
        token_ids_0 (`List[int]`):
            Ids of the first (or only) sequence.
        token_ids_1 (`List[int]`, *optional*):
            Ids of the second sequence, if any.

    Returns:
        `List[int]`: One segment id per position of the final input.
    """
    separator = [self.sep_token_id]
    first_segment = [self.cls_token_id] + token_ids_0 + separator

    type_ids = [0] * len(first_segment)
    if token_ids_1 is not None:
        type_ids = type_ids + [1] * len(token_ids_1 + separator)
    return type_ids
352
+
353
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """Write the SentencePiece model file into `save_directory`.

    Args:
        save_directory: Existing directory to write into.
        filename_prefix: Optional prefix; when given, the file is named
            `<prefix>-<vocab file name>`.

    Returns:
        A 1-tuple with the path of the vocabulary file, or `None` when
        `save_directory` is not a directory (an error is logged in that case).
    """
    if not os.path.isdir(save_directory):
        # Report the problem instead of returning silently; the `None` return
        # is kept so existing callers behave as before.
        import logging

        logging.getLogger(__name__).error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    source_exists = os.path.isfile(self.vocab_file)
    if source_exists and os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
        copyfile(self.vocab_file, out_vocab_file)
    elif not source_exists:
        # The original model file is gone: serialise the in-memory model instead.
        with open(out_vocab_file, "wb") as fi:
            fi.write(self.sp_model.serialized_model_proto())

    return (out_vocab_file,)
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "30000": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "30001": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "30002": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "30003": {
36
+ "content": "[MASK]",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenization_transformerlm.TransformerLMTokenizer",
47
+ null
48
+ ]
49
+ },
50
+ "bos_token": "[CLS]",
51
+ "clean_up_tokenization_spaces": true,
52
+ "cls_token": "[CLS]",
53
+ "do_lower_case": true,
54
+ "eos_token": "[SEP]",
55
+ "keep_accents": false,
56
+ "mask_token": "[MASK]",
57
+ "model_max_length": 1000000000000000019884624838656,
58
+ "pad_token": "<pad>",
59
+ "remove_space": true,
60
+ "sep_token": "[SEP]",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "TransformerLMTokenizer",
63
+ "unk_token": "<unk>"
64
+ }