xu-song committed on
Commit
7156337
·
1 Parent(s): 309a593
app.py CHANGED
@@ -15,7 +15,8 @@
15
  - 英文 utf-8编码
16
  - 词典支持下载
17
  - 中文字词统计,是否要包括 _ G 等字符
18
-
 
19
 
20
 
21
  plots
@@ -137,7 +138,7 @@ with gr.Blocks(css="style.css") as demo:
137
  # )
138
  # https://www.onlinewebfonts.com/icon/418591
139
  gr.Image("images/VS.svg", scale=1, show_label=False,
140
- show_download_button=False, container=False,
141
  show_share_button=False)
142
  with gr.Column(scale=6):
143
  with gr.Group():
@@ -154,9 +155,9 @@ with gr.Blocks(css="style.css") as demo:
154
  lines=1,
155
  elem_classes="statistics"
156
  )
157
- stats_zh_token_size_2 = gr.TextArea( # 中文单子数,
158
  value=default_stats_zh_token_size_2,
159
- label="ZH char/word",
160
  lines=1,
161
  elem_classes="statistics"
162
  )
 
15
  - 英文 utf-8编码
16
  - 词典支持下载
17
  - 中文字词统计,是否要包括 _ G 等字符
18
+ - baichuan的单字数量怎么两万多个?
19
+ - gpt4
20
 
21
 
22
  plots
 
138
  # )
139
  # https://www.onlinewebfonts.com/icon/418591
140
  gr.Image("images/VS.svg", scale=1, show_label=False,
141
+ show_download_button=True, container=False,
142
  show_share_button=False)
143
  with gr.Column(scale=6):
144
  with gr.Group():
 
155
  lines=1,
156
  elem_classes="statistics"
157
  )
158
+ stats_zh_token_size_2 = gr.TextArea(
159
  value=default_stats_zh_token_size_2,
160
+ label="ZH char/word", # 中文字/词
161
  lines=1,
162
  elem_classes="statistics"
163
  )
vocab/gpt_4/__init__.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import tiktoken
4
+ from tiktoken import Encoding
5
+
6
# Module-level encoding object that tiktoken associates with the "gpt-4" model.
tokenizer = tiktoken.encoding_for_model('gpt-4')
# Mirror `n_vocab` under the `vocab_size` name; `get_vocab` in this module
# iterates over `self.vocab_size`.
tokenizer.vocab_size = tokenizer.n_vocab
8
+
9
+
10
+
11
def decode(self, tokens, errors="replace"):
    """Decode token ids into text, returning ``"null"`` when decoding fails.

    This function is assigned to ``Encoding.decode`` at the bottom of this
    module, so ``self`` is an ``Encoding`` instance.

    Args:
        tokens: Token ids to decode.
        errors: Error policy forwarded to ``bytes.decode`` for invalid UTF-8.

    Returns:
        The decoded string, or the literal ``"null"`` if decoding raised.
    """
    try:
        return self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
    except (KeyboardInterrupt, SystemExit):
        # Never turn a user interrupt or interpreter shutdown into "null".
        raise
    except BaseException:
        # Kept as broad as the original bare `except:` -- presumably so that
        # failures raised by the native decoder which do not derive from
        # `Exception` are still mapped to "null" (TODO confirm).
        return "null"
18
+
19
def convert_ids_to_tokens(self, tokens):
    """Return the raw byte string of every token id in ``tokens``.

    This function is assigned to ``Encoding.convert_ids_to_tokens`` at the
    bottom of this module and therefore becomes available on every
    ``Encoding`` instance. It decodes with ``self`` rather than the
    module-level ``tokenizer`` so that each encoding reports its own tokens.

    Args:
        tokens: Iterable of token ids.

    Returns:
        Whatever ``self.decode_tokens_bytes(tokens)`` returns (one ``bytes``
        value per id).
    """
    return self.decode_tokens_bytes(tokens)
21
+
22
def get_vocab(self):
    """Return the vocabulary as a ``{token_string: token_id}`` dict.

    Every id in ``range(self.vocab_size)`` is looked up through
    ``self.convert_ids_to_tokens``. Ids that raise ``KeyError`` and tokens
    whose bytes are not valid UTF-8 are reported on stdout and left out of
    the result.
    """
    vocab = {}
    for token_id in range(self.vocab_size):
        try:
            token_bytes = self.convert_ids_to_tokens([token_id])[0]
        except KeyError:
            # The log label previously said "gpt_35_turbo" (copy-paste from
            # another vocab module); this module wraps gpt-4.
            print("gpt_4 decode KeyError", token_id)
            continue
        try:
            token_str = token_bytes.decode("utf-8")
        except UnicodeDecodeError:
            print("gpt_4 decode UnicodeDecodeError", token_id, str(token_bytes))
            continue
        vocab[token_str] = token_id
    return vocab
36
+
37
+
38
# Attach the helpers defined above to tiktoken's Encoding class so they can be
# called as methods on encoding objects (including `tokenizer` above).
Encoding.decode = decode
Encoding.convert_ids_to_tokens = convert_ids_to_tokens
Encoding.get_vocab = get_vocab
41
+
42
+
vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/gpt_neox_chinese_v1/README.md CHANGED
@@ -10,3 +10,55 @@ Vocab size: 54634
10
 
11
  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
12
 
13
+
14
+ ## 20B
15
+
16
+ [configs/20B.yml](https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml#L7)
17
+ ```
18
+ "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
19
+ ```
20
+
21
+ Vocab size: 50277
22
+ self.padded_vocab_size = 50304
23
+
24
+
25
+ padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
26
+
27
+ ## 词典
28
+
29
+ 见 convert_vocab_to_txt.py
30
+
31
+ ```
32
+ {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
33
+
34
+ # 多个符号拼接在一起的
35
+ {"id": 13663, "token": ".*]{}", "token_decode": ".*]{}"} .*]{}
36
+
37
+ # ss
38
+
39
+ ```
40
+
41
+
42
+ ## 中文支持
43
+
44
+ 基本没有OOV。
45
+
46
+ gpt-neox是在800G英文数据集上训练的,为啥词典支持中文?因为是byte-level BPE
47
+
48
+ ```
49
+ 丁 [3218, 212]
50
+ 七 [3218, 214]
51
+ 万 [3218, 218]
52
+ 诀 [11894, 211]
53
+ 证 [11894, 212]
54
+ ```
55
+
56
+
57
+ 编码长度统计: Counter({2: 4190, 3: 1295, 1: 285})
58
+ 平均编码长度: 2.1750433275563257
59
+
60
+
61
+ ## ss
62
+
63
+
64
+
vocab/gpt_neox_chinese_v1/mock.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+
4
input_path = "20B_tokenizer_chinese.json"


def mock_vocab(vocab):
    """Return a new vocab in which every token string is replaced by ``str(id)``.

    A fresh dict is built instead of renaming keys in place: when ``str(id)``
    of one entry equals the token string of another entry (for example the
    token ``"1"``), in-place renaming overwrites and then pops that entry, so
    ids silently disappear from the result.

    Args:
        vocab: Mapping of token string to token id.

    Returns:
        A dict ``{str(token_id): token_id}`` in the original iteration order.
    """
    return {str(token_id): token_id for token_id in vocab.values()}


def main():
    """Read ``input_path``, mock its ``model.vocab`` and write ``*.mock.json``."""
    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    tokenizer["model"]["vocab"] = mock_vocab(tokenizer["model"]["vocab"])

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
vocab/gpt_neox_chinese_v1/tokenizer/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from .tokenizer import build_tokenizer
vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021, EleutherAI
2
+ # This file is based on code by the authors denoted below and has been modified from its original version.
3
+ #
4
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Tokenization classes for OpenAI GPT."""
19
+
20
+ from __future__ import absolute_import, division, print_function, unicode_literals
21
+
22
+ import sys
23
+ import json
24
+ import logging
25
+ import os
26
+ import regex as re
27
+ from io import open
28
+
29
+ from functools import lru_cache
30
+
31
+
32
# Module-level logger used by GPT2Tokenizer for info/warning/error messages.
logger = logging.getLogger(__name__)

# Vocabulary file location for each model name that
# GPT2Tokenizer.from_pretrained recognises by name.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
# Merges file location for the same model names.
PRETRAINED_MERGES_ARCHIVE_MAP = {
    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
# Upper bound that from_pretrained applies to `max_len` for each named model.
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    "gpt2": 1024,
}

# File names looked up inside a directory passed to from_pretrained and
# written by save_vocabulary.
VOCAB_NAME = "vocab.json"
MERGES_NAME = "merges.txt"
SPECIAL_TOKENS_NAME = "special_tokens.txt"
47
+
48
+
49
@lru_cache()
def bytes_to_unicode():
    """
    Build the byte -> unicode-character table used by the byte-level BPE.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character
    with the same code point. Every other byte value is mapped, in increasing
    byte order, to the code points 256, 257, ... so that no byte is
    represented by one of the whitespace/control characters the BPE code
    cannot handle.

    Returns a dict from int byte value (0-255) to a one-character string.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
    identity_bytes = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    keep_as_is = frozenset(identity_bytes)

    byte_values = list(identity_bytes)
    code_points = list(identity_bytes)
    shifted = 0
    for byte in range(2**8):
        if byte in keep_as_is:
            continue
        # Remaining bytes are shifted above the single-byte range.
        byte_values.append(byte)
        code_points.append(2**8 + shifted)
        shifted += 1

    return dict(zip(byte_values, (_chr(point) for point in code_points)))
75
+
76
+
77
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as a tuple of symbols (symbols being variable-length
    strings). An empty or single-symbol word has no pairs and yields an empty
    set (an empty word previously raised ``IndexError``).
    """
    return set(zip(word, word[1:]))
88
+
89
+
90
class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
    ):
        """
        Instantiate a tokenizer from a known model name or from a directory.

        A name listed in PRETRAINED_VOCAB_ARCHIVE_MAP is resolved to its vocab
        and merges locations; anything else is treated as a directory holding
        VOCAB_NAME, MERGES_NAME and, optionally, SPECIAL_TOKENS_NAME. Both
        files are passed through ``cached_path`` before loading.

        Returns the tokenizer, or None when ``cached_path`` raises
        EnvironmentError (an error is logged in that case).
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
            special_tokens_file = os.path.join(
                pretrained_model_name_or_path, SPECIAL_TOKENS_NAME
            )
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info(
                    "loading special tokens file {}".format(special_tokens_file)
                )
        # redirect to the cache, if necessary
        try:
            from .file_utils import cached_path

            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} and {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    vocab_file,
                    merges_file,
                )
            )
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info(
                "loading vocabulary file {} from cache at {}".format(
                    vocab_file, resolved_vocab_file
                )
            )
            logger.info(
                "loading merges file {} from cache at {}".format(
                    merges_file, resolved_merges_file
                )
            )
        if (
            pretrained_model_name_or_path
            in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
        ):
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path
            ]
            kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and "special_tokens" not in kwargs:
            # One special token per line; the text after the final newline is
            # dropped. The file is closed as soon as it has been read.
            with open(special_tokens_file, encoding="utf-8") as reader:
                special_tokens = reader.read().split("\n")[:-1]
        else:
            special_tokens = kwargs.pop("special_tokens", [])
        tokenizer = cls(
            resolved_vocab_file,
            resolved_merges_file,
            special_tokens=special_tokens,
            *inputs,
            **kwargs
        )
        return tokenizer

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        special_tokens=None,
        max_len=None,
    ):
        """
        Load the vocabulary and the BPE merge table.

        vocab_file: path of a JSON file mapping token string -> id.
        merges_file: path of a merges file; its first line and the text after
            the final newline are skipped, every other line is one merge.
        errors: error policy used by ``decode`` for invalid UTF-8.
        special_tokens: optional list of extra tokens appended after the vocab.
        max_len: sequence length above which ``convert_tokens_to_ids`` logs a
            warning; None means int(1e12).
        """
        self.max_len = max_len if max_len is not None else int(1e12)
        # Read as UTF-8 explicitly: save_vocabulary writes this file as UTF-8
        # with ensure_ascii=False, so relying on the locale encoding can fail.
        with open(vocab_file, encoding="utf-8") as vocab_reader:
            self.encoder = json.load(vocab_reader)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_reader:
            bpe_data = merges_reader.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))

        # Should haved added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        """Return the number of vocabulary entries plus special tokens."""
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict(
            (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)
        )
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))

    # NOTE(review): lru_cache on a method keys on `self` and keeps every
    # instance alive for the lifetime of the cache; left unchanged here.
    @lru_cache(maxsize=131072)
    def bpe(self, token):
        """
        Apply the learned merges to one byte-encoded token.

        Returns the merged symbols joined by single spaces, or ``token``
        itself when it has no symbol pairs.
        """
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the pair with the lowest merge rank; unknown pairs rank last.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        return word

    def tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            # Map each byte of the token to its printable stand-in character.
            if sys.version_info[0] == 2:
                token = "".join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab.

        A single string is converted to a single id; tokens missing from both
        the special tokens and the vocab map to id 0.
        """
        ids = []
        if isinstance(tokens, str) or (
            sys.version_info[0] == 2 and isinstance(tokens, unicode)
        ):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".format(
                    len(ids), self.max_len
                )
            )
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        """Tokenize ``text`` and return the corresponding list of ids."""
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        """Convert ids back to text via the vocab and the byte table.

        Only ``self.decoder`` is consulted, so ids of special tokens raise
        KeyError here.
        """
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", errors=self.errors
        )
        return text

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary and merge files to a directory.

        Returns the three written paths, or None (after logging an error)
        when ``vocab_path`` is not a directory.
        """
        if not os.path.isdir(vocab_path):
            logger.error(
                "Vocabulary path ({}) should be a directory".format(vocab_path)
            )
            return
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        merge_file = os.path.join(vocab_path, MERGES_NAME)
        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(
                self.bpe_ranks.items(), key=lambda kv: kv[1]
            ):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(
                            merge_file
                        )
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        index = len(self.encoder)
        with open(special_tokens_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(
                self.special_tokens.items(), key=lambda kv: kv[1]
            ):
                if index != token_index:
                    logger.warning(
                        "Saving special tokens vocabulary to {}: BPE indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(
                            special_tokens_file
                        )
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return vocab_file, merge_file, special_tokens_file
vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021, EleutherAI
2
+ # This file is based on code by the authors denoted below and has been modified from its original version.
3
+ #
4
+ # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Megatron tokenizers."""
19
+
20
+ from abc import ABC
21
+ from abc import abstractmethod
22
+
23
+ from tokenizers import Tokenizer
24
+ from transformers import GPT2Tokenizer, GPT2TokenizerFast
25
+ import numpy as np
26
+ import sentencepiece as spm
27
+ from typing import List, Union
28
+ from .gpt2_tokenization import GPT2Tokenizer
29
+
30
+
31
def build_tokenizer(args):
    """Build the tokenizer selected by ``args.tokenizer_type``.

    The type name is matched case-insensitively. After construction the
    padded vocabulary size is stored on ``args.padded_vocab_size``.

    Raises NotImplementedError for an unknown tokenizer type.
    """
    requested = args.tokenizer_type
    if args.rank == 0:
        print("> building {} tokenizer ...".format(requested), flush=True)

    # Select and instantiate the tokenizer.
    kind = requested.lower()
    if kind == "GPT2BPETokenizer".lower():
        assert args.vocab_file is not None
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    elif kind == "SPMTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = SentencePieceTokenizer(args.vocab_file)
    elif kind == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif kind == "HFGPT2Tokenizer".lower():
        if args.vocab_file is None:
            print(
                "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
            )
        tokenizer = HFGPT2Tokenizer(args.vocab_file)
    elif kind == "CharLevelTokenizer".lower():
        tokenizer = CharLevelTokenizer(vocab_size=512)
    elif kind == "TiktokenTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = TiktokenTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not " "implemented.".format(requested)
        )

    # Add vocab size.
    padded = _vocab_size_with_padding(tokenizer.vocab_size, args)
    args.padded_vocab_size = padded

    return tokenizer
67
+
68
+
69
+ def _vocab_size_with_padding(orig_vocab_size, args):
70
+ """Pad vocab size so it is divisible by model parallel size and
71
+ still having GPU friendly size."""
72
+
73
+ after = orig_vocab_size
74
+ multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
75
+ while (after % multiple) != 0:
76
+ after += 1
77
+ if args.rank == 0:
78
+ print(
79
+ " > padded vocab (size: {}) with {} dummy tokens "
80
+ "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
81
+ flush=True,
82
+ )
83
+ return after
84
+
85
+
86
class AbstractTokenizer(ABC):
    """Base interface shared by all tokenizer wrappers in this module.

    Subclasses must provide ``vocab_size``, ``vocab``, ``inv_vocab`` and
    ``tokenize``. ``detokenize`` and the special-token properties raise
    NotImplementedError unless a subclass overrides them.
    """

    def __init__(self, name):
        # Human-readable tokenizer name, used in the error messages below.
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        """Convert ``text`` into token ids."""
        pass

    def detokenize(self, token_ids):
        """Convert token ids back into text (not implemented by default)."""
        raise NotImplementedError(
            f"detokenizer is not implemented for {self.name} tokenizer"
        )

    @property
    def cls(self):
        """Id of the CLS token (not provided by default)."""
        raise NotImplementedError(f"CLS is not provided for {self.name} tokenizer")

    @property
    def sep(self):
        """Id of the SEP token (not provided by default)."""
        raise NotImplementedError(f"SEP is not provided for {self.name} tokenizer")

    @property
    def pad(self):
        """Id of the PAD token (not provided by default)."""
        raise NotImplementedError(f"PAD is not provided for {self.name} tokenizer")

    @property
    def eod(self):
        """Id of the end-of-document token (not provided by default)."""
        raise NotImplementedError(f"EOD is not provided for {self.name} tokenizer")

    @property
    def mask(self):
        """Id of the MASK token (not provided by default)."""
        raise NotImplementedError(f"MASK is not provided for {self.name} tokenizer")
147
+ )
148
+
149
+
150
class _GPT2BPETokenizer(AbstractTokenizer):
    """Wrapper around the GPT2Tokenizer BPE implementation imported by this module."""

    def __init__(self, vocab_file, merge_file):
        super().__init__("GPT2 BPE")

        bpe_tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.tokenizer = bpe_tokenizer
        # The end-of-document id is the vocab entry for "<|endoftext|>".
        self.eod_id = bpe_tokenizer.encoder["<|endoftext|>"]

    @property
    def vocab_size(self):
        """Number of entries in the underlying encoder."""
        encoder = self.tokenizer.encoder
        return len(encoder)

    @property
    def vocab(self):
        """The underlying token -> id mapping."""
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        """The underlying id -> token mapping."""
        return self.tokenizer.decoder

    def tokenize(self, text):
        """Encode ``text`` into token ids."""
        token_ids = self.tokenizer.encode(text)
        return token_ids

    def detokenize(self, token_ids):
        """Decode token ids back into text."""
        text = self.tokenizer.decode(token_ids)
        return text

    @property
    def eod(self):
        """Id of the "<|endoftext|>" token."""
        return self.eod_id
183
+
184
+
185
class SentencePieceTokenizer(AbstractTokenizer):
    """Wrapper around a SentencePiece model file."""

    def __init__(self, vocab_file):
        super().__init__("SPM")

        processor = spm.SentencePieceProcessor(model_file=vocab_file)
        self.tokenizer = processor
        self.eod_id = processor.piece_to_id("<|endoftext|>")

    @property
    def vocab_size(self):
        """Number of pieces in the model."""
        return self.tokenizer.get_piece_size()

    @property
    def vocab(self):
        """Piece -> id mapping, rebuilt on every access."""
        piece_to_id = {}
        for idx in range(self.tokenizer.get_piece_size()):
            piece_to_id[self.tokenizer.id_to_piece(idx)] = idx
        return piece_to_id

    @property
    def inv_vocab(self):
        """Id -> piece mapping, rebuilt on every access."""
        id_to_piece = {}
        for idx in range(self.tokenizer.get_piece_size()):
            id_to_piece[idx] = self.tokenizer.id_to_piece(idx)
        return id_to_piece

    def tokenize(self, text):
        """Encode ``text`` with the SentencePiece model."""
        encoded = self.tokenizer.encode(text)
        return encoded

    def detokenize(self, token_ids):
        """Decode token ids with the SentencePiece model."""
        decoded = self.tokenizer.decode(token_ids)
        return decoded

    @property
    def eod(self):
        """Id that the model reports for the piece "<|endoftext|>"."""
        return self.eod_id
222
+
223
+
224
class HFTokenizer(AbstractTokenizer):
    """Designed to Integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        """Load a tokenizer from the file ``vocab_file`` via ``Tokenizer.from_file``."""
        name = "HFTokenizer"
        super().__init__(name)

        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        """Vocabulary size reported by the wrapped tokenizer."""
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        """Token -> id mapping reported by the wrapped tokenizer."""
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        """Id -> token mapping, built by inverting ``get_vocab()``.

        ``Tokenizer.decoder`` (returned here previously) is the tokenizer's
        Decoder component, not a dictionary, so it did not satisfy the
        ``inv_vocab`` contract of AbstractTokenizer.
        """
        return {idx: token for token, idx in self.tokenizer.get_vocab().items()}

    def tokenize(self, text: str):
        """Encode ``text`` and return its list of token ids."""
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        """Encode a batch; returns the result of ``encode_batch`` unchanged.

        NOTE(review): unlike ``tokenize`` this does not extract ``.ids``;
        confirm what callers expect before changing it.
        """
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        """Decode token ids back into text."""
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        """Id of the "<|endoftext|>" token."""
        return self.eod_id
259
+
260
+
261
class HFGPT2Tokenizer(AbstractTokenizer):
    """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""

    def __init__(self, vocab_file=None, fast=True):
        """Load a pretrained GPT-2 tokenizer from transformers.

        vocab_file: name or path passed to ``from_pretrained``; None means "gpt2".
        fast: use GPT2TokenizerFast when True, the slow GPT2Tokenizer otherwise.
        """
        name = "HFGPT2Tokenizer"
        if fast:
            name += "Fast"
        super().__init__(name)
        if vocab_file is None:
            vocab_file = "gpt2"
        if fast:
            self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
        else:
            # The module-level name `GPT2Tokenizer` is rebound by the later
            # `from .gpt2_tokenization import GPT2Tokenizer`, and that local
            # class has no add_special_tokens / eos_token_id. Import the
            # transformers class explicitly under a private alias.
            from transformers import GPT2Tokenizer as _HFSlowGPT2Tokenizer

            self.tokenizer = _HFSlowGPT2Tokenizer.from_pretrained(vocab_file)

        self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
        self.eod_id = self.tokenizer.eos_token_id
        self.pad_id = self.tokenizer.pad_token_id

    @property
    def vocab_size(self):
        """Length of the wrapped tokenizer (``len(self.tokenizer)``)."""
        return len(self.tokenizer)

    @property
    def vocab(self):
        """Token -> id mapping reported by the wrapped tokenizer."""
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        """Id -> token mapping, built by inverting ``get_vocab()``.

        The previous ``self.tokenizer._tokenizer.decoder`` is a Decoder
        component rather than a dictionary, and ``_tokenizer`` is only
        reached through the fast tokenizer.
        """
        return {idx: token for token, idx in self.tokenizer.get_vocab().items()}

    def tokenize(self, text: str):
        """Encode ``text`` into token ids."""
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        """Encode each string of a batch; a single string is treated as a batch of one."""
        if isinstance(text_batch, str):
            text_batch = [text_batch]
        return [self.tokenize(t) for t in text_batch]

    def detokenize(self, token_ids):
        """Decode token ids back into text."""
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        """The wrapped tokenizer's ``eos_token_id``."""
        return self.eod_id
306
+
307
+
308
class CharLevelTokenizer(AbstractTokenizer):
    """Character Level Tokenizer"""

    def __init__(self, vocab_size):
        """Store the vocabulary size; ids 0 and 1 are used for EOD and PAD."""
        name = "CharLevelTokenizer"
        super().__init__(name)
        self._vocab_size = vocab_size
        self.eod_id = 0
        self.pad_id = 1

    def clamp(self, n):
        """Limit ``n`` to the range [32, vocab_size]."""
        return max(32, min(n, self.vocab_size))

    @property
    def vocab_size(self):
        """The vocabulary size given at construction."""
        return self._vocab_size

    @property
    def vocab(self):
        """Not available for this tokenizer."""
        raise NotImplementedError

    @property
    def inv_vocab(self):
        """Not available for this tokenizer."""
        raise NotImplementedError

    def decode_token(self, token: int):
        """Return the character for a token id after clamping it."""
        return str(chr(self.clamp(token)))

    def tokenize(self, text: str):
        """Return the UTF-8 bytes of ``text`` as a list of ``np.uint8`` values.

        ``np.frombuffer`` over the encoded bytes replaces the deprecated
        binary mode of ``np.fromstring``.
        """
        return list(np.frombuffer(text.encode("utf-8"), dtype=np.uint8))

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        """Tokenize a list of strings element-wise, or a single string directly."""
        if isinstance(text_batch, list):
            return [self.tokenize(s) for s in text_batch]
        else:
            return self.tokenize(text_batch)

    def detokenize(self, token_ids):
        """Join the characters decoded from each token id."""
        return "".join(list(map(self.decode_token, token_ids)))

    @property
    def eod(self):
        """End-of-document id (0)."""
        return self.eod_id
351
+
352
+
353
class TiktokenTokenizer(AbstractTokenizer):
    """Tokenizer from OpenAI's tiktoken implementation"""

    def __init__(self, vocab_file):
        """Load the tiktoken encoding named ``vocab_file``.

        Raises:
            Exception: when tiktoken is not installed; the original
                ModuleNotFoundError is attached as the cause.
        """
        try:
            import tiktoken
        except ModuleNotFoundError as err:
            message = "Please install tiktoken: (https://github.com/openai/tiktoken)"
            print(message)
            # Same exception type as before, now with a message and the cause.
            raise Exception(message) from err

        name = "TiktokenTokenizer"
        super().__init__(name)

        self.tokenizer = tiktoken.get_encoding(vocab_file)
        self.eod_id = self.tokenizer.eot_token
        self.pad_id = None

    @property
    def vocab_size(self):
        """The encoding's ``n_vocab``."""
        return self.tokenizer.n_vocab

    @property
    def vocab(self):
        """Not available for this tokenizer."""
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access."
        )

    @property
    def inv_vocab(self):
        """Not available for this tokenizer."""
        # Implicit concatenation instead of a backslash continuation, so the
        # message no longer embeds the source indentation.
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access. "
            "To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
        )

    def tokenize(self, text: str):
        """Encode ``text`` with the encoding's default special-token handling."""
        return self.tokenizer.encode(text)  # ,  allowed_special="all")

    def tokenize_batch(self, text_batch: List[str]):
        """Encode a batch of strings with ``allowed_special="all"``."""
        return self.tokenizer.encode_batch(text_batch, allowed_special="all")

    def detokenize(self, token_ids):
        """Decode token ids with strict UTF-8 error handling."""
        return self.tokenizer.decode(tokens=token_ids, errors="strict")

    @property
    def eod(self):
        """The encoding's end-of-text token id."""
        return self.eod_id

    @property
    def pad(self):
        """No padding token is provided."""
        raise NotImplementedError
vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2021, EleutherAI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Assumes a dataset of jsonl files in the same format as the neox training set.
17
+ """
18
+
19
+ from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
20
+ from tokenizers.normalizers import NFKC
21
+
22
+ from glob import glob
23
+ import os
24
+ import json
25
+ import argparse
26
+
27
+
28
def load_jsonl(input_path, quiet=True) -> list:
    """
    Read list of objects from a JSON lines file.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to the JSON parser.

    :param input_path: path of the UTF-8 encoded JSON lines file
    :param quiet: when False, print how many records were loaded
    :return: list with one decoded object per non-empty line
    :raises json.JSONDecodeError: if a non-empty line is not valid JSON
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # strip() removes the line terminator ("\n" or "\r\n"); the old
            # rstrip("\n|\r") also stripped literal "|" characters.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    if not quiet:
        print("Loaded {} records from {}".format(len(data), input_path))
    return data
39
+
40
+
41
def json_iterator(input_dir, text_key="text"):
    """Yield the ``text_key`` field of every record found in ``input_dir``.

    All ``*.jsonl`` files are visited first, then all ``*.json`` files; each
    file is read with ``load_jsonl``.

    :param input_dir: directory to scan (not recursive)
    :param text_key: key of the field to yield from each record
    """
    jsonl_paths = glob(f"{input_dir}/*.jsonl")
    json_paths = glob(f"{input_dir}/*.json")
    for path in jsonl_paths + json_paths:
        yield from (record[text_key] for record in load_jsonl(path))
47
+
48
+
49
def train_tokenizer(
    input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
):
    """
    Train a tokenizer on the json/jsonl files in `input_dir` and write it to `save_path`.

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train; only "BPE" is implemented
    :param vocab_size: int, size of tokenizer's vocab
    :return: None
    :raises NotImplementedError: if `tokenizer_type` is not "BPE"
    """
    # Reject unsupported types before building anything.
    if tokenizer_type != "BPE":
        raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")

    tokenizer = Tokenizer(models.BPE())

    # Byte-level pre-tokenization / decoding / post-processing, NFKC normalization.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # Train on the documents streamed from the input directory.
    special_tokens = ["<|endoftext|>", "<|padding|>"]
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=special_tokens
    )
    documents = json_iterator(input_dir)
    tokenizer.train_from_iterator(documents, bpe_trainer)

    # Persist the trained tokenizer as pretty-printed JSON.
    tokenizer.save(save_path, pretty=True)
    print(f"Tokenizer saved at {save_path}")
83
+
84
+
85
def parse_args():
    """Parse the command-line arguments of the tokenizer training script.

    :return: ``argparse.Namespace`` with ``json_input_dir``,
        ``tokenizer_output_path``, ``tokenizer_type`` and ``vocab_size``
    """
    parser = argparse.ArgumentParser(
        description="script for training a multilingual "
        "HF tokenizer on CC dumps with upweighting for low resource languages"
    )
    parser.add_argument(
        "--json_input_dir",
        type=str,
        help="Path to folder containing tokenizer training data in jsonl format",
    )
    parser.add_argument(
        "--tokenizer_output_path",
        type=str,
        help="Path to which your trained tokenizer will be saved (should end in .json)",
    )
    parser.add_argument(
        "--tokenizer_type",
        type=str,
        help="type of tokenizer to train, currently only BPE is supported",
        choices=["BPE"],
        # Must be the string, not a list: train_tokenizer compares the value
        # against "BPE", so a list default fails that check.
        default="BPE",
    )
    parser.add_argument(
        "-v",
        "--vocab_size",
        help="vocabulary size of tokenizer, default=52k",
        type=int,
        default=52000,
    )
    return parser.parse_args()
115
+
116
+
117
if __name__ == "__main__":
    # Script entry point: read the CLI options and forward them to training.
    cli_args = parse_args()
    train_tokenizer(
        input_dir=cli_args.json_input_dir,
        save_path=cli_args.tokenizer_output_path,
        tokenizer_type=cli_args.tokenizer_type,
        vocab_size=cli_args.vocab_size,
    )