fix tiktoken
- examples.py +22 -5
- tokenizer/tiktoken_patch.py +69 -0
- util.py +1 -0
- vocab/__init__.py +1 -1
- vocab/gpt_35_turbo/__init__.py +2 -69
- vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json +0 -0
- vocab/gpt_neox_chinese_v1/README.md +0 -64
- vocab/gpt_neox_chinese_v1/__init__.py +0 -14
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py +0 -61
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py +0 -50
- vocab/gpt_neox_chinese_v1/mock.py +0 -32
- vocab/gpt_neox_chinese_v1/test_tokenizer.py +0 -43
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/README.md +0 -3
- vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +0 -185
- vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py +0 -205
- vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/oov.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/sort_test.py +0 -18
- vocab/gpt_neox_chinese_v1/to_v2/test2.py +0 -42
- vocab/gpt_neox_chinese_v1/to_v2/test_oov.py +0 -69
- vocab/gpt_neox_chinese_v1/to_v2/test_queue.py +0 -20
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt +0 -0
- vocab/gpt_neox_chinese_v1/tokenizer/__init__.py +0 -16
- vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py +0 -368
- vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py +0 -402
- vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py +0 -126
- vocab/gpt_neox_chinese_v1/trouble-shooting.md +0 -22
- vocab/moss/__init__.py +1 -1
- vocab/text_davinci_003/__init__.py +14 -59
examples.py
CHANGED
@@ -1,12 +1,30 @@
+"""
+
+## characters
+
+- alphanumeric characters
+- numeric characters
+- special characters: A special character is a character that is not an alphabetic or numeric character.
+- ASCII control characters
+- punctuation marks
+- accent marks
+- 数学符号
+- whitespace:
+  - https://en.wikipedia.org/wiki/Whitespace_character
+  - https://emptycharacter.com/
+
+
+https://www.computerhope.com/jargon/s/specchar.htm
+"""
+
 examples = {
     "en": [
-        ["
+        ["number: (10086 + 98) = 100184", "llama", "bloom"],
+        ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
-
-        ]
-    ,
+    ],
     "zh": [
         ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
         ["标点测试:,。!?;", "baichuan_7b", "llama"],
@@ -14,7 +32,6 @@ examples = {
         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
     ]
-
 }
 
 more_examples = [
tokenizer/tiktoken_patch.py
ADDED
@@ -0,0 +1,69 @@
+
+from tiktoken import Encoding
+from utils.log_util import logger
+
+def decode(self, tokens, errors="replace", skip_special_tokens=False):
+    """
+    默认的decode,可能会报错,详见 decode_test.py
+    skip_special_tokens 是为了兼容 hf_tokenizer
+    """
+    try:
+        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
+    except:
+        decode_str = "null"
+    return decode_str
+
+
+def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
+    """
+    为什么没有这个方法?
+    """
+    try:
+        return self.decode_tokens_bytes(tokens)
+    except Exception as e:
+        # 什么要返回None?见zh_util.py
+        # 16个空闲id, 100256 100261-100275
+        logger.error(e)
+        return [None for _ in tokens]
+
+
+def get_vocab(self, token_type="str"):
+    """Returns vocab as a dict
+    :param token_type: ["str", "byte"]
+    :return:
+    """
+    vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            if token_byte is None:
+                continue
+            # token_str = token_byte.decode("utf-8")
+            vocab[token_byte] = i
+
+        except UnicodeDecodeError:  # 773 UnicodeDecodeError
+            unicode_decode_error_list.append((i, str(token_byte)))
+            vocab[token_byte] = i
+
+    # vocab.update(self.added_tokens_encoder)
+    logger.info(f"{self.name} {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"{self.name} {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
+    return vocab
+
+
+def encode(self, *args, **kwargs):
+    """
+    add_special_token 是为了兼容 hf_tokenizer
+    """
+    kwargs.pop("add_special_tokens", None)
+    return self._encode(*args, **kwargs)
+
+
+# tiktoken patch
+Encoding._encode = Encoding.encode
+Encoding.encode = encode
+Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab
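The new module fixes tiktoken by monkey-patching `tiktoken.Encoding` at import time: `encode` learns to ignore the HF-style `add_special_tokens` kwarg, `decode` returns "null" instead of raising, and `convert_ids_to_tokens` / `get_vocab` are added for compatibility with HF tokenizers. A minimal usage sketch, assuming the repo root is on the Python path so that `tokenizer.tiktoken_patch` and `utils.log_util` import (the sample text is illustrative):

```
import tiktoken
import tokenizer.tiktoken_patch  # side-effect import: rebinds methods on tiktoken.Encoding

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
ids = enc.encode("hello world", add_special_tokens=False)  # extra kwarg is now dropped
print(enc.convert_ids_to_tokens(ids))  # token bytes, or None for unused ids
print(enc.decode(ids))
```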
util.py
CHANGED
@@ -52,6 +52,7 @@ def tokenize(text, tokenizer_type, color_num=5):
         # continue
 
         # ⭐
+        # TODO: gpt3.5_turbo错误: 只有id和text是对的,token和 utf8都是错的。说明 convert_ids_to_tokens 出错了。
         table.append(
             {"TokenID": token_id,
              "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
vocab/__init__.py
CHANGED
@@ -85,7 +85,7 @@ all_tokenizers = [
     # "gpt_neox_chinese_v1",
     #
     # ##### glm系列
-    "glm_chinese",
+    # "glm_chinese",
     "chatglm_6b",
     "chatglm2_6b",
     "chatglm3_6b",
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -1,10 +1,9 @@
 """
-
+
 """
 
 import tiktoken
-
-from utils.log_util import logger
+import tokenizer.tiktoken_patch
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.vocab_size = tokenizer.n_vocab
@@ -12,69 +11,3 @@ tokenizer.vocab_size = tokenizer.n_vocab
 tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
 tokenizer.reversible = True  # It's reversible and lossless, so you can convert tokens back into the original text
 
-
-def decode(self, tokens, errors="replace", skip_special_tokens=False):
-    """
-    默认的decode,可能会报错,详见 decode_test.py
-    skip_special_tokens 是为了兼容 hf_tokenizer
-    """
-    try:
-        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-    except:
-        decode_str = "null"
-    return decode_str
-
-
-def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
-    """
-    为什么没有这个方法?
-    """
-    try:
-        return self.decode_tokens_bytes(tokens)
-    except Exception as e:
-        # 什么要返回None?见zh_util.py
-        # 16个空闲id, 100256 100261-100275
-        logger.error(e)
-        return [None for _ in tokens]
-
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-        except UnicodeDecodeError:  # 773 UnicodeDecodeError
-            unicode_decode_error_list.append((i, str(token_byte)))
-            vocab[token_byte] = i
-
-    # vocab.update(self.added_tokens_encoder)
-    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
-    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-
-def encode(self, *args, **kwargs):
-    """
-    add_special_token 是为了兼容 hf_tokenizer
-    """
-    kwargs.pop("add_special_tokens", None)
-    return self._encode(*args, **kwargs)
-
-
-# tiktoken patch
-Encoding._encode = Encoding.encode
-Encoding.encode = encode
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab
vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/README.md
DELETED
@@ -1,64 +0,0 @@
-
-
-```
-added vocab (size: 54634) with 22 dummy tokens (new size: 54656)
-Vocab size: 54634
-
-训练数据
-```
-
-
-https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
-
-
-## 20B
-
-[configs/20B.yml](https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml#L7)
-```
-"vocab-file": "./20B_checkpoints/20B_tokenizer.json",
-```
-
-Vocab size: 50277
-self.padded_vocab_size = 50304
-
-
-padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
-
-## 词典
-
-见 convert_vocab_to_txt.py
-
-```
-{"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
-
-# 多个符号拼接在一起的
-{"id": 13663, "token": ".*]{}", "token_decode": ".*]{}"} .*]{}
-
-# ss
-
-```
-
-
-## 中文支持
-
-基本没有OOV。
-
-gpt-neox是在800G英文数据集上训练的,为啥词典支持中文?因为是byte-level BPE
-
-```
-丁 [3218, 212]
-七 [3218, 214]
-万 [3218, 218]
-诀 [11894, 211]
-证 [11894, 212]
-```
-
-
-编码长度统计: Counter({2: 4190, 3: 1295, 1: 285})
-平均编码长度: 2.1750433275563257
-
-
-## ss
-
-
-
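The deleted README above attributes the Chinese coverage to byte-level BPE: although the GPT-NeoX vocab was trained on English data, every character can still be spelled out as 2-3 byte-level tokens (e.g. 丁 → [3218, 212]). A small sketch of how that per-character statistic could be reproduced with the tokenizers library, assuming a local copy of the 20B tokenizer JSON (the path and character sample below are assumptions):

```
# Sketch only: count how many byte-level ids each Chinese character needs.
from collections import Counter
from tokenizers import Tokenizer

tok = Tokenizer.from_file("20B_tokenizer.json")  # assumed local path
lengths = Counter()
for ch in "丁七万诀证":
    ids = tok.encode(ch).ids
    lengths[len(ids)] += 1
    print(ch, ids)
print(lengths)  # the README reports Counter({2: 4190, 3: 1295, 1: 285}) over its full character set
```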
vocab/gpt_neox_chinese_v1/__init__.py
DELETED
@@ -1,14 +0,0 @@
-
-import os
-from tokenizers import Tokenizer
-
-
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "20B_tokenizer_chinese.json")
-
-tokenizer = Tokenizer.from_file(TOKENIZER_DIR)
-
-tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
-
-# vocab_size = len(tokenizer.get_vocab())
-# vocab_size = tokenizer.vocab_size
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py
DELETED
@@ -1,61 +0,0 @@
-"""
-merge 是干嘛的?
-
-## 结果
-
-共merge 4357 个 token
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(vocab_path):
-    data = json.load(open(vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer("../gpt_nexo_20b/20B_tokenizer.json")
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
-以下采用方式一。
-"""
-new_added_tokens = {}
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # 3个的,怎么处理?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        new_added_tokens[word] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-with open("20B_tokenizer_chinese.json", "w", encoding="utf-8") as f_out:
-    json.dump(data, f_out, indent=2)
-
-## check
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-all_error_ids = []
-for word, idx in new_added_tokens.items():
-    decode_str = tokenizer.decode([idx])
-    if word != decode_str:
-        all_error_ids.append(idx)
-        print(idx, word, decode_str)
-
-print(all_error_ids)
-
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py
DELETED
@@ -1,50 +0,0 @@
-"""
-merge 是干嘛的?
-
-## 结果
-
-共merge 4357 个 token
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer():
-    old_vocab_path = "../gpt_nexo_20b/20B_tokenizer.json"
-    data = json.load(open(old_vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(old_vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer()
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
-以下采用方式一。
-"""
-new_added_tokens = set()
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # 3个的,怎么处理?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-        new_added_tokens.add(word)
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-f_out = open("20B_tokenizer_chinese_2.json", "w", encoding="utf-8")
-
-json.dump(data, f_out, indent=2)
vocab/gpt_neox_chinese_v1/mock.py
DELETED
@@ -1,32 +0,0 @@
-import copy
-import json
-from tokenizers import Tokenizer
-
-def export_mock_tokenizer():
-    input_path = "20B_tokenizer_chinese.json"
-
-    tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
-
-    vocab = tokenizer["model"]["vocab"]
-    added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
-
-    for k, v in copy.deepcopy(vocab).items():
-        if v not in added_tokens:
-            vocab[str(v)] = v
-            vocab.pop(k)
-
-    out_path = input_path.replace(".json", ".mock.json")
-    with open(out_path, "w", encoding="utf-8") as f_out:
-        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
-
-
-def mock2():
-    pass
-
-
-def load_mock_tokenizer():
-    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
-    print('')
-
-export_mock_tokenizer()
-load_mock_tokenizer()
vocab/gpt_neox_chinese_v1/test_tokenizer.py
DELETED
@@ -1,43 +0,0 @@
-import json
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))
-
-def test_token():
-    """
-    :return:
-    """
-    text = " \t\n中国解决方法黑白侗鸩玥,。!"
-    # text = open("../../data_sample/EBKE20150806001_epub_30198917_30198917.txt", "r", encoding="utf-8").readline()
-    encoding = tokenizer.encode(text)
-    decoding = tokenizer.decode(encoding.ids)
-    print(decoding)
-    for word in text:
-        encoding = tokenizer.encode(word)
-        for token_id in encoding.ids:
-            decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-            token = tokenizer.id_to_token(token_id)
-            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_encode():
-    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉"
-    encoding = tokenizer.encode(text)
-    print(tokenizer.decode(encoding.ids))
-    for token_id in encoding.ids:
-        decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        token = tokenizer.id_to_token(token_id)
-        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_decode():
-    encoding = [30903, 20287, 20005, 52300, 25949, 30329, 50039, 31949, 25538,
-                34698, 18764, 5225, 53915, 163, 223]
-
-    decode_str = tokenizer.decode(encoding, skip_special_tokens=False)
-    print(decode_str)
-
-# test_token()
-test_encode()
-# test_decode()
-
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/to_v2/README.md
DELETED
@@ -1,3 +0,0 @@
-
-扩充词典到 v2
-
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py
DELETED
@@ -1,185 +0,0 @@
-
-
-
-
-import shutil
-import json
-from queue import Queue
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def insert_token(word, index):
-    pass
-
-# 不能删除的token。比如初始统计是低频的,可以删除,但是新增词典里包含的。
-
-
-def load_reserve_tokens(word_list, base_tokenizer):
-    data, base_tokenizer = base_tokenizer
-    reserved_token = set()
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        for i in range(0, len(encoding.ids)):
-            reserved_token.add("".join(tokens[:i+1]))
-    return reserved_token
-
-
-reserved_token = set()
-
-
-def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", word, json.dumps(tokens))
-        for i in range(1, len(encoding.ids)):
-            new_vocab.add("".join(tokens[:i+1]))
-            new_merges.add("".join(tokens[:i]) + " " + tokens[i])
-
-    # append to the end of vocab
-    # print("new_vocab size", len(new_vocab))
-    # print("new_merges size", len(new_merges))
-    if unused_ids == None:
-        for token in new_vocab:
-            vocab[token] = vocab_size
-            vocab_size += 1
-        merges += new_merges
-    else:
-        for iddx, token in enumerate(new_vocab):
-            # print(unused_ids.qsize())
-            unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-            if unused_token_id == 39468:
-                print("catch")
-            if unused_token_str in reserved_token:
-                print("skip unused token", unused_token_id, unused_token_str)
-                unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-
-            print("[%d]merging %s to unused %s %s" % (unused_ids.qsize(), json.dumps(token), unused_token_id, json.dumps(unused_token_str)) )
-            vocab[token] = unused_token_id
-            if unused_token_id != vocab.pop(unused_token_str):
-                print("ERROR")
-            # assert unused_token_id == vocab.pop(unused_token_str)
-            merges.remove(unused_merges)
-            # print(new_merges)
-        merges += new_merges
-
-    # print("共merge %d 个 token" % (len(new_vocab)))
-    # print(json.dumps(list(new_vocab)))
-
-
-    with open(output_tokenizer_path, "w", encoding="utf-8") as f_out:
-        json.dump(data, f_out, indent=2)
-
-    return data, base_tokenizer
-
-
-
-
-# data, base_tokenizer = load_base_tokenizer(output_tokenizer_path)
-# encoding = base_tokenizer.encode(word)
-# print(encoding.ids)
-
-
-def load_unused_id():
-    unused_ids = Queue(maxsize=0)
-    for line in open("word_count.corpus.remove.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = line_data["id"]
-        token_str = line_data["token"]
-        merges = line_data["merges"]
-        unused_ids.put((token_id, token_str, merges))
-    # for i in range(2000):
-    #     unused_ids.get()
-    return unused_ids
-
-
-def check_tokenize(base_tokenizer, word):
-    data, base_tokenizer = base_tokenizer
-    encodings = base_tokenizer.encode(word)
-    assert len(encodings.ids) == 1
-    assert base_tokenizer.decode(encodings.ids) == word
-
-
-def add_tokens():
-
-
-    unused_ids = load_unused_id()
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-
-
-    tokenizer_path = "../20B_tokenizer_chinese.json"
-    # tokenizer_path = "../../gpt_nexo_20b/20B_tokenizer.json"
-    base_tokenizer = load_base_tokenizer(tokenizer_path)
-    reserved_token.update(load_reserve_tokens(add_chars, base_tokenizer))
-
-    ## add chars
-    append_token(add_chars, base_tokenizer, "20B_tokenizer.1.json", unused_ids=unused_ids)
-    print(unused_ids.qsize())  # 22320
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-    append_token(add_words,
-                 new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-    #
-    # ## add words
-    # while unused_ids.qsize() != 22320:
-    #     unused_ids.get()
-    # assert unused_ids.qsize() == 22320
-    #
-    # shutil.copyfile("20B_tokenizer.1.json", "20B_tokenizer.2.json")
-    # while len(add_words) > 0:
-    #     new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    #     append_token([add_words.pop()],
-    #                  new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    #     # new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-
-def check_all_tokens():
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-    # add_chars = ['吳']
-    base_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    for k in add_chars:
-        check_tokenize(base_tokenizer, k)
-    for word in add_words:
-        # print(word)
-        check_tokenize(base_tokenizer, word)
-
-add_tokens()
-check_all_tokens()
-
-
-
vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py
DELETED
@@ -1,205 +0,0 @@
-"""
-获取超低频token,用于裁剪
-"""
-
-import copy
-import glob
-import json
-from collections import defaultdict
-
-
-def word_count():
-    from collections import Counter
-    from megatron.data.indexed_dataset import MMapIndexedDataset
-    counter = Counter()
-    for file_name in glob.glob("data/jd/*.bin"):
-        print(file_name)
-        file_name = file_name[:-4]
-        dataset = MMapIndexedDataset(file_name, skip_warmup=True)
-        for doc in dataset:
-            counter.update(doc)
-
-    f_out = open("word_count.txt", "w", encoding="utf-8")
-    for token_id, count in counter.most_common():
-        f_out.write("%d\t%d\n" % (token_id, count))
-
-
-def get_unused_id():
-    pass
-
-
-def print_word_count():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    merge_dict = {}
-
-    sorted_parts = []
-    for merge in merges:
-        idx = merge.find(" ")
-        token_str = merge[:idx] + merge[idx + 1:]
-        merge_dict[token_str] = (merge[:idx], merge[idx + 1:])
-        sorted_parts += [token_str, merge[:idx], merge[idx + 1:]]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    # 补充 sorted_parts,并排序
-    all_tokens = [line.strip().split("\t") for line in open("word_count.corpus.txt", "r", encoding="utf-8")]
-    raw_token_count = {int(token_id): int(count) for token_id, count in all_tokens}
-    sorted_parts = set(sorted_parts)
-    for token_id in raw_token_count:
-        if token_id in [35448, 40519]:
-            print("ddd")
-        token_str = id2vocab[token_id]
-        if token_str not in sorted_parts:
-            sorted_parts.add(token_str)
-            # print(token_id, token_str, json.dumps(token_str), raw_token_count[token_id], " not in parts")
-    sorted_parts = sorted(set(sorted_parts), key=lambda k: len(k), reverse=True)
-
-    # 重新计算merge的频率
-    # token_count = copy.deepcopy(raw_token_count)
-    token_count = defaultdict(int)
-    for token_str in sorted_parts:  # 从长到短 遍历 (否则要深度遍历,)
-        token_id = vocab[token_str]
-        if token_id in [35448, 40519]:
-            print("ddd")
-
-        count = raw_token_count.get(token_id, 0)
-        token_count[token_id] += count  # 原token 的词频
-        if token_str in merge_dict:
-            if vocab[merge_dict[token_str][0]] in [35448, 40519] or vocab[merge_dict[token_str][1]] in [35448, 40519]:
-                print("ddd")
-            token_count[vocab[merge_dict[token_str][0]]] += token_count[token_id]
-            token_count[vocab[merge_dict[token_str][1]]] += token_count[token_id]
-        else:
-            print(token_id, json.dumps(token_str))
-
-
-    # 重新排序 (按频率升序排列,相同频率按长度降序排列)
-    sorted_token_count = sorted(token_count.items(), key=lambda kv: (kv[1], -len(id2vocab[kv[0]])))
-    f_out = open("word_count.corpus.sort_by_count.jsonl", "w", encoding="utf-8")
-    for token_id, count in sorted_token_count:
-        # for token_str, count in token_count.items():
-        token_str = id2vocab[token_id]
-        # token_id = vocab[token_str]
-        decode_str = tokenizer.decode([token_id])  # 解码会失真
-        if token_str in merge_dict:
-            merges = " ".join(merge_dict[token_str])
-        else:
-            merges = "NULL"
-        f_out.write(json.dumps(
-            {"id": token_id, "token": token_str, "merges": merges, "raw_count": raw_token_count.get(token_id, 0),
-             "count": count, "decode_str": decode_str}) + "\n")
-
-
-def get_remove_words():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-    added_tokens = [token["id"] for token in data["added_tokens"]]
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    merge_dict = {k.replace(" ", "", 1): k for k in merges}
-
-    token_count = {}
-    for line in open("word_count.corpus.sort_by_count.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = int(line_data["id"])
-        count = int(line_data["count"])
-        token_count[token_id] = count
-
-    f_out = open("word_count.corpus.remove.jsonl", "w", encoding="utf-8")
-    remove_vocab_set = set()
-
-    # # 1. 去掉错误token
-    # error_tokens = [54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618, 54619, 54620, 54621, 54622,
-    #                 54623, 54624, 54625, 54626, 54627, 54628, 54629, 54630, 54631, 54632, 54633]
-    # for token_id in error_tokens:
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # 失真
-    #     remove_vocab_set.add(token_id)
-    #     f_out.write(json.dumps(
-    #         {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": 0,
-    #          "type": "error-char"}) + "\n")
-
-
-    # 2. 去掉超长token
-    # for token_id in range(tokenizer.get_vocab_size()):
-    #     if token_id in added_tokens:
-    #         continue
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # 也会失真,比如 54611 个token
-    #     decode_str = tokenizer.decode([token_id])  # decode会失真,比如 Ġ 会变成空格
-    #     if len(decode_str) > 8 and len(set(decode_str)) < 3:
-    #         if token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(token_id)
-    #         f_out.write(
-    #             json.dumps({"id": token_id, "token": token_str,
-    #                         "merges": merge_dict.get(token_str), "count": token_count.get(token_id, 0),
-    #                         "type": "按长度过滤"}, ensure_ascii=False) + "\n")
-    #
-    #         # 删除依赖,(否则会造成 merges中存在oov的token)
-    #         #
-    #         for merge in merges:
-    #             if token_str in merge:
-    #             # if token_str + " " in merge or " " + token_str in merge:
-    #                 parent_token_str = merge.replace(" ", "", 1)
-    #                 parent_token_id = vocab[parent_token_str]
-    #                 if parent_token_id in remove_vocab_set:
-    #                     continue
-    #                 remove_vocab_set.add(parent_token_id)
-    #                 f_out.write(
-    #                     json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                                 "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                                 "type": "按长度过滤-依赖删除"}, ensure_ascii=False) + "\n")
-
-    # 3. 去掉低频token
-    for token_id, count in list(token_count.items())[:25000]:
-        # token_id = 6460
-        if token_id in added_tokens:
-            continue
-        if token_id in remove_vocab_set:
-            continue
-
-        token_str = tokenizer.id_to_token(token_id)
-        # token_str = tokenizer.decode([int(token_id)])
-        if len(token_str.strip()) > 1:
-            remove_vocab_set.add(token_id)
-            f_out.write(json.dumps(
-                {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": count,
-                 "type": "remove by frequency"}) + "\n")
-
-    ######## 已经按频率排序的,就不需要删除依赖了
-    # # 删除依赖,(否则会造成 merges中存在oov的token)
-    # for merge in merges:
-    #     # if token_str + " " in merge or " " + token_str in merge:
-    #     if token_str in merge:
-    #         parent_token_str = merge.replace(" ", "", 1)
-    #         parent_token_id = vocab[parent_token_str]
-    #         if parent_token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(parent_token_id)
-    #         f_out.write(
-    #             json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                         "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                         "type": "按频率过滤-依赖删除"}, ensure_ascii=False) + "\n")
-
-    # remove 24969 tokens
-    print("remove %d tokens" % (len(remove_vocab_set)))
-
-
-def ss():
-    pass
-
-
-# word_count()
-# print_word_count()
-get_remove_words()
-
vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/oov.txt
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/to_v2/sort_test.py
DELETED
@@ -1,18 +0,0 @@
-
-
-
-a = {
-    "aa", 1,
-    "aaa", 1,
-    "aaaa", 1,
-    "aaaaaa", 1,
-    "aaaaaaa", 1,
-
-    "baa", 3,
-    "baaa", 2,
-    "baaaa", 2,
-    "baaaaaa", 2,
-    "baaaaaaa", 2,
-}
-
-sorted(a.items(), key=lambda kv:(kv[1], ))
vocab/gpt_neox_chinese_v1/to_v2/test2.py
DELETED
@@ -1,42 +0,0 @@
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def append_token(word_list, base_tokenizer, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        if "\u00e6\u00a5\u0143" in tokens:
-            print(word)
-
-add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-add_words = [token for token in add_tokens if len(token) > 1]
-new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-append_token(add_words, new_tokenizer)
vocab/gpt_neox_chinese_v1/to_v2/test_oov.py
DELETED
@@ -1,69 +0,0 @@
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-def get_oov():
-
-    f_out = open("oov.txt", "w", encoding="utf-8")
-    all_words = open("../../vocab.freq.zh.txt", "r", encoding="utf-8")
-    for line in all_words:
-        word, count = line.strip().split("\t")
-        if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
-            continue
-
-        encoding = tokenizer.encode(word)
-        if len(encoding.ids) > 1:
-            f_out.write(line)
-
-
-def build_vocab():
-    pass
-
-
-
-def convert_oov_to_merges():
-    """将词拆分成merge分组,必须是两个一组,
-    比如
-    承担 -> 承 担
-    天津市 -> 天津 市
-    社会保障 -> 社会 保障
-    的一部分 -> 的 一部分 -> 一 部分
-    """
-    all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
-    all_tokens = [token for token,count in all_tokens_and_counts if int(count) > 2]  # 至少3个词典中出现过
-    len1 = [token for token in all_tokens if len(token) == 1]
-    len2 = [token for token in all_tokens if len(token) == 2]
-    len3 = [token for token in all_tokens if len(token) == 3]
-    len4 = [token for token in all_tokens if len(token) == 4]
-    print(len(len1), len(len2), len(len3), len(len4))
-
-    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
-    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
-
-
-    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
-        for token in len1:
-            f_out.write(token + "\n")
-        for token in len2[:20000]:
-            f_out.write(token + "\n")
-            # f_out.write(token[0] + " " + token[1] + "\n")
-
-    # for token in len3:
-    #     idx = -1
-    #     for part in len2:
-    #         if part in token:
-    #             idx = token.find(part)
-    #             break
-    #     if idx == -1:
-    #         print("not found", token)
-    #     elif idx == 0:
-    #         f_out.write(token[0] + " " + token[1:] + "\n")
-    #     else:
-    #         f_out.write(token[:2] + " " + token[2] + "\n")
-
-
-
-
-
-get_oov()
-convert_oov_to_merges()
vocab/gpt_neox_chinese_v1/to_v2/test_queue.py
DELETED
@@ -1,20 +0,0 @@
-
-from queue import Queue
-
-q = Queue(maxsize=0)
-
-#写入队列数据
-q.put(0)
-q.put(1)
-q.put(2)
-
-#输出当前队列所有数据
-print(q.queue)
-#删除队列数据,并返回该数据
-q.get()
-#输也所有队列数据
-print(q.queue)
-
-for i in range(10):
-    print(q.get(), q.qsize())
-
vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/tokenizer/__init__.py
DELETED
@@ -1,16 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from .tokenizer import build_tokenizer
vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py
DELETED
@@ -1,368 +0,0 @@
-# Copyright (c) 2021, EleutherAI
-# This file is based on code by the authors denoted below and has been modified from its original version.
-#
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tokenization classes for OpenAI GPT."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import sys
-import json
-import logging
-import os
-import regex as re
-from io import open
-
-from functools import lru_cache
-
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    "gpt2": 1024,
-}
-
-VOCAB_NAME = "vocab.json"
-MERGES_NAME = "merges.txt"
-SPECIAL_TOKENS_NAME = "special_tokens.txt"
-
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [_chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class GPT2Tokenizer(object):
-    """
-    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level BPE
-    """
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
-    ):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(
-                pretrained_model_name_or_path, SPECIAL_TOKENS_NAME
-            )
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info(
-                    "loading special tokens file {}".format(special_tokens_file)
-                )
-        # redirect to the cache, if necessary
-        try:
-            from .file_utils import cached_path
-
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            logger.error(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                "at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    pretrained_model_name_or_path,
-                    vocab_file,
-                    merges_file,
-                )
-            )
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info(
-                "loading vocabulary file {} from cache at {}".format(
-                    vocab_file, resolved_vocab_file
-                )
-            )
-            logger.info(
-                "loading merges file {} from cache at {}".format(
-                    merges_file, resolved_merges_file
-                )
-            )
-        if (
-            pretrained_model_name_or_path
-            in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
-        ):
-            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
-                pretrained_model_name_or_path
-            ]
-            kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and "special_tokens" not in kwargs:
-            special_tokens = (
-                open(special_tokens_file, encoding="utf-8").read().split("\n")[:-1]
-            )
-        else:
-            special_tokens = kwargs.pop("special_tokens", [])
-        tokenizer = cls(
-            resolved_vocab_file,
-            resolved_merges_file,
-            special_tokens=special_tokens,
-            *inputs,
-            **kwargs
-        )
-        return tokenizer
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        special_tokens=None,
-        max_len=None,
-    ):
-        self.max_len = max_len if max_len is not None else int(1e12)
-        self.encoder = json.load(open(vocab_file))
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding="utf-8").read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for
-        # capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-        )
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """Add a list of additional tokens to the encoder.
-        The additional tokens are indexed starting from the last index of the
-        current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict(
-            (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)
-        )
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
-
-    @lru_cache(maxsize=131072)
-    def bpe(self, token):
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except BaseException:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        return word
-
-    def tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            if sys.version_info[0] == 2:
-                token = "".join(self.byte_encoder[ord(b)] for b in token)
-            else:
-                token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        if isinstance(tokens, str) or (
-            sys.version_info[0] == 2 and isinstance(tokens, unicode)
-        ):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(
-                    len(ids), self.max_len
-                )
-            )
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
-
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, tokens):
-        text = "".join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error(
-                "Vocabulary path ({}) should be a directory".format(vocab_path)
-            )
-            return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(
-                self.bpe_ranks.items(), key=lambda kv: kv[1]
-            ):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(
|
345 |
-
merge_file
|
346 |
-
)
|
347 |
-
)
|
348 |
-
index = token_index
|
349 |
-
writer.write(" ".join(bpe_tokens) + "\n")
|
350 |
-
index += 1
|
351 |
-
|
352 |
-
index = len(self.encoder)
|
353 |
-
with open(special_tokens_file, "w", encoding="utf-8") as writer:
|
354 |
-
for token, token_index in sorted(
|
355 |
-
self.special_tokens.items(), key=lambda kv: kv[1]
|
356 |
-
):
|
357 |
-
if index != token_index:
|
358 |
-
logger.warning(
|
359 |
-
"Saving special tokens vocabulary to {}: BPE indices are not consecutive."
|
360 |
-
" Please check that the tokenizer is not corrupted!".format(
|
361 |
-
special_tokens_file
|
362 |
-
)
|
363 |
-
)
|
364 |
-
index = token_index
|
365 |
-
writer.write(token + "\n")
|
366 |
-
index += 1
|
367 |
-
|
368 |
-
return vocab_file, merge_file, special_tokens_file
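
As a reading aid for the class deleted above, a minimal usage sketch follows. It assumes a GPT-2 style `vocab.json` / `merges.txt` pair on disk (placeholder paths, not files tracked in this repo) and only calls methods defined in the deleted code.

```python
import os
from gpt2_tokenization import GPT2Tokenizer  # the deleted module, if still importable locally

tok = GPT2Tokenizer("vocab.json", "merges.txt", special_tokens=["<my_special>"])
ids = tok.encode("hello world")   # tokenize() + convert_tokens_to_ids()
text = tok.decode(ids)            # byte-level BPE round-trips back to "hello world"

os.makedirs("saved_tokenizer", exist_ok=True)
tok.save_vocabulary("saved_tokenizer")  # writes the vocab, merges and special-tokens files
```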
vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py
DELETED
@@ -1,402 +0,0 @@
# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC
from abc import abstractmethod

from tokenizers import Tokenizer
from transformers import GPT2Tokenizer, GPT2TokenizerFast
import numpy as np
import sentencepiece as spm
from typing import List, Union
from .gpt2_tokenization import GPT2Tokenizer


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():
        assert args.vocab_file is not None
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type.lower() == "SPMTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = SentencePieceTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower():
        if args.vocab_file is None:
            print(
                "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
            )
        tokenizer = HFGPT2Tokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower():
        tokenizer = CharLevelTokenizer(vocab_size=512)
    elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = TiktokenTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad vocab size so it is divisible by model parallel size and
    still having GPU friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} " "tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} " "tokenizer".format(self.name)
        )


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = "GPT2 BPE"
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.eod_id = self.tokenizer.encoder["<|endoftext|>"]

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class SentencePieceTokenizer(AbstractTokenizer):
    """Designed to Integrate SP's Tokenizer."""

    def __init__(self, vocab_file):
        name = "SPM"
        super().__init__(name)

        self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
        self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_piece_size()

    @property
    def vocab(self):
        return {
            self.tokenizer.id_to_piece(idx): idx
            for idx in range(self.tokenizer.get_piece_size())
        }

    @property
    def inv_vocab(self):
        return {
            idx: self.tokenizer.id_to_piece(idx)
            for idx in range(self.tokenizer.get_piece_size())
        }

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFTokenizer(AbstractTokenizer):
    """Designed to Integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        name = "HFTokenizer"
        super().__init__(name)

        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFGPT2Tokenizer(AbstractTokenizer):
    """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""

    def __init__(self, vocab_file=None, fast=True):
        name = "HFGPT2Tokenizer"
        if fast:
            name += "Fast"
        super().__init__(name)
        if vocab_file is None:
            vocab_file = "gpt2"
        if fast:
            self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)

        self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
        self.eod_id = self.tokenizer.eos_token_id
        self.pad_id = self.tokenizer.pad_token_id

    @property
    def vocab_size(self):
        return len(self.tokenizer)

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer._tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, str):
            text_batch = [text_batch]
        return [self.tokenize(t) for t in text_batch]

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class CharLevelTokenizer(AbstractTokenizer):
    """Character Level Tokenizer"""

    def __init__(self, vocab_size):
        name = "CharLevelTokenizer"
        super().__init__(name)
        self._vocab_size = vocab_size
        self.eod_id = 0
        self.pad_id = 1

    def clamp(self, n):
        return max(32, min(n, self.vocab_size))

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def vocab(self):
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    def decode_token(self, token: int):
        return str(chr(self.clamp(token)))

    def tokenize(self, text: str):
        return list(np.fromstring(text, dtype=np.uint8))

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, list):
            return [self.tokenize(s) for s in text_batch]
        else:
            return self.tokenize(text_batch)

    def detokenize(self, token_ids):
        return "".join(list(map(self.decode_token, token_ids)))

    @property
    def eod(self):
        return self.eod_id


class TiktokenTokenizer(AbstractTokenizer):
    """Tokenizer from OpenAI's tiktoken implementation"""

    def __init__(self, vocab_file):
        try:
            import tiktoken
        except ModuleNotFoundError:
            print("Please install tiktoken: (https://github.com/openai/tiktoken)")
            raise Exception

        name = "TiktokenTokenizer"
        super().__init__(name)

        self.tokenizer = tiktoken.get_encoding(vocab_file)
        self.eod_id = self.tokenizer.eot_token
        self.pad_id = None

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    @property
    def vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access."
        )

    @property
    def inv_vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access. \
                To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
        )

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)  # , allowed_special="all")

    def tokenize_batch(self, text_batch: List[str]):
        return self.tokenizer.encode_batch(text_batch, allowed_special="all")

    def detokenize(self, token_ids):
        return self.tokenizer.decode(tokens=token_ids, errors="strict")

    @property
    def eod(self):
        return self.eod_id

    @property
    def pad(self):
        raise NotImplementedError
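
For reference, the deleted `TiktokenTokenizer` wrapper can be exercised as below. This is a minimal sketch, assuming `tiktoken` is installed; "cl100k_base" is a standard tiktoken encoding name used here purely as an illustrative `vocab_file` value.

```python
tok = TiktokenTokenizer("cl100k_base")   # internally: tiktoken.get_encoding("cl100k_base")
ids = tok.tokenize("hello world")
assert tok.detokenize(ids) == "hello world"
print(tok.vocab_size)   # == tok.tokenizer.n_vocab
print(tok.eod)          # id of tiktoken's end-of-text token (eot_token)
```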
vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py
DELETED
@@ -1,126 +0,0 @@
# Copyright (c) 2021, EleutherAI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Assumes a dataset of jsonl files in the same format as the neox training set.
"""

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from tokenizers.normalizers import NFKC

from glob import glob
import os
import json
import argparse


def load_jsonl(input_path, quiet=True) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line.rstrip("\n|\r")))
    if not quiet:
        print("Loaded {} records from {}".format(len(data), input_path))
    return data


def json_iterator(input_dir, text_key="text"):
    all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
    for j in all_jsonls:
        data = load_jsonl(j)
        for doc in data:
            yield doc[text_key]


def train_tokenizer(
    input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
):
    """
    Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
    )
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f"Tokenizer saved at {save_path}")


def parse_args():
    parser = argparse.ArgumentParser(
        description="script for training a multilingual "
        "HF tokenizer on CC dumps with upweighting for low resource languages"
    )
    parser.add_argument(
        "--json_input_dir",
        type=str,
        help="Path to folder containing tokenizer training data in jsonl format",
    )
    parser.add_argument(
        "--tokenizer_output_path",
        type=str,
        help="Path to which your trained tokenizer will be saved (should end in .json)",
    )
    parser.add_argument(
        "--tokenizer_type",
        type=str,
        help="type of tokenizer to train, currently only BPE is supported",
        choices=["BPE"],
        default=["BPE"],
    )
    parser.add_argument(
        "-v",
        "--vocab_size",
        help="vocabulary size of tokenizer, default=52k",
        type=int,
        default=52000,
    )
    return parser.parse_args()


if __name__ == "__main__":

    args = parse_args()

    train_tokenizer(
        args.json_input_dir,
        save_path=args.tokenizer_output_path,
        tokenizer_type=args.tokenizer_type,
        vocab_size=args.vocab_size,
    )
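
A hedged example of driving the deleted `train_tokenizer()` helper programmatically; the input directory and output path below are placeholders, not paths from this repo.

```python
# Trains a byte-level BPE vocab with NFKC normalization, as configured in the script above.
train_tokenizer(
    input_dir="data/my_jsonl_corpus",   # folder of *.jsonl files with a "text" field
    save_path="my_tokenizer.json",
    tokenizer_type="BPE",
    vocab_size=52000,
)
```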
vocab/gpt_neox_chinese_v1/trouble-shooting.md
DELETED
@@ -1,22 +0,0 @@

## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3


## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !

```
The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
```

Cause: tokens such as 50254 are not defined in the vocab itself; they are only defined in `added_tokens`.

## ss
vocab/moss/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 
 import os
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
vocab/text_davinci_003/__init__.py
CHANGED
@@ -1,70 +1,25 @@
-"""
-
-"""
-
-import tiktoken
-from tiktoken import Encoding
-from utils.log_util import logger
-
-tokenizer = tiktoken.encoding_for_model('text-davinci-003')
-tokenizer.vocab_size = tokenizer.n_vocab
-
-tokenizer.comments = ""
-tokenizer.reversible = True
-
-"""
-The default decode may raise errors; see decode_test.py for details.
-skip_special_tokens is kept for compatibility with hf_tokenizer.
-"""
-try:
-    decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-except:
-    decode_str = "null"
-return decode_str
-
-""
-
-"""
-try:
-    return tokenizer.decode_tokens_bytes(tokens)
-except:
-    # Why return None? See zh_util.py.
-    # 16 unused ids: 100256, 100261-100275
-    return [None for token in tokens]
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-            vocab[token_byte] = i
-
-    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-# tiktoken patch
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab
+"""
+,请
+
+## tiktoken API
+
+tokens = enc.encode("hello world")
+assert enc.decode(tokens) == "hello world"
+assert enc.decode_bytes(tokens) == b"hello world"
+assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"]
+
+decode_single_token_bytes
+"""
+
+import tiktoken
+import tokenizer.tiktoken_patch
+
+tokenizer = tiktoken.encoding_for_model('text-davinci-003')
+tokenizer.vocab_size = tokenizer.n_vocab
+
+tokenizer.comments = ""
+tokenizer.reversible = True
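
A hedged usage sketch of the refactored module: it assumes the repo root is on `PYTHONPATH` and that the monkey-patching removed from this file is now handled by the `tokenizer.tiktoken_patch` module imported above.

```python
from vocab.text_davinci_003 import tokenizer

ids = tokenizer.encode("hello world")
print(tokenizer.vocab_size)   # aliased to tokenizer.n_vocab in the new __init__.py
print(tokenizer.decode(ids))  # "hello world"
```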