fix tiktoken special tokens
- tokenizer/tiktoken_patch.py +3 -0
- vocab/__init__.py +92 -91
- vocab/starchat_alpha/__init__.py +5 -0
tokenizer/tiktoken_patch.py
CHANGED
@@ -6,6 +6,8 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
     """
     The default decode may raise errors; see decode_test.py for details.
     skip_special_tokens is accepted for compatibility with hf_tokenizer.
+
+    What is the difference between errors=replace, ignore, and strict?
     """
     try:
         decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
@@ -58,6 +60,7 @@ def encode(self, *args, **kwargs):
     add_special_token is accepted for compatibility with hf_tokenizer.
     """
     kwargs.pop("add_special_tokens", None)
+    kwargs["allowed_special"] = "all"
     return self._encode(*args, **kwargs)
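The question added to the decode docstring comes down to standard bytes.decode behavior; a minimal illustration in plain Python (not part of the patch), using a deliberately truncated UTF-8 sequence:

    truncated = "中".encode("utf-8")[:2]                 # incomplete multi-byte sequence

    print(truncated.decode("utf-8", errors="replace"))   # substitutes U+FFFD ("�")
    print(truncated.decode("utf-8", errors="ignore"))    # silently drops the bad bytes -> ""
    try:
        truncated.decode("utf-8", errors="strict")       # the default: raise
    except UnicodeDecodeError as exc:
        print("strict raises:", exc)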
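The encode change works around tiktoken's default refusal to encode text that contains special-token strings. A hedged sketch of that behavior against the upstream tiktoken package (the encoding name here is illustrative, not taken from this repo):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "hello <|endoftext|>"

    try:
        enc.encode(text)                             # raises: special token not allowed by default
    except ValueError as exc:
        print("default encode failed:", exc)

    ids = enc.encode(text, allowed_special="all")    # the special token maps to its reserved id
    print(ids)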
vocab/__init__.py
CHANGED
@@ -15,11 +15,11 @@ tokenizer.type = TokenizerType.ByteBPE.name
 tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
 "HFGPT2Tokenizer", "HFTokenizer", "GPT2BPETokenizer", "CharLevelTokenizer", "TiktokenTokenizer", "SPMTokenizer", https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py

- - bert
+ - google/bert
    - characteristics
    - vocab: tokens starting with ## mark subwords
    - examples:
- - sentencepiece:
+ - google/sentencepiece:
    - characteristics:
    - training:
    - files: *.sp_model or *.model (optional file: .vocab)
@@ -28,10 +28,10 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.c
    - loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
    - methods: a SentencePieceProcessor object; sp_model.id_to_piece; has tokenizer.json and tokenizer.model
    - vocab: contains the character ▁ (U+2581), which marks a space or the start of a sentence
-   - examples: llama, baichuan, orion
+   - examples: google-t5, llama, baichuan, orion,
  - icetk: a fork of sentencepiece that supports an image_tokenizer
    - glm, chatglm1, chatglm2
- - tiktoken
+ - openai/tiktoken
 - hf_tokenizer
    - characteristics:
    - files: tokenizer.json (contains the content of the following two files), merges.txt, vocab.json
@@ -65,102 +65,103 @@ uniq_tokenizers = [
 ""
 ]

+# TODO: alias/abbr, hf_path, tokenizer_class, comments,
 all_tokenizers = [
-    "chatyuan_large_v2",
-    "prompt_clue",
-    # #### bloom family
-    "bloom",
-    # "bloomz_6b4_zh",
-    # "belle_7b_2m",  # model and vocab are both based on bloom
+    ##### bert family
+    ("bert_base_cased", "", ""),
+    ("bert_base_uncased","",),
+    ("bert_base_chinese",),
+    ("roberta_chinese_clue",),
+    ("kplug",),
+    ("gpt2_chinese",),
+
+    ##### GPT2Tokenizer
+    ("gpt2",),  #
+    ("moss",),
+    ("bloom",),
+    # ("bloomz_6b4_zh",
+    # ("belle_7b_2m",  # model and vocab are both based on bloom
     #
-    "gpt_nexo_20b",
-    "qwen1_5_14b_chat",
+    ("gpt_nexo_20b",),  # 50k
+    ("qwen1_5_14b_chat",),  # 150k, somewhat slow
+    ("starchat_alpha",),
+
+    ####### google/sentencepiece tokenizer:
+    # T5 llama internlm
+    ("t5_small",),
+    ("t5_base",),
+    ("t5_large",),
+    ("chatyuan_large_v2",),
+    ("prompt_clue",),
+
+    ("llama",),  # Chinese single-character tokens: 700, multi-character tokens: 0
+    ("llama2",),
+    ("chinese_llama",),  #
+    ("chinese_llama2",),  #
+    # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca models further fine-tune the Chinese LLaMA models above on instruction data
+    # ("belle_llama_ext_7b",
+    # ("alpaca_7b",
+    ("baichuan",),
+    ("baichuan2",),
+    ("internlm_chat_7b",),
+    ("internlm2_chat_7b",),
+    ("internlm2_math_7b",),
+    ("internlm_xcomposer_7b",),
+    ("falcon_7b",),
+    ("falcon_180b",),
     # "goat",

+    # ##### glm family
+    # "glm_chinese",),
+    ("chatglm_6b",),
+    ("chatglm2_6b",),
+    ("chatglm3_6b",),
+
     # tiktoken family
-    "qwen_1_8b_chat",
-    "qwen_7b_chat",
-    "qwen_72b_chat",
-    "text_davinci_003",
-    "code_davinci_002",
-    "gpt_35_turbo",
-    "gpt_4",
+    ("qwen_1_8b_chat",),
+    ("qwen_7b_chat",),
+    ("qwen_72b_chat",),
+    ("text_davinci_003",),
+    ("code_davinci_002",),
+    ("gpt_35_turbo",),
+    ("gpt_4",),

     # uncategorized
-    "skywork_13b_base",
-    "skywork_13b_math",
-    "mistral_7b",
-    "mixtral_8_7b",
-    "olmo_7b",
-    "aya_101",
+    ("skywork_13b_base",),
+    ("skywork_13b_math",),
+    ("mistral_7b",),
+    ("mixtral_8_7b",),
+
+    ("flan_t5_base",),
+    ("fastchat_t5_3b",),
+    ("pko_t5_large",),
+    ("wizardcoder_15b_v1",),
+    ("yi_6b",),
+    ("yi_34b",),
+    ("yi_vl34b",),
+    ("orion_14b_chat",),
+    ("phi_1",),
+    ("phi_2",),
+    ("solar_10_7b",),
+    ("mobilebert_uncased",),
+    ("mobilenet_v2",),
+    ("switch_c_2048",),
+    ("byt5_small",),
+    ("mt5_large",),
+    ("wizardcoder_python_7b_v1",),
+    ("wizardlm_7b_v1",),
+    ("wizardmath_70b_v1",),
+    ("tigerbot_70b_chat_v4_4k",),
+    ("tigerbot_13b_chat_v2",),
+    ("deepseek_coder_33b_instruct",),
+    ("deepseek_llm_7b_base",),
+    ("gemma_7b",),
+    ("olmo_7b",),
+    ("aya_101",),
 ]

+all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
 all_tokenizers = sorted(all_tokenizers)
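The list now stores each tokenizer as a tuple so extra columns (alias/abbr, hf_path, tokenizer_class, comments, per the TODO) can be attached later, and the final comprehension flattens it back to plain names before sorting. A minimal sketch of the pattern (entries chosen for illustration); note that the trailing comma is what makes a one-element tuple:

    entries = [
        ("bert_base_cased", "", ""),   # room for extra metadata columns
        ("bloom",),                    # one-element tuple: the trailing comma is required
        # ("bloom")                    # without the comma this is just a parenthesized str
    ]

    names = sorted(entry[0] for entry in entries)
    print(names)                       # ['bert_base_cased', 'bloom']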
vocab/starchat_alpha/__init__.py
ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/starchat-alpha")
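A quick way to exercise the new module, sketched here as an illustration (it assumes the tokenizer files can be fetched from the Hugging Face Hub on first use):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/starchat-alpha")

    ids = tokenizer.encode("def hello_world():")
    print(ids)                                    # token ids
    print(tokenizer.convert_ids_to_tokens(ids))   # the corresponding BPE pieces
    print(tokenizer.decode(ids))                  # round-trips back to the source text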