add character glm
- vocab/__init__.py  +65 -53
- vocab/character_glm_6b/__init__.py  +4 -0
vocab/__init__.py
CHANGED
@@ -19,32 +19,37 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.c
   - Features
   - Vocab: tokens starting with ## denote subwords
   - Examples:
-- google/sentencepiece:
+- bpe-google/sentencepiece:
   - Features:
   - Training:
-  - Files: *.sp_model or *.model (optional .vocab file)
+  - Files: *.sp_model or *.model (optional .vocab file), "spm" for short
   - Implementation:
+    - Dependency: protobuf
     - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
     - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
     - Methods: the object is of type SentencePieceProcessor (e.g. sp_model.id_to_piece); ships tokenizer.json and tokenizer.model
+  - Tokenization:
+    - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
   - Vocab: contains the character ▁ (U+2581), which marks a space or the start of a sentence.
   - Examples: google-t5, llama, baichuan, orion,
   - icetk: a fork of sentencepiece that supports an image_tokenizer
     - glm, chatglm1, chatglm2
   - openai/tiktoken
-- hf_tokenizer
+- bpe-hf_tokenizer
+  - ss
   - Features:
   - Files: tokenizer.json (contains the contents of the next two), merges.txt, vocab.json
     - added_tokens are not necessarily present in the vocab.
   - Implementation:
-  - Training:
+  - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
   - Loading:
-  - Methods:
+  - Methods: .model.from_file, .model.save, .model.token_to_id, .model.tokenize
     - .model is of type tokenizers.models.BPE
   - Vocab has tokens starting with Ġ ("\u0120")
-  -
+  - Advantages
+  -
   - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
-  -
+  - Advantages: compared with sentencepiece, hf_tokenizer supports regex pre-tokenization and handles tabs and newlines better ()
 - ss
 - tiktoken
   - Features: a space is just a space,
@@ -65,71 +70,72 @@ uniq_tokenizers = [
     ""
 ]

-# TODO: alias/abbr, hf_path, tokenizer_class, comments,
+# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
 all_tokenizers = [
     ##### bert family
-    ("bert_base_cased", "", ""),
-    ("bert_base_uncased","",),
-    ("bert_base_chinese",),
-    ("roberta_chinese_clue",),
+    ("bert_base_cased", "", "bert"),
+    ("bert_base_uncased", "", "bert"),
+    ("bert_base_chinese", "", "bert"),
+    ("roberta_chinese_clue", "", "bert"),
     ("kplug",),
     ("gpt2_chinese",),

     ##### GPT2Tokenizer
-    ("gpt2",),
-    ("moss",),
-    ("bloom",),
+    ("gpt2", "", "GPT2Tokenizer",),  #
+    ("moss", "", "GPT2Tokenizer",),
+    ("bloom", "", "GPT2Tokenizer",),
     # ("bloomz_6b4_zh",
     # ("belle_7b_2m",  # model and vocab are both based on bloom
     #
-    ("gpt_nexo_20b",),
-    ("qwen1_5_14b_chat",),  # 150k vocab, a bit slow
-    ("starchat_alpha",),
+    ("gpt_nexo_20b", "", "GPT2Tokenizer",),  # 50k vocab
+    ("qwen1_5_14b_chat", "", "GPT2Tokenizer",),  # 150k vocab, a bit slow
+    ("starchat_alpha", "", "GPT2Tokenizer",),

     ####### google/sentencepiece tokenizer:
     # T5 llama internlm
-    ("t5_small",),
-    ("t5_base",),
-    ("t5_large",),
-    ("chatyuan_large_v2",),
-    ("prompt_clue",),
-
-    ("llama",),  # 700 single-character Chinese tokens, 0 multi-character Chinese tokens
-    ("llama2",),
-    ("chinese_llama",),  #
-    ("chinese_llama2",),  #
+    ("t5_small", "", "sentencepiece"),
+    ("t5_base", "", "sentencepiece"),
+    ("t5_large", "", "sentencepiece"),
+    ("chatyuan_large_v2", "", "sentencepiece"),
+    ("prompt_clue", "", "sentencepiece"),
+
+    ("llama", "", "sentencepiece"),  # 700 single-character Chinese tokens, 0 multi-character Chinese tokens
+    ("llama2", "", "sentencepiece"),
+    ("chinese_llama", "", "sentencepiece"),  #
+    ("chinese_llama2", "", "sentencepiece"),  #
     # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca model further instruction-tunes the Chinese LLaMA model above
     # ("belle_llama_ext_7b",
     # ("alpaca_7b",
-    ("baichuan",),
-    ("baichuan2",),
-    ("internlm_chat_7b",),
-    ("internlm2_chat_7b",),
-    ("internlm2_math_7b",),
-    ("internlm_xcomposer_7b",),
-    ("falcon_7b",),
-    ("falcon_180b",),
+    ("baichuan", "", "sentencepiece"),
+    ("baichuan2", "", "sentencepiece"),
+    ("internlm_chat_7b", "", "sentencepiece"),
+    ("internlm2_chat_7b", "", "sentencepiece"),
+    ("internlm2_math_7b", "", "sentencepiece"),
+    ("internlm_xcomposer_7b", "", "sentencepiece"),
+    ("falcon_7b", "", "sentencepiece"),
+    ("falcon_180b", "", "sentencepiece"),
+    ("skywork_13b_base",),
+    ("skywork_13b_math",),
     # "goat",

     # ##### glm family
     # "glm_chinese",),
-    ("chatglm_6b",),
-    ("chatglm2_6b",),
-    ("chatglm3_6b",),
-
+    ("chatglm_6b", "", "sentencepiece"),
+    ("chatglm2_6b", "", "sentencepiece"),
+    ("chatglm3_6b", "", "sentencepiece"),
+    ("character_glm_6b", "", "sentencepiece"),

     # tiktoken family
-    ("qwen_1_8b_chat",),
-    ("qwen_7b_chat",),
-    ("qwen_72b_chat",),
-    ("text_davinci_003",),
-    ("code_davinci_002",),
-    ("gpt_35_turbo",),
-    ("gpt_4",),
+    ("qwen_1_8b_chat", "", "tiktoken"),
+    ("qwen_7b_chat", "", "tiktoken"),
+    ("qwen_72b_chat", "", "tiktoken"),
+    ("text_davinci_003", "", "tiktoken"),
+    ("code_davinci_002", "", "tiktoken"),
+    ("gpt_35_turbo", "", "tiktoken"),
+    ("gpt_4", "", "tiktoken"),

     # uncategorized
-
-    ("skywork_13b_math",),
+
     ("mistral_7b",),
     ("mixtral_8_7b",),

@@ -205,15 +211,21 @@ class TokenizerType(Enum):


 class TokenizerImpl(Enum):
+
     """
+    https://github.com/google/sentencepiece, supports sentencepiece (BPE, unigram, char, word) and wordpiece,
+    spm_train --model_type unigram/bpe/char/word
     """
-    SentencePiece = auto()
+    SentencePiece = auto()

     # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
     # vocab construction:
-    #
-
-
+    # GPT2Tokenizer = auto()
+    # BertTokenizer = auto()  #
+
+    """
+    """
+    HFTokenizer = auto()  # https://github.com/huggingface/tokenizers, supports


 def load_tokener(model_name):
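For reference, a minimal sketch (not part of this diff) of the two loading paths the docstring above describes; the file paths are placeholders, not files from this PR.

# Sketch: loading a google/sentencepiece model vs a huggingface/tokenizers model.
import sentencepiece as spm
from tokenizers import Tokenizer

# google/sentencepiece: load a *.model / *.sp_model file.
sp_model = spm.SentencePieceProcessor()
sp_model.Load("tokenizer.model")          # placeholder path
print(sp_model.id_to_piece(0))            # map an id back to its piece

# huggingface/tokenizers: load a tokenizer.json (covers merges.txt + vocab.json).
hf_tokenizer = Tokenizer.from_file("tokenizer.json")   # placeholder path
print(hf_tokenizer.get_vocab_size())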
vocab/character_glm_6b/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
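For reference, a hedged usage sketch (not part of this diff) of the tokenizer exposed by the new module; it assumes the Hugging Face download succeeds and that trust_remote_code=True is acceptable, as in the added file.

# Sketch: exercising the module-level tokenizer defined in vocab/character_glm_6b/__init__.py.
from vocab.character_glm_6b import tokenizer

text = "你好,CharacterGLM"
ids = tokenizer.encode(text)
print(len(tokenizer), ids)                      # vocab size and token ids
print(tokenizer.convert_ids_to_tokens(ids))     # inspect the sentencepiece pieces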