xu-song committed
Commit f0f84b2
1 Parent(s): f02dd94

add character glm

vocab/__init__.py CHANGED
@@ -19,32 +19,37 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name # https://github.c
  - Features
  - Vocabulary: tokens starting with ## denote subwords
  - Examples:
- - google/sentencepiece:
+ - bpe-google/sentencepiece:
  - Features:
  - Training:
- - Files: *.sp_model or *.model (optional file: .vocab)
+ - Files: *.sp_model or *.model (optional file: .vocab); spm for short
  - Implementation:
+ - Dependency: protobuf
  - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
  - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
  - Methods: the loaded object is a SentencePieceProcessor, sp_model.id_to_piece; ships with tokenizer.json and tokenizer.model,
+ - Tokenization:
+ - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
  - Vocabulary: entries contain ▁ (U+2581), which marks a space or the start of a sentence.
  - Examples: google-t5, llama, baichuan, orion,
  - icetk: a fork of sentencepiece that adds an image_tokenizer
  - glm, chatglm1, chatglm2
  - openai/tiktoken
- - hf_tokenizer
+ - bpe-hf_tokenizer
+ - ss
  - Features:
  - Files: tokenizer.json (contains the contents of the next two files), merges.txt, vocab.json
  - added_tokens are not necessarily present in the vocab.
  - Implementation:
- - Training:
+ - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
  - Loading:
- - Methods:
+ - Methods: .model.from_file .model.save .model.token_to_id .model.tokenize
  - .model is of type tokenizers.models.BPE
  - vocabulary entries start with Ġ "\u0120"
- - .model.from_file .model.save .model.token_to_id .model.tokenize
+ - Advantages
+ -
  - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
- - Vocabulary construction:
+ - Advantages: compared with sentencepiece, hf_tokenizer supports regex-based pre-tokenization and handles tabs and newlines better ()
  - ss
  - tiktoken
  - Features: a space is just a space,
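A minimal sketch of the sentencepiece training/loading calls referenced in the hunk above; corpus.txt and the "toy" model prefix are placeholder names, not files from this repo.

# Sketch only: train a small sentencepiece model, then load and inspect it.
import sentencepiece as spm

# training (the docs above also mention the spm_train CLI); model_type can be bpe/unigram/char/word
spm.SentencePieceTrainer.train(
    input="corpus.txt", model_prefix="toy", vocab_size=8000, model_type="bpe"
)

# loading the resulting *.model file
sp_model = spm.SentencePieceProcessor()
sp_model.Load("toy.model")
print(sp_model.encode("hello world", out_type=str))  # pieces such as ['▁hello', '▁world']
print(sp_model.id_to_piece(0))                       # piece string for id 0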
@@ -65,71 +70,72 @@ uniq_tokenizers = [
  ""
  ]

- # TODO: alias/abbr, hf_path, tokenizer_class, comments,
+ # TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
  all_tokenizers = [
  ##### bert series
- ("bert_base_cased", "", ""),
- ("bert_base_uncased","",),
- ("bert_base_chinese",),
- ("roberta_chinese_clue",),
+ ("bert_base_cased", "", "bert"),
+ ("bert_base_uncased", "", "bert"),
+ ("bert_base_chinese", "", "bert"),
+ ("roberta_chinese_clue", "", "bert"),
  ("kplug",),
  ("gpt2_chinese",),

  ##### GPT2Tokenizer
- ("gpt2",), #
- ("moss",),
- ("bloom",),
+ ("gpt2", "", "GPT2Tokenizer",), #
+ ("moss", "", "GPT2Tokenizer",),
+ ("bloom", "", "GPT2Tokenizer",),
  # ("bloomz_6b4_zh",
  # ("belle_7b_2m", # model and vocab are both based on bloom
  #
- ("gpt_nexo_20b",), # 50k vocab
- ("qwen1_5_14b_chat",), # 150k vocab, a bit slow
- ("starchat_alpha",),
+ ("gpt_nexo_20b", "", "GPT2Tokenizer",), # 50k vocab
+ ("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 150k vocab, a bit slow
+ ("starchat_alpha", "", "GPT2Tokenizer",),

  ####### google/sentencepiece tokenizer:
  # T5 llama internlm
- ("t5_small",),
- ("t5_base",),
- ("t5_large",),
- ("chatyuan_large_v2",),
- ("prompt_clue",),
-
- ("llama",), # 'single Chinese chars': 700, 'multi-char Chinese words': 0
- ("llama2",),
- ("chinese_llama",), #
- ("chinese_llama2",), #
+ ("t5_small", "", "sentencepiece"),
+ ("t5_base", "", "sentencepiece"),
+ ("t5_large", "", "sentencepiece"),
+ ("chatyuan_large_v2", "", "sentencepiece"),
+ ("prompt_clue", "", "sentencepiece"),
+
+ ("llama", "", "sentencepiece"), # 'single Chinese chars': 700, 'multi-char Chinese words': 0
+ ("llama2", "", "sentencepiece"),
+ ("chinese_llama", "", "sentencepiece"), #
+ ("chinese_llama2", "", "sentencepiece"), #
  # ("chinese_alpaca_lora_7b", # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data.
  # ("belle_llama_ext_7b",
  # ("alpaca_7b",
- ("baichuan",),
- ("baichuan2",),
- ("internlm_chat_7b",),
- ("internlm2_chat_7b",),
- ("internlm2_math_7b",),
- ("internlm_xcomposer_7b",),
- ("falcon_7b",),
- ("falcon_180b",),
+ ("baichuan", "", "sentencepiece"),
+ ("baichuan2", "", "sentencepiece"),
+ ("internlm_chat_7b", "", "sentencepiece"),
+ ("internlm2_chat_7b", "", "sentencepiece"),
+ ("internlm2_math_7b", "", "sentencepiece"),
+ ("internlm_xcomposer_7b", "", "sentencepiece"),
+ ("falcon_7b", "", "sentencepiece"),
+ ("falcon_180b", "", "sentencepiece"),
+ ("skywork_13b_base",),
+ ("skywork_13b_math",),
  # "goat",

  # ##### glm series
  # "glm_chinese",),
- ("chatglm_6b",),
- ("chatglm2_6b",),
- ("chatglm3_6b",),
-
+ ("chatglm_6b", "", "sentencepiece"),
+ ("chatglm2_6b", "", "sentencepiece"),
+ ("chatglm3_6b", "", "sentencepiece"),
+ ("character_glm_6b", "", "sentencepiece"),

  # tiktoken series
- ("qwen_1_8b_chat",),
- ("qwen_7b_chat",),
- ("qwen_72b_chat",),
- ("text_davinci_003",),
- ("code_davinci_002",),
- ("gpt_35_turbo",),
- ("gpt_4",),
+ ("qwen_1_8b_chat", "", "tiktoken"),
+ ("qwen_7b_chat", "", "tiktoken"),
+ ("qwen_72b_chat", "", "tiktoken"),
+ ("text_davinci_003", "", "tiktoken"),
+ ("code_davinci_002", "", "tiktoken"),
+ ("gpt_35_turbo", "", "tiktoken"),
+ ("gpt_4", "", "tiktoken"),

  # uncategorized
- ("skywork_13b_base",),
- ("skywork_13b_math",),
+
  ("mistral_7b",),
  ("mixtral_8_7b",),

@@ -205,15 +211,21 @@ class TokenizerType(Enum):


  class TokenizerImpl(Enum):
+
  """
+ https://github.com/google/sentencepiece, supports sentencepiece (BPE, unigram, char, word), wordpiece,
+ spm_train --model_type unigram/bpe/char/word
  """
- SentencePiece = auto() #
+ SentencePiece = auto()

  # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
  # vocabulary construction:
- #
- GPT2Tokenizer = auto()
- BertTokenizer = auto() #
+ # GPT2Tokenizer = auto()
+ # BertTokenizer = auto() #
+
+ """
+ """
+ HFTokenizer = auto() # https://github.com/huggingface/tokenizers, supports


  def load_tokener(model_name):
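A minimal sketch of the huggingface/tokenizers usage described in the notes above (a tokenizer.json wrapping a tokenizers.models.BPE model with Ġ-prefixed vocab entries); the tokenizer.json path is assumed to come from a GPT2-style checkpoint such as gpt2.

# Sketch only: load a serialized fast tokenizer and poke at its BPE model.
from tokenizers import Tokenizer

hf_tokenizer = Tokenizer.from_file("tokenizer.json")
print(type(hf_tokenizer.model))                   # tokenizers.models.BPE
print(hf_tokenizer.model.token_to_id("Ġworld"))   # id of a byte-level (Ġ-prefixed) token, if present
print(hf_tokenizer.encode("hello world").tokens)  # e.g. ['hello', 'Ġworld']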
 
19
  - 特征
20
  - 词典:有##开头的token,表示subword
21
  - 示例:
22
+ - bpe-google/sentencepiece:
23
  - 特征:
24
  - 训练:
25
+ - 文件: *.sp_model 或 *.model (可选文件 .vocab,) spm简称
26
  - 实现:
27
+ - 依赖: protobuf
28
  - 训练: `import sentencepiece as spm; spm.SentencePieceTrainer.train` 或 `spm_train`
29
  - 加载: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
30
  - 方法: 是SentencePieceProcessor类型,sp_model.id_to_piece,有tokenizer.json tokenizer.model,
31
+ - 分词:
32
+ - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
33
  - 词典: 词典字符有 ▁ (U+2581) ,表示空格或句首。
34
  - 示例:google-t5, llama,baichuan, orion,
35
  - icetk: sentencepiece的分支,支持image_tokenizer
36
  - glm, chatglm1, chatglm2
37
  - openai/tiktoken
38
+ - bpe-hf_tokenizer
39
+ - ss
40
  - 特征:
41
  - 文件: tokenizer.json(包含后两个文件的内容), merges.txt, vocab.json
42
  - added_tokens 在vocab中不一定存在。
43
  - 实现:
44
+ - 训练: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
45
  - 加载:
46
+ - 方法: .model.from_file .model.save .model.token_to_id .model.tokenize
47
  - .model 是 tokenizer.models.BPE 类型
48
  - 词典有 Ġ "\u0120" 开头
49
+ - 优势
50
+ -
51
  - 示例:gpt2, gpt_neox_20b, moss, bloom, qwen2
52
+ - 优势:相对sentence piece,hf_tokenizer支持pre-tokenization的正则表达式,对tab和换行支持更好 ()
53
  - ss
54
  - tiktoken
55
  - 特征:空格就是空格,
 
70
  ""
71
  ]
72
 
73
+ # TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
74
  all_tokenizers = [
75
  ##### bert 系列
76
+ ("bert_base_cased", "", "bert"),
77
+ ("bert_base_uncased", "", "bert"),
78
+ ("bert_base_chinese", "", "bert"),
79
+ ("roberta_chinese_clue", "", "bert"),
80
  ("kplug",),
81
  ("gpt2_chinese",),
82
 
83
  ##### GPT2Tokenizer
84
+ ("gpt2", "", "GPT2Tokenizer",), #
85
+ ("moss", "", "GPT2Tokenizer",),
86
+ ("bloom", "", "GPT2Tokenizer",),
87
  # ("bloomz_6b4_zh",
88
  # ("belle_7b_2m", # 模型和词典都基于bloom
89
  #
90
+ ("gpt_nexo_20b", "", "GPT2Tokenizer",), # 5万
91
+ ("qwen1_5_14b_chat", "", "GPT2Tokenizer",), # 15万,速度有点慢
92
+ ("starchat_alpha", "", "GPT2Tokenizer",),
93
 
94
  ####### google/sentencepiece tokenizer:
95
  # T5 llama internlm
96
+ ("t5_small", "", "sentencepiece"),
97
+ ("t5_base", "", "sentencepiece"),
98
+ ("t5_large", "", "sentencepiece"),
99
+ ("chatyuan_large_v2", "", "sentencepiece"),
100
+ ("prompt_clue", "", "sentencepiece"),
101
+
102
+ ("llama", "", "sentencepiece"), # '中文单字': 700, '中文多字': 0
103
+ ("llama2", "", "sentencepiece"),
104
+ ("chinese_llama", "", "sentencepiece"), #
105
+ ("chinese_llama2", "", "sentencepiece"), #
106
  # ("chinese_alpaca_lora_7b", # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
107
  # ("belle_llama_ext_7b",
108
  # ("alpaca_7b",
109
+ ("baichuan", "", "sentencepiece"),
110
+ ("baichuan2", "", "sentencepiece"),
111
+ ("internlm_chat_7b", "", "sentencepiece"),
112
+ ("internlm2_chat_7b", "", "sentencepiece"),
113
+ ("internlm2_math_7b", "", "sentencepiece"),
114
+ ("internlm_xcomposer_7b", "", "sentencepiece"),
115
+ ("falcon_7b", "", "sentencepiece"),
116
+ ("falcon_180b", "", "sentencepiece"),
117
+ ("skywork_13b_base",),
118
+ ("skywork_13b_math",),
119
  # "goat",
120
 
121
  # ##### glm系列
122
  # "glm_chinese",),
123
+ ("chatglm_6b", "", "sentencepiece"),
124
+ ("chatglm2_6b", "", "sentencepiece"),
125
+ ("chatglm3_6b", "", "sentencepiece"),
126
+ ("character_glm_6b", "", "sentencepiece"),
127
 
128
  # tiktoken 系列
129
+ ("qwen_1_8b_chat", "", "tiktoken"),
130
+ ("qwen_7b_chat", "", "tiktoken"),
131
+ ("qwen_72b_chat", "", "tiktoken"),
132
+ ("text_davinci_003", "", "tiktoken"),
133
+ ("code_davinci_002", "", "tiktoken"),
134
+ ("gpt_35_turbo", "", "tiktoken"),
135
+ ("gpt_4", "", "tiktoken"),
136
 
137
  # 未分类
138
+
 
139
  ("mistral_7b",),
140
  ("mixtral_8_7b",),
141
 
 
211
 
212
 
213
  class TokenizerImpl(Enum):
214
+
215
  """
216
+ https://github.com/google/sentencepiece,支持 sentencepiece(BPE,unigram,char,word), wordpiece,
217
+ spm_train --model_type unigram/bpe/char/word
218
  """
219
+ SentencePiece = auto()
220
 
221
  # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
222
  # 构造词典:
223
+ # GPT2Tokenizer = auto()
224
+ # BertTokenizer = auto() #
225
+
226
+ """
227
+ """
228
+ HFTokenizer = auto() # https://github.com/huggingface/tokenizers, 支持
229
 
230
 
231
  def load_tokener(model_name):
vocab/character_glm_6b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
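A short usage sketch for the tokenizer object created in the new file; the sample text is arbitrary and the printed output is illustrative, not taken from an actual run.

# Sketch only: tokenize a sample string with the CharacterGLM-6B tokenizer loaded above.
tokens = tokenizer.tokenize("你好,世界")
ids = tokenizer.encode("你好,世界")
print(tokens, ids)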