xu-song committed on
Commit a1b0cd0
1 Parent(s): 3030d21

add more tokenizers

vocab/README.md CHANGED
@@ -1,4 +1,6 @@
 
+https://arxiv.org/abs/2308.16692 SpeechTokenizer
+
 For OpenAI models, English is 8-12x more token-efficient than Chinese.
 Previously, Chinese prompts over roughly 300 characters would make GPT-3.5 Turbo 16k invert its logic; after switching the prompt to English, the problem never reappeared.
 
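The efficiency gap mentioned above is easy to spot-check with tiktoken. A minimal sketch, assuming roughly parallel English/Chinese sentences (the sample strings are illustrative, not from the repo):

```python
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # resolves to cl100k_base

en = "Tokenization efficiency differs greatly between languages."
zh = "不同语言之间的分词效率差异很大。"

# English words often map to a single token, while a Chinese
# character frequently costs 1-3 tokens in cl100k_base.
print(len(enc.encode(en)), "tokens for", len(en), "English characters")
print(len(enc.encode(zh)), "tokens for", len(zh), "Chinese characters")
```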
vocab/__init__.py CHANGED
@@ -55,8 +55,6 @@ uniq_tokenizers = [
 all_tokenizers = [
     "gpt2",
     "gpt2_chinese",
-    "gpt_35_turbo",
-    "gpt_4",
 
     # bert family
     "bert_base_cased",
@@ -105,6 +103,10 @@ all_tokenizers = [
     "qwen_1_8b_chat",
     "qwen_7b_chat",
     "qwen_72b_chat",
+    "text_davinci_003",
+    "code_davinci_002",
+    "gpt_35_turbo",
+    "gpt_4",
 
     # uncategorized
     "skywork_13b_base",
@@ -116,6 +118,15 @@ all_tokenizers = [
     "flan_t5_base",
     "fastchat_t5_3b",
     "pko_t5_large",
+    "wizardcoder_15b_v1",
+    "wizardcoder_python_7b_v1",
+    "wizardlm_7b_v1",
+    "wizardmath_70b_v1",
+    "tigerbot_70b_chat_v4_4k",
+    "tigerbot_13b_chat_v2",
+    "deepseek_coder_33b_instruct",
+    "deepseek_llm_7b_base",
+
 
 
 ]
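Each registry entry presumably resolves to a `vocab.<name>` package whose `__init__.py` exposes a module-level `tokenizer`, as every file added below does. A hypothetical loader sketch (`load_tokenizer` is not a helper from this repo):

```python
import importlib

def load_tokenizer(name: str):
    # Hypothetical helper: map a registry name like "gpt_4" to the
    # `tokenizer` object defined in the matching vocab.<name> package.
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

# usage: load_tokenizer("deepseek_llm_7b_base").encode("hello")
```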
vocab/chatglm_6b/__init__.py CHANGED
@@ -6,15 +6,16 @@ import os
 import config
 from transformers import AutoTokenizer
 
-os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 
 
-if config.USE_REMOTE:
-    pass
-else:
-    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-    TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
-    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+
+# if config.USE_REMOTE:
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# else:
+#     os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+#     CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+#     TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
+#     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 
 # https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
 tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_text_tokens: {tokenizer.sp_tokenizer.num_text_tokens} "
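If the `config.USE_REMOTE` toggle were kept live rather than commented out, the module could look like the sketch below. This is an assumption about intent, not the committed code; the env var forces the pure-Python protobuf implementation, which is commonly needed when sentencepiece-based tokenizers hit protobuf descriptor errors:

```python
import os

import config
from transformers import AutoTokenizer

if config.USE_REMOTE:
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
else:
    # Force the pure-Python protobuf implementation before loading
    # the local sentencepiece-based tokenizer.
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
```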
vocab/code_davinci_002/__init__.py CHANGED
@@ -0,0 +1,3 @@
+
+
+from vocab.text_davinci_003 import tokenizer
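This re-export works because tiktoken maps both code-davinci-002 and text-davinci-003 to the same p50k_base encoding, so one patched tokenizer object serves both registry entries.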
vocab/deepseek_coder_33b_instruct/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""
+https://huggingface.co/spaces/deepseek-ai/deepseek-coder-7b-instruct
+"""
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct", trust_remote_code=True)
vocab/deepseek_llm_7b_base/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
vocab/gpt_35_turbo/__init__.py CHANGED
@@ -42,8 +42,6 @@ def get_vocab(self, token_type="str"):
     key_error_list = []
     unicode_decode_error_list = []
     for i in range(self.vocab_size):
-        if i == 100256:
-            print(i)
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             if token_byte is None:
vocab/text_davinci_003/__init__.py ADDED
@@ -0,0 +1,70 @@
+"""
+TODO
+"""
+
+import tiktoken
+from tiktoken import Encoding
+from utils.log_util import logger
+
+tokenizer = tiktoken.encoding_for_model('text-davinci-003')
+tokenizer.vocab_size = tokenizer.n_vocab
+
+tokenizer.comments = ""
+tokenizer.reversible = True
+
+
+
+
+def decode(self, tokens, errors="replace", skip_special_tokens=False):
+    """
+    The default decode may raise; see decode_test.py for details.
+    skip_special_tokens is kept for compatibility with hf_tokenizer.
+    """
+    try:
+        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
+    except Exception:
+        decode_str = "null"
+    return decode_str
+
+def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
+    """
+    Why doesn't tiktoken provide this method?
+    """
+    try:
+        return self.decode_tokens_bytes(tokens)
+    except Exception:
+        # Why return None? See zh_util.py.
+        # 16 unused ids: 100256, 100261-100275
+        return [None for token in tokens]
+
+def get_vocab(self, token_type="str"):
+    """Returns vocab as a dict
+    :param token_type: ["str", "byte"]
+    :return:
+    """
+    vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            if token_byte is None:
+                continue
+            # token_str = token_byte.decode("utf-8")
+            vocab[token_byte] = i
+
+        except UnicodeDecodeError:  # 773 UnicodeDecodeError
+            unicode_decode_error_list.append((i, str(token_byte)))
+            vocab[token_byte] = i
+
+    # vocab.update(self.added_tokens_encoder)
+    logger.info(f"text-davinci-003 {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
+    return vocab
+
+
+
+# tiktoken patch: graft hf-style methods onto tiktoken's Encoding
+Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab
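With these patches applied, the tiktoken Encoding can be driven through an hf-style interface. A minimal usage sketch (printed values are illustrative):

```python
from vocab.text_davinci_003 import tokenizer

ids = tokenizer.encode("hello world")          # plain tiktoken encode
print(tokenizer.convert_ids_to_tokens(ids))    # [b'hello', b' world']
print(tokenizer.decode(ids))                   # "hello world"

vocab = tokenizer.get_vocab()                  # {token bytes: id, ...}
print(len(vocab))
```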
vocab/tigerbot_13b_chat_v2/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-13b-chat-v2", trust_remote_code=True)
vocab/tigerbot_70b_chat_v4_4k/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-70b-chat-v4-4k", trust_remote_code=True)
vocab/wizardcoder_15b_v1/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
vocab/wizardcoder_python_7b_v1/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-Python-7B-V1.0", trust_remote_code=True)
vocab/wizardlm_7b_v1/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardLM-7B-V1.0", trust_remote_code=True)
vocab/wizardmath_70b_v1/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardMath-70B-V1.0", trust_remote_code=True)
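All of the AutoTokenizer-based modules added in this commit expose the same module-level `tokenizer`, so any of them can be smoke-tested the same way. A sketch (fetches the tokenizer files from the Hub on first use):

```python
from vocab.deepseek_llm_7b_base import tokenizer

ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.decode(ids))
```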