fix tiktoken
- examples.py +22 -5
- tokenizer/tiktoken_patch.py +69 -0
- util.py +1 -0
- vocab/__init__.py +1 -1
- vocab/gpt_35_turbo/__init__.py +2 -69
- vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json +0 -0
- vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json +0 -0
- vocab/gpt_neox_chinese_v1/README.md +0 -64
- vocab/gpt_neox_chinese_v1/__init__.py +0 -14
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py +0 -61
- vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py +0 -50
- vocab/gpt_neox_chinese_v1/mock.py +0 -32
- vocab/gpt_neox_chinese_v1/test_tokenizer.py +0 -43
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/README.md +0 -3
- vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py +0 -185
- vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py +0 -205
- vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/oov.txt +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/sort_test.py +0 -18
- vocab/gpt_neox_chinese_v1/to_v2/test2.py +0 -42
- vocab/gpt_neox_chinese_v1/to_v2/test_oov.py +0 -69
- vocab/gpt_neox_chinese_v1/to_v2/test_queue.py +0 -20
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl +0 -0
- vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt +0 -0
- vocab/gpt_neox_chinese_v1/tokenizer/__init__.py +0 -16
- vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py +0 -368
- vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py +0 -402
- vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py +0 -126
- vocab/gpt_neox_chinese_v1/trouble-shooting.md +0 -22
- vocab/moss/__init__.py +1 -1
- vocab/text_davinci_003/__init__.py +14 -59
examples.py
CHANGED
@@ -1,12 +1,30 @@
+"""
+
+## characters
+
+- alphanumeric characters
+- numeric characters
+- special characters: A special character is a character that is not an alphabetic or numeric character.
+- ASCII control characters
+- punctuation marks
+- accent marks
+- 数学符号
+- whitespace:
+  - https://en.wikipedia.org/wiki/Whitespace_character
+  - https://emptycharacter.com/
+
+
+https://www.computerhope.com/jargon/s/specchar.htm
+"""
+
 examples = {
     "en": [
-        ["
+        ["number: (10086 + 98) = 100184", "llama", "bloom"],
+        ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
        # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
-
-        ]
-    ,
+    ],
     "zh": [
         ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
         ["标点测试:,。!?;", "baichuan_7b", "llama"],
@@ -14,7 +32,6 @@ examples = {
         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
     ]
-
 }
 
 more_examples = [
tokenizer/tiktoken_patch.py
ADDED
@@ -0,0 +1,69 @@
+
+from tiktoken import Encoding
+from utils.log_util import logger
+
+def decode(self, tokens, errors="replace", skip_special_tokens=False):
+    """
+    默认的decode,可能会报错,详见 decode_test.py
+    skip_special_tokens 是为了兼容 hf_tokenizer
+    """
+    try:
+        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
+    except:
+        decode_str = "null"
+    return decode_str
+
+
+def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
+    """
+    为什么没有这个方法?
+    """
+    try:
+        return self.decode_tokens_bytes(tokens)
+    except Exception as e:
+        # 什么要返回None?见zh_util.py
+        # 16个空闲id, 100256 100261-100275
+        logger.error(e)
+        return [None for _ in tokens]
+
+
+def get_vocab(self, token_type="str"):
+    """Returns vocab as a dict
+    :param token_type: ["str", "byte"]
+    :return:
+    """
+    vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            if token_byte is None:
+                continue
+            # token_str = token_byte.decode("utf-8")
+            vocab[token_byte] = i
+
+        except UnicodeDecodeError:  # 773 UnicodeDecodeError
+            unicode_decode_error_list.append((i, str(token_byte)))
+            vocab[token_byte] = i
+
+    # vocab.update(self.added_tokens_encoder)
+    logger.info(f"{self.name} {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"{self.name} {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
+    return vocab
+
+
+def encode(self, *args, **kwargs):
+    """
+    add_special_token 是为了兼容 hf_tokenizer
+    """
+    kwargs.pop("add_special_tokens", None)
+    return self._encode(*args, **kwargs)
+
+
+# tiktoken patch
+Encoding._encode = Encoding.encode
+Encoding.encode = encode
+Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab
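The new module fixes tiktoken by monkey-patching `tiktoken.Encoding` at import time: `encode` learns to ignore the HF-style `add_special_tokens` kwarg, `decode` returns "null" instead of raising, and `convert_ids_to_tokens` / `get_vocab` are added for compatibility with HF tokenizers. A minimal usage sketch, assuming the repo root is on the Python path so that `tokenizer.tiktoken_patch` and `utils.log_util` import (the sample text is illustrative):

```
import tiktoken
import tokenizer.tiktoken_patch  # side-effect import: rebinds methods on tiktoken.Encoding

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
ids = enc.encode("hello world", add_special_tokens=False)  # extra kwarg is now dropped
print(enc.convert_ids_to_tokens(ids))  # token bytes, or None for unused ids
print(enc.decode(ids))
```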
util.py
CHANGED
@@ -52,6 +52,7 @@ def tokenize(text, tokenizer_type, color_num=5):
         # continue
 
         # ⭐
+        # TODO: gpt3.5_turbo错误: 只有id和text是对的,token和 utf8都是错的。说明 convert_ids_to_tokens 出错了。
         table.append(
             {"TokenID": token_id,
              "Token": token_str,  # utf-8解码后的字符串,为什么有些是 <0xE7>,表示什么?比如llama
vocab/__init__.py
CHANGED
@@ -85,7 +85,7 @@ all_tokenizers = [
     # "gpt_neox_chinese_v1",
     #
     # ##### glm系列
-    "glm_chinese",
+    # "glm_chinese",
     "chatglm_6b",
     "chatglm2_6b",
     "chatglm3_6b",
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -1,10 +1,9 @@
 """
-
+
 """
 
 import tiktoken
-
-from utils.log_util import logger
+import tokenizer.tiktoken_patch
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.vocab_size = tokenizer.n_vocab
@@ -12,69 +11,3 @@ tokenizer.vocab_size = tokenizer.n_vocab
 tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
 tokenizer.reversible = True  # It's reversible and lossless, so you can convert tokens back into the original text
 
-
-def decode(self, tokens, errors="replace", skip_special_tokens=False):
-    """
-    默认的decode,可能会报错,详见 decode_test.py
-    skip_special_tokens 是为了兼容 hf_tokenizer
-    """
-    try:
-        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-    except:
-        decode_str = "null"
-    return decode_str
-
-
-def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
-    """
-    为什么没有这个方法?
-    """
-    try:
-        return self.decode_tokens_bytes(tokens)
-    except Exception as e:
-        # 什么要返回None?见zh_util.py
-        # 16个空闲id, 100256 100261-100275
-        logger.error(e)
-        return [None for _ in tokens]
-
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-        except UnicodeDecodeError:  # 773 UnicodeDecodeError
-            unicode_decode_error_list.append((i, str(token_byte)))
-            vocab[token_byte] = i
-
-    # vocab.update(self.added_tokens_encoder)
-    logger.info(f"gpt_35_turbo {len(key_error_list)} KeyError: {key_error_list}")
-    logger.info(f"gpt_35_turbo {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-
-def encode(self, *args, **kwargs):
-    """
-    add_special_token 是为了兼容 hf_tokenizer
-    """
-    kwargs.pop("add_special_tokens", None)
-    return self._encode(*args, **kwargs)
-
-
-# tiktoken patch
-Encoding._encode = Encoding.encode
-Encoding.encode = encode
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab
vocab/gpt_neox_chinese_v1/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/20B_tokenizer_chinese.mock.json
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/README.md
DELETED
@@ -1,64 +0,0 @@
-
-
-```
-added vocab (size: 54634) with 22 dummy tokens (new size: 54656)
-Vocab size: 54634
-
-训练数据
-```
-
-
-https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
-
-
-## 20B
-
-[configs/20B.yml](https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml#L7)
-```
-"vocab-file": "./20B_checkpoints/20B_tokenizer.json",
-```
-
-Vocab size: 50277
-self.padded_vocab_size = 50304
-
-
-padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
-
-## 词典
-
-见 convert_vocab_to_txt.py
-
-```
-{"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"} 中
-
-# 多个符号拼接在一起的
-{"id": 13663, "token": ".*]{}", "token_decode": ".*]{}"} .*]{}
-
-# ss
-
-```
-
-
-## 中文支持
-
-基本没有OOV。
-
-gpt-neox是在800G英文数据集上训练的,为啥词典支持中文?因为是byte-level BPE
-
-```
-丁 [3218, 212]
-七 [3218, 214]
-万 [3218, 218]
-诀 [11894, 211]
-证 [11894, 212]
-```
-
-
-编码长度统计: Counter({2: 4190, 3: 1295, 1: 285})
-平均编码长度: 2.1750433275563257
-
-
-## ss
-
-
-
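The deleted README above attributes the Chinese coverage to byte-level BPE: although the GPT-NeoX vocab was trained on English data, every character can still be spelled out as 2-3 byte-level tokens (e.g. 丁 → [3218, 212]). A small sketch of how that per-character statistic could be reproduced with the tokenizers library, assuming a local copy of the 20B tokenizer JSON (the path and character sample below are assumptions):

```
# Sketch only: count how many byte-level ids each Chinese character needs.
from collections import Counter
from tokenizers import Tokenizer

tok = Tokenizer.from_file("20B_tokenizer.json")  # assumed local path
lengths = Counter()
for ch in "丁七万诀证":
    ids = tok.encode(ch).ids
    lengths[len(ids)] += 1
    print(ch, ids)
print(lengths)  # the README reports Counter({2: 4190, 3: 1295, 1: 285}) over its full character set
```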
vocab/gpt_neox_chinese_v1/__init__.py
DELETED
@@ -1,14 +0,0 @@
-
-import os
-from tokenizers import Tokenizer
-
-
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "20B_tokenizer_chinese.json")
-
-tokenizer = Tokenizer.from_file(TOKENIZER_DIR)
-
-tokenizer.vocab_size = tokenizer.get_vocab_size(with_added_tokens=True)
-
-# vocab_size = len(tokenizer.get_vocab())
-# vocab_size = tokenizer.vocab_size
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese.py
DELETED
@@ -1,61 +0,0 @@
-"""
-merge 是干嘛的?
-
-## 结果
-
-共merge 4357 个 token
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(vocab_path):
-    data = json.load(open(vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer("../gpt_nexo_20b/20B_tokenizer.json")
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
-以下采用方式一。
-"""
-new_added_tokens = {}
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # 3个的,怎么处理?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        new_added_tokens[word] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-with open("20B_tokenizer_chinese.json", "w", encoding="utf-8") as f_out:
-    json.dump(data, f_out, indent=2)
-
-## check
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-all_error_ids = []
-for word, idx in new_added_tokens.items():
-    decode_str = tokenizer.decode([idx])
-    if word != decode_str:
-        all_error_ids.append(idx)
-        print(idx, word, decode_str)
-
-print(all_error_ids)
-
vocab/gpt_neox_chinese_v1/build_tokenizer_chinese_2.py
DELETED
@@ -1,50 +0,0 @@
-"""
-merge 是干嘛的?
-
-## 结果
-
-共merge 4357 个 token
-"""
-
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer():
-    old_vocab_path = "../gpt_nexo_20b/20B_tokenizer.json"
-    data = json.load(open(old_vocab_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(old_vocab_path)
-    print("vocab_size with added_tokens:", )
-    return data, tokenizer
-
-data, base_tokenizer = load_base_tokenizer()
-vocab = data["model"]["vocab"]
-merges = data["model"]["merges"]
-vocab_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
-
-
-"""
-方式一:原有的added_tokens保持id不变。方式二:原有的added_tokens进行id移位。
-以下采用方式一。
-"""
-new_added_tokens = set()
-for word in jd_vocab_tokens + list(zh_punc):
-    if len(word) > 1 or word in new_added_tokens:
-        continue
-    encoding = base_tokenizer.encode(word)
-    # if len(encoding.ids) > 1:
-    if len(encoding.ids) == 2:  # 3个的,怎么处理?
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        print("merging", vocab_size, word, json.dumps(tokens))
-        vocab["".join(tokens)] = vocab_size
-        vocab_size += 1
-        merges.append(" ".join(tokens))
-        new_added_tokens.add(word)
-
-
-print("共merge %d 个 token" % (len(new_added_tokens)))
-
-f_out = open("20B_tokenizer_chinese_2.json", "w", encoding="utf-8")
-
-json.dump(data, f_out, indent=2)
vocab/gpt_neox_chinese_v1/mock.py
DELETED
@@ -1,32 +0,0 @@
-import copy
-import json
-from tokenizers import Tokenizer
-
-def export_mock_tokenizer():
-    input_path = "20B_tokenizer_chinese.json"
-
-    tokenizer = json.load(open(input_path, "r", encoding="utf-8"))
-
-    vocab = tokenizer["model"]["vocab"]
-    added_tokens = [token["id"] for token in tokenizer["added_tokens"]]
-
-    for k, v in copy.deepcopy(vocab).items():
-        if v not in added_tokens:
-            vocab[str(v)] = v
-            vocab.pop(k)
-
-    out_path = input_path.replace(".json", ".mock.json")
-    with open(out_path, "w", encoding="utf-8") as f_out:
-        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
-
-
-def mock2():
-    pass
-
-
-def load_mock_tokenizer():
-    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
-    print('')
-
-export_mock_tokenizer()
-load_mock_tokenizer()
vocab/gpt_neox_chinese_v1/test_tokenizer.py
DELETED
@@ -1,43 +0,0 @@
-import json
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.json")
-print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))
-
-def test_token():
-    """
-    :return:
-    """
-    text = " \t\n中国解决方法黑白侗鸩玥,。!"
-    # text = open("../../data_sample/EBKE20150806001_epub_30198917_30198917.txt", "r", encoding="utf-8").readline()
-    encoding = tokenizer.encode(text)
-    decoding = tokenizer.decode(encoding.ids)
-    print(decoding)
-    for word in text:
-        encoding = tokenizer.encode(word)
-        for token_id in encoding.ids:
-            decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-            token = tokenizer.id_to_token(token_id)
-            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_encode():
-    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉"
-    encoding = tokenizer.encode(text)
-    print(tokenizer.decode(encoding.ids))
-    for token_id in encoding.ids:
-        decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �,对应 "\ufffd"
-        token = tokenizer.id_to_token(token_id)
-        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-
-def test_decode():
-    encoding = [30903, 20287, 20005, 52300, 25949, 30329, 50039, 31949, 25538,
-                34698, 18764, 5225, 53915, 163, 223]
-
-    decode_str = tokenizer.decode(encoding, skip_special_tokens=False)
-    print(decode_str)
-
-# test_token()
-test_encode()
-# test_decode()
-
vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.append.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.insert.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.1.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.2.json
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/20B_tokenizer.tmp.json
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/to_v2/README.md
DELETED
@@ -1,3 +0,0 @@
-
-扩充词典到 v2
-
vocab/gpt_neox_chinese_v1/to_v2/add_token_utils.py
DELETED
@@ -1,185 +0,0 @@
-
-
-
-
-import shutil
-import json
-from queue import Queue
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def insert_token(word, index):
-    pass
-
-# 不能删除的token。比如初始统计是低频的,可以删除,但是新增词典里包含的。
-
-
-def load_reserve_tokens(word_list, base_tokenizer):
-    data, base_tokenizer = base_tokenizer
-    reserved_token = set()
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        for i in range(0, len(encoding.ids)):
-            reserved_token.add("".join(tokens[:i+1]))
-    return reserved_token
-
-
-reserved_token = set()
-
-
-def append_token(word_list, base_tokenizer, output_tokenizer_path, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        # print("merging", word, json.dumps(tokens))
-        for i in range(1, len(encoding.ids)):
-            new_vocab.add("".join(tokens[:i+1]))
-            new_merges.add("".join(tokens[:i]) + " " + tokens[i])
-
-    # append to the end of vocab
-    # print("new_vocab size", len(new_vocab))
-    # print("new_merges size", len(new_merges))
-    if unused_ids == None:
-        for token in new_vocab:
-            vocab[token] = vocab_size
-            vocab_size += 1
-        merges += new_merges
-    else:
-        for iddx, token in enumerate(new_vocab):
-            # print(unused_ids.qsize())
-            unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-            if unused_token_id == 39468:
-                print("catch")
-            if unused_token_str in reserved_token:
-                print("skip unused token", unused_token_id, unused_token_str)
-                unused_token_id, unused_token_str, unused_merges = unused_ids.get()
-
-            print("[%d]merging %s to unused %s %s" % (unused_ids.qsize(), json.dumps(token), unused_token_id, json.dumps(unused_token_str)) )
-            vocab[token] = unused_token_id
-            if unused_token_id != vocab.pop(unused_token_str):
-                print("ERROR")
-            # assert unused_token_id == vocab.pop(unused_token_str)
-            merges.remove(unused_merges)
-            # print(new_merges)
-        merges += new_merges
-
-    # print("共merge %d 个 token" % (len(new_vocab)))
-    # print(json.dumps(list(new_vocab)))
-
-
-    with open(output_tokenizer_path, "w", encoding="utf-8") as f_out:
-        json.dump(data, f_out, indent=2)
-
-    return data, base_tokenizer
-
-
-
-
-# data, base_tokenizer = load_base_tokenizer(output_tokenizer_path)
-# encoding = base_tokenizer.encode(word)
-# print(encoding.ids)
-
-
-def load_unused_id():
-    unused_ids = Queue(maxsize=0)
-    for line in open("word_count.corpus.remove.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = line_data["id"]
-        token_str = line_data["token"]
-        merges = line_data["merges"]
-        unused_ids.put((token_id, token_str, merges))
-    # for i in range(2000):
-    #     unused_ids.get()
-    return unused_ids
-
-
-def check_tokenize(base_tokenizer, word):
-    data, base_tokenizer = base_tokenizer
-    encodings = base_tokenizer.encode(word)
-    assert len(encodings.ids) == 1
-    assert base_tokenizer.decode(encodings.ids) == word
-
-
-def add_tokens():
-
-
-    unused_ids = load_unused_id()
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-
-
-    tokenizer_path = "../20B_tokenizer_chinese.json"
-    # tokenizer_path = "../../gpt_nexo_20b/20B_tokenizer.json"
-    base_tokenizer = load_base_tokenizer(tokenizer_path)
-    reserved_token.update(load_reserve_tokens(add_chars, base_tokenizer))
-
-    ## add chars
-    append_token(add_chars, base_tokenizer, "20B_tokenizer.1.json", unused_ids=unused_ids)
-    print(unused_ids.qsize())  # 22320
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-    append_token(add_words,
-                 new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-    #
-    # ## add words
-    # while unused_ids.qsize() != 22320:
-    #     unused_ids.get()
-    # assert unused_ids.qsize() == 22320
-    #
-    # shutil.copyfile("20B_tokenizer.1.json", "20B_tokenizer.2.json")
-    # while len(add_words) > 0:
-    #     new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    #     append_token([add_words.pop()],
-    #                  new_tokenizer, "20B_tokenizer.2.json", unused_ids=unused_ids)
-    #     # new_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-
-
-def check_all_tokens():
-    add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-    add_chars = [char for token in add_tokens for char in token]
-    add_chars = list(set(add_chars))
-    add_words = [token for token in add_tokens if len(token) > 1]
-    # add_chars = ['吳']
-    base_tokenizer = load_base_tokenizer("20B_tokenizer.2.json")
-    for k in add_chars:
-        check_tokenize(base_tokenizer, k)
-    for word in add_words:
-        # print(word)
-        check_tokenize(base_tokenizer, word)
-
-add_tokens()
-check_all_tokens()
-
-
-
vocab/gpt_neox_chinese_v1/to_v2/get_unused_id.py
DELETED
@@ -1,205 +0,0 @@
-"""
-获取超低频token,用于裁剪
-"""
-
-import copy
-import glob
-import json
-from collections import defaultdict
-
-
-def word_count():
-    from collections import Counter
-    from megatron.data.indexed_dataset import MMapIndexedDataset
-    counter = Counter()
-    for file_name in glob.glob("data/jd/*.bin"):
-        print(file_name)
-        file_name = file_name[:-4]
-        dataset = MMapIndexedDataset(file_name, skip_warmup=True)
-        for doc in dataset:
-            counter.update(doc)
-
-    f_out = open("word_count.txt", "w", encoding="utf-8")
-    for token_id, count in counter.most_common():
-        f_out.write("%d\t%d\n" % (token_id, count))
-
-
-def get_unused_id():
-    pass
-
-
-def print_word_count():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    merge_dict = {}
-
-    sorted_parts = []
-    for merge in merges:
-        idx = merge.find(" ")
-        token_str = merge[:idx] + merge[idx + 1:]
-        merge_dict[token_str] = (merge[:idx], merge[idx + 1:])
-        sorted_parts += [token_str, merge[:idx], merge[idx + 1:]]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    # 补充 sorted_parts,并排序
-    all_tokens = [line.strip().split("\t") for line in open("word_count.corpus.txt", "r", encoding="utf-8")]
-    raw_token_count = {int(token_id): int(count) for token_id, count in all_tokens}
-    sorted_parts = set(sorted_parts)
-    for token_id in raw_token_count:
-        if token_id in [35448, 40519]:
-            print("ddd")
-        token_str = id2vocab[token_id]
-        if token_str not in sorted_parts:
-            sorted_parts.add(token_str)
-            # print(token_id, token_str, json.dumps(token_str), raw_token_count[token_id], " not in parts")
-    sorted_parts = sorted(set(sorted_parts), key=lambda k: len(k), reverse=True)
-
-    # 重新计算merge的频率
-    # token_count = copy.deepcopy(raw_token_count)
-    token_count = defaultdict(int)
-    for token_str in sorted_parts:  # 从长到短 遍历 (否则要深度遍历,)
-        token_id = vocab[token_str]
-        if token_id in [35448, 40519]:
-            print("ddd")
-
-        count = raw_token_count.get(token_id, 0)
-        token_count[token_id] += count  # 原token 的词频
-        if token_str in merge_dict:
-            if vocab[merge_dict[token_str][0]] in [35448, 40519] or vocab[merge_dict[token_str][1]] in [35448, 40519]:
-                print("ddd")
-            token_count[vocab[merge_dict[token_str][0]]] += token_count[token_id]
-            token_count[vocab[merge_dict[token_str][1]]] += token_count[token_id]
-        else:
-            print(token_id, json.dumps(token_str))
-
-
-    # 重新排序 (按频率升序排列,相同频率按长度降序排列)
-    sorted_token_count = sorted(token_count.items(), key=lambda kv: (kv[1], -len(id2vocab[kv[0]])))
-    f_out = open("word_count.corpus.sort_by_count.jsonl", "w", encoding="utf-8")
-    for token_id, count in sorted_token_count:
-        # for token_str, count in token_count.items():
-        token_str = id2vocab[token_id]
-        # token_id = vocab[token_str]
-        decode_str = tokenizer.decode([token_id])  # 解码会失真
-        if token_str in merge_dict:
-            merges = " ".join(merge_dict[token_str])
-        else:
-            merges = "NULL"
-        f_out.write(json.dumps(
-            {"id": token_id, "token": token_str, "merges": merges, "raw_count": raw_token_count.get(token_id, 0),
-             "count": count, "decode_str": decode_str}) + "\n")
-
-
-def get_remove_words():
-    from tokenizers import Tokenizer
-    tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-    data = json.load(open("../20B_tokenizer_chinese.json", "r", encoding="utf-8"))
-    added_tokens = [token["id"] for token in data["added_tokens"]]
-
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    id2vocab = {idx: token for token, idx in vocab.items()}
-
-    merge_dict = {k.replace(" ", "", 1): k for k in merges}
-
-    token_count = {}
-    for line in open("word_count.corpus.sort_by_count.jsonl", "r", encoding="utf-8"):
-        line_data = json.loads(line)
-        token_id = int(line_data["id"])
-        count = int(line_data["count"])
-        token_count[token_id] = count
-
-    f_out = open("word_count.corpus.remove.jsonl", "w", encoding="utf-8")
-    remove_vocab_set = set()
-
-    # # 1. 去掉错误token
-    # error_tokens = [54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618, 54619, 54620, 54621, 54622,
-    #                 54623, 54624, 54625, 54626, 54627, 54628, 54629, 54630, 54631, 54632, 54633]
-    # for token_id in error_tokens:
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # 失真
-    #     remove_vocab_set.add(token_id)
-    #     f_out.write(json.dumps(
-    #         {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": 0,
-    #          "type": "error-char"}) + "\n")
-
-
-    # 2. 去掉超长token
-    # for token_id in range(tokenizer.get_vocab_size()):
-    #     if token_id in added_tokens:
-    #         continue
-    #     token_str = id2vocab[token_id]
-    #     # token_str = tokenizer.id_to_token(token_id)  # 也会失真,比如 54611 个token
-    #     decode_str = tokenizer.decode([token_id])  # decode会失真,比如 Ġ 会变成空格
-    #     if len(decode_str) > 8 and len(set(decode_str)) < 3:
-    #         if token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(token_id)
-    #         f_out.write(
-    #             json.dumps({"id": token_id, "token": token_str,
-    #                         "merges": merge_dict.get(token_str), "count": token_count.get(token_id, 0),
-    #                         "type": "按长度过滤"}, ensure_ascii=False) + "\n")
-    #
-    #         # 删除依赖,(否则会造成 merges中存在oov的token)
-    #         #
-    #         for merge in merges:
-    #             if token_str in merge:
-    #             # if token_str + " " in merge or " " + token_str in merge:
-    #                 parent_token_str = merge.replace(" ", "", 1)
-    #                 parent_token_id = vocab[parent_token_str]
-    #                 if parent_token_id in remove_vocab_set:
-    #                     continue
-    #                 remove_vocab_set.add(parent_token_id)
-    #                 f_out.write(
-    #                     json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                                 "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                                 "type": "按长度过滤-依赖删除"}, ensure_ascii=False) + "\n")
-
-    # 3. 去掉低频token
-    for token_id, count in list(token_count.items())[:25000]:
-        # token_id = 6460
-        if token_id in added_tokens:
-            continue
-        if token_id in remove_vocab_set:
-            continue
-
-        token_str = tokenizer.id_to_token(token_id)
-        # token_str = tokenizer.decode([int(token_id)])
-        if len(token_str.strip()) > 1:
-            remove_vocab_set.add(token_id)
-            f_out.write(json.dumps(
-                {"id": token_id, "token": token_str, "merges": merge_dict.get(token_str), "count": count,
-                 "type": "remove by frequency"}) + "\n")
-
-    ######## 已经按频率排序的,就不需要删除依赖了
-    # # 删除依赖,(否则会造成 merges中存在oov的token)
-    # for merge in merges:
-    #     # if token_str + " " in merge or " " + token_str in merge:
-    #     if token_str in merge:
-    #         parent_token_str = merge.replace(" ", "", 1)
-    #         parent_token_id = vocab[parent_token_str]
-    #         if parent_token_id in remove_vocab_set:
-    #             continue
-    #         remove_vocab_set.add(parent_token_id)
-    #         f_out.write(
-    #             json.dumps({"id": parent_token_id, "token": parent_token_str,
-    #                         "merges": merge, "count": token_count.get(parent_token_id, 0),
-    #                         "type": "按频率过滤-依赖删除"}, ensure_ascii=False) + "\n")
-
-    # remove 24969 tokens
-    print("remove %d tokens" % (len(remove_vocab_set)))
-
-
-def ss():
-    pass
-
-
-# word_count()
-# print_word_count()
-get_remove_words()
-
vocab/gpt_neox_chinese_v1/to_v2/oov.add.txt
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/oov.txt
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/to_v2/sort_test.py
DELETED
@@ -1,18 +0,0 @@
-
-
-
-a = {
-    "aa", 1,
-    "aaa", 1,
-    "aaaa", 1,
-    "aaaaaa", 1,
-    "aaaaaaa", 1,
-
-    "baa", 3,
-    "baaa", 2,
-    "baaaa", 2,
-    "baaaaaa", 2,
-    "baaaaaaa", 2,
-}
-
-sorted(a.items(), key=lambda kv:(kv[1], ))
vocab/gpt_neox_chinese_v1/to_v2/test2.py
DELETED
@@ -1,42 +0,0 @@
-import json
-from tokenizers import Tokenizer
-from data_sample.oov_base import jd_vocab_tokens
-from zhon.hanzi import punctuation as zh_punc
-
-def load_base_tokenizer(tokenizer_path):
-    print("loading", tokenizer_path)
-    data = json.load(open(tokenizer_path, "r", encoding="utf-8"))
-    tokenizer = Tokenizer.from_file(tokenizer_path)
-    print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
-    return data, tokenizer
-
-
-def append_token(word_list, base_tokenizer, unused_ids=None):
-    """
-    append token to the end of vocab
-    """
-    new_vocab = set()
-    new_merges = set()
-
-    data, base_tokenizer = base_tokenizer
-    vocab = data["model"]["vocab"]
-    merges = data["model"]["merges"]
-    vocab_size = base_tokenizer.basic_count(with_added_tokens=True)
-
-    for word in word_list:
-        encoding = base_tokenizer.encode(word)
-        if len(encoding.ids) == 1:
-            continue
-
-        if len(encoding.ids) >= 4:
-            print("[ERROR]: encoding不能超过4", word, encoding)
-
-        tokens = [base_tokenizer.id_to_token(token_id) for token_id in encoding.ids]
-        if "\u00e6\u00a5\u0143" in tokens:
-            print(word)
-
-add_tokens = [line.strip() for line in open("oov.add.txt", "r", encoding="utf-8")]
-add_words = [token for token in add_tokens if len(token) > 1]
-new_tokenizer = load_base_tokenizer("20B_tokenizer.1.json")
-
-append_token(add_words, new_tokenizer)
vocab/gpt_neox_chinese_v1/to_v2/test_oov.py
DELETED
@@ -1,69 +0,0 @@
-from tokenizers import Tokenizer
-
-tokenizer = Tokenizer.from_file("../20B_tokenizer_chinese.json")
-
-def get_oov():
-
-    f_out = open("oov.txt", "w", encoding="utf-8")
-    all_words = open("../../vocab.freq.zh.txt", "r", encoding="utf-8")
-    for line in all_words:
-        word, count = line.strip().split("\t")
-        if "�" in word or word in ["之长", "个好", "亿亿", "余个", "聊了", "与该", "多花"]:
-            continue
-
-        encoding = tokenizer.encode(word)
-        if len(encoding.ids) > 1:
-            f_out.write(line)
-
-
-def build_vocab():
-    pass
-
-
-
-def convert_oov_to_merges():
-    """将词拆分成merge分组,必须是两个一组,
-    比如
-    承担 -> 承 担
-    天津市 -> 天津 市
-    社会保障 -> 社会 保障
-    的一部分 -> 的 一部分 -> 一 部分
-    """
-    all_tokens_and_counts = [line.strip().split("\t") for line in open("oov.txt", "r", encoding="utf-8")]
-    all_tokens = [token for token,count in all_tokens_and_counts if int(count) > 2]  # 至少3个词典中出现过
-    len1 = [token for token in all_tokens if len(token) == 1]
-    len2 = [token for token in all_tokens if len(token) == 2]
-    len3 = [token for token in all_tokens if len(token) == 3]
-    len4 = [token for token in all_tokens if len(token) == 4]
-    print(len(len1), len(len2), len(len3), len(len4))
-
-    # vocab = set(["天津", "社会", "保障", "部分", "一部分", "需要", "数据", "使用", "我们", "一个",] + len2)
-    # vocab = set(["天津", "社会", "保障", "部分", "需要", "数据", "使用", "我们", "一个"] + len2)
-
-
-    with open("oov.add.txt", "w", encoding="utf-8") as f_out:
-        for token in len1:
-            f_out.write(token + "\n")
-        for token in len2[:20000]:
-            f_out.write(token + "\n")
-            # f_out.write(token[0] + " " + token[1] + "\n")
-
-    # for token in len3:
-    #     idx = -1
-    #     for part in len2:
-    #         if part in token:
-    #             idx = token.find(part)
-    #             break
-    #     if idx == -1:
-    #         print("not found", token)
-    #     elif idx == 0:
-    #         f_out.write(token[0] + " " + token[1:] + "\n")
-    #     else:
-    #         f_out.write(token[:2] + " " + token[2] + "\n")
-
-
-
-
-
-get_oov()
-convert_oov_to_merges()
vocab/gpt_neox_chinese_v1/to_v2/test_queue.py
DELETED
@@ -1,20 +0,0 @@
-
-from queue import Queue
-
-q = Queue(maxsize=0)
-
-#写入队列数据
-q.put(0)
-q.put(1)
-q.put(2)
-
-#输出当前队列所有数据
-print(q.queue)
-#删除队列数据,并返回该数据
-q.get()
-#输也所有队列数据
-print(q.queue)
-
-for i in range(10):
-    print(q.get(), q.qsize())
-
vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.remove.jsonl
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.sort_by_count.jsonl
DELETED
The diff for this file is too large to render. See raw diff.

vocab/gpt_neox_chinese_v1/to_v2/word_count.corpus.txt
DELETED
The diff for this file is too large to render. See raw diff.
vocab/gpt_neox_chinese_v1/tokenizer/__init__.py
DELETED
@@ -1,16 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from .tokenizer import build_tokenizer
vocab/gpt_neox_chinese_v1/tokenizer/gpt2_tokenization.py
DELETED
@@ -1,368 +0,0 @@
-# Copyright (c) 2021, EleutherAI
-# This file is based on code by the authors denoted below and has been modified from its original version.
-#
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tokenization classes for OpenAI GPT."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import sys
-import json
-import logging
-import os
-import regex as re
-from io import open
-
-from functools import lru_cache
-
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    "gpt2": 1024,
-}
-
-VOCAB_NAME = "vocab.json"
-MERGES_NAME = "merges.txt"
-SPECIAL_TOKENS_NAME = "special_tokens.txt"
-
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [_chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class GPT2Tokenizer(object):
-    """
-    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level BPE
-    """
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
-    ):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(
-                pretrained_model_name_or_path, SPECIAL_TOKENS_NAME
-            )
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info(
-                    "loading special tokens file {}".format(special_tokens_file)
-                )
-        # redirect to the cache, if necessary
-        try:
-            from .file_utils import cached_path
-
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            logger.error(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                "at this path or url.".format(
-                    pretrained_model_name_or_path,
-                    ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    pretrained_model_name_or_path,
-                    vocab_file,
-                    merges_file,
-                )
-            )
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info(
-                "loading vocabulary file {} from cache at {}".format(
-                    vocab_file, resolved_vocab_file
-                )
-            )
-            logger.info(
-                "loading merges file {} from cache at {}".format(
-                    merges_file, resolved_merges_file
-                )
-            )
-        if (
-            pretrained_model_name_or_path
-            in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
-        ):
-            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
-                pretrained_model_name_or_path
-            ]
-            kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and "special_tokens" not in kwargs:
-            special_tokens = (
-                open(special_tokens_file, encoding="utf-8").read().split("\n")[:-1]
-            )
-        else:
-            special_tokens = kwargs.pop("special_tokens", [])
-        tokenizer = cls(
-            resolved_vocab_file,
-            resolved_merges_file,
-            special_tokens=special_tokens,
-            *inputs,
-            **kwargs
-        )
-        return tokenizer
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        special_tokens=None,
-        max_len=None,
-    ):
-        self.max_len = max_len if max_len is not None else int(1e12)
-        self.encoder = json.load(open(vocab_file))
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding="utf-8").read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for
-        # capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-        )
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """Add a list of additional tokens to the encoder.
-        The additional tokens are indexed starting from the last index of the
-        current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict(
-            (tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)
-        )
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
-
-    @lru_cache(maxsize=131072)
-    def bpe(self, token):
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except BaseException:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        return word
-
-    def tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            if sys.version_info[0] == 2:
-                token = "".join(self.byte_encoder[ord(b)] for b in token)
-            else:
-                token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        if isinstance(tokens, str) or (
-            sys.version_info[0] == 2 and isinstance(tokens, unicode)
-        ):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(
-                    len(ids), self.max_len
-                )
-            )
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
-
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, tokens):
-        text = "".join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error(
-                "Vocabulary path ({}) should be a directory".format(vocab_path)
-            )
-            return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(
-                self.bpe_ranks.items(), key=lambda kv: kv[1]
-            ):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(
|
345 |
-
merge_file
|
346 |
-
)
|
347 |
-
)
|
348 |
-
index = token_index
|
349 |
-
writer.write(" ".join(bpe_tokens) + "\n")
|
350 |
-
index += 1
|
351 |
-
|
352 |
-
index = len(self.encoder)
|
353 |
-
with open(special_tokens_file, "w", encoding="utf-8") as writer:
|
354 |
-
for token, token_index in sorted(
|
355 |
-
self.special_tokens.items(), key=lambda kv: kv[1]
|
356 |
-
):
|
357 |
-
if index != token_index:
|
358 |
-
logger.warning(
|
359 |
-
"Saving special tokens vocabulary to {}: BPE indices are not consecutive."
|
360 |
-
" Please check that the tokenizer is not corrupted!".format(
|
361 |
-
special_tokens_file
|
362 |
-
)
|
363 |
-
)
|
364 |
-
index = token_index
|
365 |
-
writer.write(token + "\n")
|
366 |
-
index += 1
|
367 |
-
|
368 |
-
return vocab_file, merge_file, special_tokens_file
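
As a reading aid for the class deleted above, a minimal usage sketch follows. It assumes a GPT-2 style `vocab.json` / `merges.txt` pair on disk (placeholder paths, not files tracked in this repo) and only calls methods defined in the deleted code.

```python
import os
from gpt2_tokenization import GPT2Tokenizer  # the deleted module, if still importable locally

tok = GPT2Tokenizer("vocab.json", "merges.txt", special_tokens=["<my_special>"])
ids = tok.encode("hello world")   # tokenize() + convert_tokens_to_ids()
text = tok.decode(ids)            # byte-level BPE round-trips back to "hello world"

os.makedirs("saved_tokenizer", exist_ok=True)
tok.save_vocabulary("saved_tokenizer")  # writes the vocab, merges and special-tokens files
```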
vocab/gpt_neox_chinese_v1/tokenizer/tokenizer.py
DELETED
@@ -1,402 +0,0 @@
# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC
from abc import abstractmethod

from tokenizers import Tokenizer
from transformers import GPT2Tokenizer, GPT2TokenizerFast
import numpy as np
import sentencepiece as spm
from typing import List, Union
from .gpt2_tokenization import GPT2Tokenizer


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():
        assert args.vocab_file is not None
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type.lower() == "SPMTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = SentencePieceTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower():
        if args.vocab_file is None:
            print(
                "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
            )
        tokenizer = HFGPT2Tokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower():
        tokenizer = CharLevelTokenizer(vocab_size=512)
    elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = TiktokenTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad vocab size so it is divisible by model parallel size and
    still having GPU friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} " "tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} " "tokenizer".format(self.name)
        )


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = "GPT2 BPE"
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(
            vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
        )
        self.eod_id = self.tokenizer.encoder["<|endoftext|>"]

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class SentencePieceTokenizer(AbstractTokenizer):
    """Designed to Integrate SP's Tokenizer."""

    def __init__(self, vocab_file):
        name = "SPM"
        super().__init__(name)

        self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
        self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_piece_size()

    @property
    def vocab(self):
        return {
            self.tokenizer.id_to_piece(idx): idx
            for idx in range(self.tokenizer.get_piece_size())
        }

    @property
    def inv_vocab(self):
        return {
            idx: self.tokenizer.id_to_piece(idx)
            for idx in range(self.tokenizer.get_piece_size())
        }

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFTokenizer(AbstractTokenizer):
    """Designed to Integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        name = "HFTokenizer"
        super().__init__(name)

        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class HFGPT2Tokenizer(AbstractTokenizer):
    """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""

    def __init__(self, vocab_file=None, fast=True):
        name = "HFGPT2Tokenizer"
        if fast:
            name += "Fast"
        super().__init__(name)
        if vocab_file is None:
            vocab_file = "gpt2"
        if fast:
            self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
        else:
            self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)

        self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
        self.eod_id = self.tokenizer.eos_token_id
        self.pad_id = self.tokenizer.pad_token_id

    @property
    def vocab_size(self):
        return len(self.tokenizer)

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer._tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, str):
            text_batch = [text_batch]
        return [self.tokenize(t) for t in text_batch]

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class CharLevelTokenizer(AbstractTokenizer):
    """Character Level Tokenizer"""

    def __init__(self, vocab_size):
        name = "CharLevelTokenizer"
        super().__init__(name)
        self._vocab_size = vocab_size
        self.eod_id = 0
        self.pad_id = 1

    def clamp(self, n):
        return max(32, min(n, self.vocab_size))

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def vocab(self):
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    def decode_token(self, token: int):
        return str(chr(self.clamp(token)))

    def tokenize(self, text: str):
        return list(np.fromstring(text, dtype=np.uint8))

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        if isinstance(text_batch, list):
            return [self.tokenize(s) for s in text_batch]
        else:
            return self.tokenize(text_batch)

    def detokenize(self, token_ids):
        return "".join(list(map(self.decode_token, token_ids)))

    @property
    def eod(self):
        return self.eod_id


class TiktokenTokenizer(AbstractTokenizer):
    """Tokenizer from OpenAI's tiktoken implementation"""

    def __init__(self, vocab_file):
        try:
            import tiktoken
        except ModuleNotFoundError:
            print("Please install tiktoken: (https://github.com/openai/tiktoken)")
            raise Exception

        name = "TiktokenTokenizer"
        super().__init__(name)

        self.tokenizer = tiktoken.get_encoding(vocab_file)
        self.eod_id = self.tokenizer.eot_token
        self.pad_id = None

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    @property
    def vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access."
        )

    @property
    def inv_vocab(self):
        raise NotImplementedError(
            "TiktokenTokenizer does not implement vocabulary access. \
                To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
        )

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)  # , allowed_special="all")

    def tokenize_batch(self, text_batch: List[str]):
        return self.tokenizer.encode_batch(text_batch, allowed_special="all")

    def detokenize(self, token_ids):
        return self.tokenizer.decode(tokens=token_ids, errors="strict")

    @property
    def eod(self):
        return self.eod_id

    @property
    def pad(self):
        raise NotImplementedError
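
For reference, the deleted `TiktokenTokenizer` wrapper can be exercised as below. This is a minimal sketch, assuming `tiktoken` is installed; "cl100k_base" is a standard tiktoken encoding name used here purely as an illustrative `vocab_file` value.

```python
tok = TiktokenTokenizer("cl100k_base")   # internally: tiktoken.get_encoding("cl100k_base")
ids = tok.tokenize("hello world")
assert tok.detokenize(ids) == "hello world"
print(tok.vocab_size)   # == tok.tokenizer.n_vocab
print(tok.eod)          # id of tiktoken's end-of-text token (eot_token)
```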
vocab/gpt_neox_chinese_v1/tokenizer/train_tokenizer.py
DELETED
@@ -1,126 +0,0 @@
# Copyright (c) 2021, EleutherAI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Assumes a dataset of jsonl files in the same format as the neox training set.
"""

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from tokenizers.normalizers import NFKC

from glob import glob
import os
import json
import argparse


def load_jsonl(input_path, quiet=True) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line.rstrip("\n|\r")))
    if not quiet:
        print("Loaded {} records from {}".format(len(data), input_path))
    return data


def json_iterator(input_dir, text_key="text"):
    all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
    for j in all_jsonls:
        data = load_jsonl(j)
        for doc in data:
            yield doc[text_key]


def train_tokenizer(
    input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
):
    """
    Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
    )
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f"Tokenizer saved at {save_path}")


def parse_args():
    parser = argparse.ArgumentParser(
        description="script for training a multilingual "
        "HF tokenizer on CC dumps with upweighting for low resource languages"
    )
    parser.add_argument(
        "--json_input_dir",
        type=str,
        help="Path to folder containing tokenizer training data in jsonl format",
    )
    parser.add_argument(
        "--tokenizer_output_path",
        type=str,
        help="Path to which your trained tokenizer will be saved (should end in .json)",
    )
    parser.add_argument(
        "--tokenizer_type",
        type=str,
        help="type of tokenizer to train, currently only BPE is supported",
        choices=["BPE"],
        default=["BPE"],
    )
    parser.add_argument(
        "-v",
        "--vocab_size",
        help="vocabulary size of tokenizer, default=52k",
        type=int,
        default=52000,
    )
    return parser.parse_args()


if __name__ == "__main__":

    args = parse_args()

    train_tokenizer(
        args.json_input_dir,
        save_path=args.tokenizer_output_path,
        tokenizer_type=args.tokenizer_type,
        vocab_size=args.vocab_size,
    )
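
A hedged example of driving the deleted `train_tokenizer()` helper programmatically; the input directory and output path below are placeholders, not paths from this repo.

```python
# Trains a byte-level BPE vocab with NFKC normalization, as configured in the script above.
train_tokenizer(
    input_dir="data/my_jsonl_corpus",   # folder of *.jsonl files with a "text" field
    save_path="my_tokenizer.json",
    tokenizer_type="BPE",
    vocab_size=52000,
)
```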
vocab/gpt_neox_chinese_v1/trouble-shooting.md
DELETED
@@ -1,22 +0,0 @@

## Exception: data did not match any variant of untagged enum ModelWrapper at line 108219 column 3


## The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !

```
The OrderedVocab you are attempting to save contains a hole for index 50254, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50255, your vocabulary could be corrupted !
The OrderedVocab you are attempting to save contains a hole for index 50256, your vocabulary could be corrupted !
```

Cause: tokens such as 50254 are not defined in the vocab itself; they are only defined in `added_tokens`.

## ss
vocab/moss/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 
 import os
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
vocab/text_davinci_003/__init__.py
CHANGED
@@ -1,70 +1,25 @@
-"""
-
-"""
-
-import tiktoken
-from tiktoken import Encoding
-from utils.log_util import logger
-
-tokenizer = tiktoken.encoding_for_model('text-davinci-003')
-tokenizer.vocab_size = tokenizer.n_vocab
-
-tokenizer.comments = ""
-tokenizer.reversible = True
-
-"""
-The default decode may raise errors; see decode_test.py for details.
-skip_special_tokens is kept for compatibility with hf_tokenizer.
-"""
-try:
-    decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
-except:
-    decode_str = "null"
-return decode_str
-
-""
-
-"""
-try:
-    return tokenizer.decode_tokens_bytes(tokens)
-except:
-    # Why return None? See zh_util.py.
-    # 16 unused ids: 100256, 100261-100275
-    return [None for token in tokens]
-
-def get_vocab(self, token_type="str"):
-    """Returns vocab as a dict
-    :param token_type: ["str", "byte"]
-    :return:
-    """
-    vocab = {}
-    key_error_list = []
-    unicode_decode_error_list = []
-    for i in range(self.vocab_size):
-        try:
-            token_byte = self.convert_ids_to_tokens([i])[0]
-            if token_byte is None:
-                continue
-            # token_str = token_byte.decode("utf-8")
-            vocab[token_byte] = i
-
-            vocab[token_byte] = i
-
-    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
-    return vocab
-
-# tiktoken patch
-Encoding.decode = decode
-Encoding.convert_ids_to_tokens = convert_ids_to_tokens
-Encoding.get_vocab = get_vocab
+"""
+,请
+
+## tiktoken API
+
+tokens = enc.encode("hello world")
+assert enc.decode(tokens) == "hello world"
+assert enc.decode_bytes(tokens) == b"hello world"
+assert enc.decode_tokens_bytes(tokens) == [b"hello", b" world"]
+
+decode_single_token_bytes
+"""
+
+import tiktoken
+import tokenizer.tiktoken_patch
+
+tokenizer = tiktoken.encoding_for_model('text-davinci-003')
+tokenizer.vocab_size = tokenizer.n_vocab
+
+tokenizer.comments = ""
+tokenizer.reversible = True
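
A hedged usage sketch of the refactored module: it assumes the repo root is on `PYTHONPATH` and that the monkey-patching removed from this file is now handled by the `tokenizer.tiktoken_patch` module imported above.

```python
from vocab.text_davinci_003 import tokenizer

ids = tokenizer.encode("hello world")
print(tokenizer.vocab_size)   # aliased to tokenizer.n_vocab in the new __init__.py
print(tokenizer.decode(ids))  # "hello world"
```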