Spaces:

xu-song
/

tokenizer-arena

Running

App Files Files Community

xu-song committed on Apr 22

Commit

814ee6b

•

1 Parent(s): a6aee1d

add compress rate

Browse files

Files changed (40) hide show

.gitattributes +2 -2
README.md +116 -1
app.py +66 -9
config.py +11 -1
examples.py +2 -0
requirements.txt +1 -1
tokenizer/chinese_sptokenizer_patch.py +5 -0
tokenizer/sptokenizer_patch.py +97 -0
tokenizer/tiktoken_patch.py +7 -1
tokenizer/tokenizer_patcher.py +5 -0
util.py +17 -6
utils/compress_rate_util.py +176 -2
utils/digit_util.py +6 -0
utils/text_util.py +54 -10
utils/zh_util.py +98 -42
vocab/README.md +39 -1
vocab/__init__.py +11 -3
vocab/bert_base_chinese/test_zh_coding_len.py +2 -2
vocab/bloom/test_zh_coding_len.py +1 -1
vocab/bloomz_6b4_zh/__init__.py +0 -2
vocab/glm/test_tokenizer.py +1 -1
vocab/glm_chinese/__init__.py +21 -0
vocab/glm_chinese/test.py +5 -2
vocab/gpt2/README.md +10 -31
vocab/gpt_35_turbo/__init__.py +0 -1
vocab/gpt_35_turbo/decode_test.py +9 -2
vocab/gpt_35_turbo/test_tiktoken.py +4 -1
vocab/gpt_35_turbo/vocab.jsonl +311 -0
vocab/gpt_nexo_20b/README.md +14 -1
vocab/gpt_nexo_20b/test_tokenizer.py +47 -3
vocab/gpt_nexo_20b/tokenzier_hf/README.md +0 -6
vocab/jamba_v0_1/__init__.py +9 -0
vocab/kplug/__init__.py +1 -1
vocab/llama/gpt_neox/get_oov_zh_tokens.py +2 -2
vocab/llama3/Meta-Llama-3-70B/special_tokens_map.json +4 -0
vocab/llama3/Meta-Llama-3-70B/tokenizer.json +3 -0
vocab/llama3/Meta-Llama-3-70B/tokenizer_config.json +2062 -0
vocab/llama3/__init__.py +9 -0
vocab/mobilenet_v2/__init__.py +4 -0
vocab/moss/test_zh_coding_len.py +2 -2

.gitattributes CHANGED Viewed

@@ -37,5 +37,5 @@ vocab/belle_7b_2m/belle-7b-2m/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
 vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
-vocab/

 vocab/bloom/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 vocab/gemma_7b/gemma-7b/tokenizer.model filter=lfs diff=lfs merge=lfs -text
 vocab/gemma_7b/gemma-7b/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+vocab/grok_1/tokenizer.model filter=lfs diff=lfs merge=lfs -text
+vocab/llama3/Meta-Llama-3-70B/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -32,4 +32,119 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 -
-https://huggingface.co/spaces/yenniejun/tokenizers-languages

 -
+https://huggingface.co/spaces/yenniejun/tokenizers-languages
+## gradio app
+- https://arena.lmsys.org/
+## lang
+## number
+## diff
+## Compress Rate
+**简介**
+Each tokenizer is run over the cc-100 corpus; the resulting compress rates are listed below.
+| tokenizer                   |   vocab_size |   g_bytes/b_tokens |   t_bytes/t_tokens |   b_tokens/g_bytes |
+|:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|
+| amber                       |        32000 |               1.84 |               1.8  |               0.54 |
+| aya_101                     |       250100 |               3.89 |               3.79 |               0.26 |
+| baichuan                    |        64000 |               3.92 |               3.82 |               0.26 |
+| baichuan2                   |       125696 |               4.53 |               4.42 |               0.22 |
+| bert_base_cased             |        28996 |               2.73 |               2.66 |               0.37 |
+| bert_base_chinese           |        21128 |               2.74 |               2.67 |               0.37 |
+| bert_base_uncased           |        30522 |               2.73 |               2.67 |               0.37 |
+| bloom                       |       250680 |               4.28 |               4.18 |               0.23 |
+| byt5_small                  |          256 |               0.93 |               0.91 |               1.08 |
+| character_glm_6b            |        64794 |               4.2  |               4.1  |               0.24 |
+| chatglm2_6b                 |        64794 |               4.2  |               4.1  |               0.24 |
+| chatglm3_6b                 |        64798 |               4.2  |               4.1  |               0.24 |
+| chatglm_6b                  |       150344 |               4.65 |               4.54 |               0.22 |
+| chatyuan_large_v2           |        32128 |               4.34 |               4.24 |               0.23 |
+| chinese_llama               |        49953 |               3.93 |               3.84 |               0.25 |
+| chinese_llama2              |        55296 |               3.92 |               3.83 |               0.26 |
+| code_davinci_002            |        50281 |               1.31 |               1.28 |               0.77 |
+| crystal_coder               |        32000 |               1.86 |               1.81 |               0.54 |
+| deepseek_coder_33b_instruct |        32000 |               3.4  |               3.32 |               0.29 |
+| deepseek_llm_7b_base        |       100000 |               4.05 |               3.96 |               0.25 |
+| falcon_180b                 |        65024 |               2.18 |               2.13 |               0.46 |
+| falcon_7b                   |        65024 |               2.18 |               2.13 |               0.46 |
+| fastchat_t5_3b              |        32000 |              13.7  |              13.38 |               0.07 |
+| flan_t5_base                |        32100 |              14.13 |              13.8  |               0.07 |
+| gemma_7b                    |       256000 |               3.82 |               3.73 |               0.26 |
+| gpt2                        |        50257 |               1.31 |               1.28 |               0.77 |
+| gpt2_chinese                |        21128 |               2.73 |               2.66 |               0.37 |
+| gpt_35_turbo                |       100277 |               2.26 |               2.21 |               0.44 |
+| gpt_4                       |       100277 |               2.26 |               2.21 |               0.44 |
+| gpt_nexo_20b                |        50254 |               2.01 |               1.96 |               0.5  |
+| internlm2_chat_7b           |        92544 |               4.23 |               4.13 |               0.24 |
+| internlm2_math_7b           |        92544 |               4.23 |               4.13 |               0.24 |
+| internlm_chat_7b            |       103168 |               4.23 |               4.14 |               0.24 |
+| internlm_xcomposer_7b       |       103168 |               4.23 |               4.14 |               0.24 |
+| kplug                       |        10261 |               2.72 |               2.65 |               0.37 |
+| llama                       |        32000 |               1.84 |               1.8  |               0.54 |
+| llama2                      |        32000 |               1.84 |               1.8  |               0.54 |
+| mistral_7b                  |        32000 |               2.36 |               2.3  |               0.42 |
+| mixtral_8_7b                |        32000 |               2.36 |               2.3  |               0.42 |
+| mobilebert_uncased          |        30522 |               2.73 |               2.67 |               0.37 |
+| moss                        |       106029 |               4.4  |               4.3  |               0.23 |
+| mt5_large                   |       250100 |               3.89 |               3.79 |               0.26 |
+| olmo_7b                     |        50280 |               2.01 |               1.96 |               0.5  |
+| orion_14b_chat              |        84608 |               4.63 |               4.52 |               0.22 |
+| phi_1                       |        50257 |               1.31 |               1.28 |               0.77 |
+| phi_2                       |        50257 |               1.31 |               1.28 |               0.77 |
+| pko_t5_large                |        50258 |               0.97 |               0.95 |               1.03 |
+| prompt_clue                 |        32128 |               4.34 |               4.24 |               0.23 |
+| qwen1_5_14b_chat            |       151643 |               4.16 |               4.06 |               0.24 |
+| qwen_1_8b_chat              |       151851 |               4.16 |               4.06 |               0.24 |
+| qwen_72b_chat               |       151851 |               4.16 |               4.06 |               0.24 |
+| qwen_7b_chat                |       151851 |               4.16 |               4.06 |               0.24 |
+| roberta_chinese_clue        |         8021 |               2.7  |               2.64 |               0.37 |
+| skywork_13b_base            |        65519 |               3.69 |               3.61 |               0.27 |
+| skywork_13b_math            |        65519 |               3.69 |               3.61 |               0.27 |
+| solar_10_7b                 |        32000 |               2.36 |               2.3  |               0.42 |
+| starchat_alpha              |        49152 |               2.78 |               2.72 |               0.36 |
+| switch_c_2048               |        32100 |              14.13 |              13.8  |               0.07 |
+| t5_base                     |        32100 |              14.13 |              13.8  |               0.07 |
+| t5_large                    |        32100 |              14.13 |              13.8  |               0.07 |
+| t5_small                    |        32100 |              14.13 |              13.8  |               0.07 |
+| text_davinci_003            |        50281 |               1.31 |               1.28 |               0.77 |
+| tigerbot_13b_chat_v2        |        60512 |               4.25 |               4.15 |               0.24 |
+| tigerbot_70b_chat_v4_4k     |        65107 |               4.25 |               4.15 |               0.24 |
+| wizardcoder_15b_v1          |        49152 |               2.78 |               2.72 |               0.36 |
+| wizardcoder_python_7b_v1    |        32000 |               1.84 |               1.8  |               0.54 |
+| wizardlm_7b_v1              |        32000 |               1.84 |               1.8  |               0.54 |
+| wizardmath_70b_v1           |        32000 |               1.84 |               1.8  |               0.54 |
+| xlm_roberta                 |       250002 |               3.96 |               3.86 |               0.25 |
+| yi_34b                      |        64000 |               4.17 |               4.07 |               0.24 |
+| yi_6b                       |        64000 |               4.17 |               4.07 |               0.24 |
+| yi_vl34b                    |        64000 |               4.11 |               4.02 |               0.24 |
+| zephyr_7b_beta              |        32000 |               2.36 |               2.3  |               0.42 |
+**结论**
+larger vocabulary sizes
+## Reference
+- Getting the most out of your tokenizer for pre-training and domain adaptation
+- Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
+- https://huggingface.co/spaces/Xenova/the-tokenizer-playground

app.py CHANGED Viewed

@@ -73,6 +73,31 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
         show_label=False,
     )
     gr.Markdown("## Tokenization")
     with gr.Row():
         with gr.Column(scale=6):
             with gr.Group():
@@ -86,13 +111,19 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
                     """
                     with gr.Row():
                         stats_vocab_size_1 = gr.TextArea(
-                            label="VocabSize",
                             lines=1,
                             elem_classes="statistics"
                         )
                         stats_zh_token_size_1 = gr.TextArea(
                             label="ZH char/word",
                             lines=1,
                             elem_classes="statistics"
                         )
                         stats_overlap_token_size_1 = gr.TextArea(
@@ -126,13 +157,20 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
                         stats_zh_token_size_2 = gr.TextArea(
                             label="ZH char/word",  # 中文字/词
                             lines=1,
                             elem_classes="statistics"
                         )
-                        # stats_6 = gr.TextArea(
-                        #     label="Compress Rate",
-                        #     lines=1,
-                        #     elem_classes="statistics"
-                        # )
                         stats_overlap_token_size_2 = gr.TextArea(
                             label="Overlap Tokens",
                             lines=1,
@@ -141,6 +179,7 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
     # TODO: 图 表 压缩率
     with gr.Row():
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
                 show_legend=True,
@@ -156,12 +195,21 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
         output_table_1 = gr.Dataframe()
         output_table_2 = gr.Dataframe()
     tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
                             [output_text_1, output_table_1])
     tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
     tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     user_input.change(tokenize_pair,
                       [user_input, tokenizer_type_1, tokenizer_type_2],
                       [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1
@@ -171,6 +219,15 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
     tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
     tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
     dropdown_examples.change(
         example_fn,
@@ -178,15 +235,15 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
         [user_input, tokenizer_type_1, tokenizer_type_2]
     )
-    demo.load(_js=open("js/onload.js", "r", encoding="utf-8").read())
     demo.load(
         fn=on_load,
         inputs=[user_input],  # 这里只需要传个空object即可。
         outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
-        _js=get_window_url_params
     )
 if __name__ == "__main__":
     # demo.queue(max_size=20).launch()
     demo.launch()

         show_label=False,
     )
     gr.Markdown("## Tokenization")
+    # compress rate setting
+    with gr.Accordion("Compress Rate Setting", open=True):
+        gr.Markdown("Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
+        with gr.Row():
+            compress_rate_corpus = gr.CheckboxGroup(
+                ["cc100-en", "cc100-zh-Hans", "cc100-es", "code"],
+                value=["cc100-en", "cc100-zh-Hans"],
+                label="corpus",
+                # info=""
+            )
+            compress_rate_unit = gr.Radio(
+                ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens"],
+                value="b_tokens/g_bytes",
+                label="unit",
+            )
+    # TODO: Token Setting
+    # with gr.Accordion("Token Filter Setting", open=False):
+    #     gr.Markdown(
+    #         "Get total number of tokens which contain the following character)")
+    #     gr.Radio(
+    #         ["zh-Hans", "", "number", "space"],
+    #         value="zh",
+    #     )
     with gr.Row():
         with gr.Column(scale=6):
             with gr.Group():
                     """
                     with gr.Row():
                         stats_vocab_size_1 = gr.TextArea(
+                            label="Vocab Size",
                             lines=1,
                             elem_classes="statistics"
                         )
                         stats_zh_token_size_1 = gr.TextArea(
                             label="ZH char/word",
                             lines=1,
+                            elem_classes="statistics",
+                            visible=False
+                        )
+                        stats_compress_rate_1 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
                             elem_classes="statistics"
                         )
                         stats_overlap_token_size_1 = gr.TextArea(
                         stats_zh_token_size_2 = gr.TextArea(
                             label="ZH char/word",  # 中文字/词
                             lines=1,
+                            elem_classes="statistics",
+                            visible=False
+                        )
+                        stats_compress_rate_2 = gr.TextArea(
+                            label="Compress Rate",
+                            lines=1,
                             elem_classes="statistics"
                         )
+                        stats_filtered_token_2 = gr.TextArea(
+                            label="filtered tokens",
+                            lines=1,
+                            elem_classes="statistics",
+                            visible=False
+                        )
                         stats_overlap_token_size_2 = gr.TextArea(
                             label="Overlap Tokens",
                             lines=1,
     # TODO: 图 表 压缩率
     with gr.Row():
+        # dynamic change label
         with gr.Column():
             output_text_1 = gr.Highlightedtext(
                 show_legend=True,
         output_table_1 = gr.Dataframe()
         output_table_2 = gr.Dataframe()
+    # setting
+    # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
+    #                             [stats_compress_rate_1, stats_compress_rate_2])
     tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
                             [output_text_1, output_table_1])
     tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
     tokenizer_type_1.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
+    tokenizer_type_1.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
+                            [stats_compress_rate_1])
+    # TODO: every=3
     user_input.change(tokenize_pair,
                       [user_input, tokenizer_type_1, tokenizer_type_2],
                       [output_text_1, output_table_1, output_text_2, output_table_2])  # , pass_request=1
     tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
     tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
                             [stats_overlap_token_size_1, stats_overlap_token_size_2])
+    tokenizer_type_2.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
+                            [stats_compress_rate_2])
+    compress_rate_unit.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
+                            [stats_compress_rate_1])
+    compress_rate_unit.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
+                            [stats_compress_rate_2])
     dropdown_examples.change(
         example_fn,
         [user_input, tokenizer_type_1, tokenizer_type_2]
     )
+    demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
     demo.load(
         fn=on_load,
         inputs=[user_input],  # 这里只需要传个空object即可。
         outputs=[user_input, tokenizer_type_1, tokenizer_type_2],
+        js=get_window_url_params
     )
 if __name__ == "__main__":
     # demo.queue(max_size=20).launch()
     demo.launch()
+    # demo.launch(share=True)

config.py CHANGED Viewed

@@ -1,2 +1,12 @@
-USE_REMOTE = False
 ADD_SPECIAL_TOKEN = False

+USE_REMOTE = False  # use remote tokenizer or local tokenizer
+# load_vocab_with_SPECIAL_TOKEN = True # If special tokens are not included, the vocab size is computed incorrectly and the overlap_token counts become inconsistent.
+# encoding config
 ADD_SPECIAL_TOKEN = False
+#
+LAZY_IMPORT = True
+# DEBUG: set the environment variable RUST_BACKTRACE=full
+#

examples.py CHANGED Viewed

@@ -24,6 +24,7 @@ examples = {
         # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "gemma_7b", "llama"],  # llama词典有点小
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
     ],
     "zh": [
         ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
@@ -38,6 +39,7 @@ more_examples = [
     # bert VS clue
     # bert系列
     ("bert_base_cased", "bert_base_uncased", ""),  # # clue VS kplug， bert VS clue
     # llama系列 (基于sentencepiece)
     ("baichuan", "baichuan2", "baichuan2支持多空格   ，多个换行\n\n\n，do not add dummy prefix as Baichuan1"),

         # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
         ["punctuation: ,.:/?+=\"，。！？；【】〔〕〖〗", "gemma_7b", "llama"],  # llama词典有点小
         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["special: [PAD] [UNK] [CLS] [SEP] [MASK] "],
     ],
     "zh": [
         ["空格测试：  2个空格        8个空格", "llama", "chatglm2_6b"],  # chatglm 有blank_n,
     # bert VS clue
     # bert系列
     ("bert_base_cased", "bert_base_uncased", ""),  # # clue VS kplug， bert VS clue
+    ("bert_base_cased", "clue", ""),
     # llama系列 (基于sentencepiece)
     ("baichuan", "baichuan2", "baichuan2支持多空格   ，多个换行\n\n\n，do not add dummy prefix as Baichuan1"),

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-transformers==4.38.0
 sentencepiece
 tiktoken
 icetk

+transformers
 sentencepiece
 tiktoken
 icetk

tokenizer/chinese_sptokenizer_patch.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""
+ref: glm_chinese
+"""

tokenizer/sptokenizer_patch.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""
+## usage
+- grok
+## 风险评估
+- 会干扰 sentencepiece.SentencePieceProcessor的正常使用吗？
+"""
+import sentencepiece
+@property
+def vocab_size(self):
+    """Returns vocab size"""
+    # Declared as a property so callers read `vocab_size` as an attribute;
+    # the value comes from the processor's own get_piece_size().
+    return self.get_piece_size()
+def get_vocab(self):
+    """Returns vocab as a dict"""
+    # Maps token string -> id for every id in range(self.vocab_size).
+    # If two ids yield the same token string, the later id overwrites the earlier one.
+    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+    # vocab.update(self.added_tokens_encoder)
+    return vocab
+def _tokenize(self, text):
+    """Returns a tokenized string."""
+    # NOTE: this helper is never attached to SentencePieceProcessor; the
+    # matching assignment at the bottom of this module is commented out.
+    # out_type=str is forwarded to encode to request string pieces.
+    return self.encode(text, out_type=str)
+def _convert_token_to_id(self, token):
+    """Converts a token (str) in an id using the vocab."""
+    # NOTE: defined but not attached to SentencePieceProcessor in the
+    # assignments at the bottom of this module.
+    return self.piece_to_id(token)
+def _convert_id_to_token(self, index):
+    """Converts an index (integer) in a token (str) using the vocab."""
+    # Thin wrapper over the processor's IdToPiece; used by convert_ids_to_tokens.
+    token = self.IdToPiece(index)
+    return token
+def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+    """ copy from transformers.PreTrainedTokenizer
+    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
+    added tokens.
+    Args:
+        ids (`int` or `List[int]`):
+            The token id (or token ids) to convert to tokens.
+        skip_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to remove special tokens in the decoding.
+    Returns:
+        `str` or `List[str]`: The decoded token(s).
+    """
+    self._added_tokens_decoder = {}  # add by xs
+    if isinstance(ids, int):
+        if ids in self._added_tokens_decoder:
+            return self._added_tokens_decoder[ids].content
+        else:
+            return self._convert_id_to_token(ids)
+    tokens = []
+    for index in ids:
+        index = int(index)
+        if skip_special_tokens and index in self.all_special_ids:
+            continue
+        if index in self._added_tokens_decoder:
+            tokens.append(self._added_tokens_decoder[index].content)
+        else:
+            tokens.append(self._convert_id_to_token(index))
+    return tokens
+def encode(self, *args, **kwargs):
+    """
+    add_special_token 是为了兼容 hf_tokenizer
+    """
+    # English: the add_special_tokens keyword is accepted only for
+    # compatibility with the hf_tokenizer call signature.
+    # Both keywords below are discarded (if present) before delegating to
+    # the processor's Encode.
+    kwargs.pop("add_special_tokens", None)
+    kwargs.pop("allowed_special", None)
+    return self.Encode(*args, **kwargs)
+def decode(self, *args, **kwargs):
+    # Discards the skip_special_tokens keyword (if present), then delegates
+    # to the processor's Decode with the remaining arguments.
+    kwargs.pop("skip_special_tokens", None)
+    return self.Decode(*args, **kwargs)
+# Attach the helpers above as class attributes of SentencePieceProcessor.
+# This runs at import time and applies to the class itself, so every
+# processor instance in the process sees the patched attributes.
+# NOTE(review): presumably this replaces the library's own vocab_size /
+# encode / decode on the class — confirm that no other user depends on them.
+sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
+sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
+sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
+sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
+# sentencepiece.SentencePieceProcessor.tokenize = _tokenize
+sentencepiece.SentencePieceProcessor.encode = encode
+sentencepiece.SentencePieceProcessor.decode = decode

tokenizer/tiktoken_patch.py CHANGED Viewed

@@ -17,7 +17,6 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
         "namereplace"
     """
     try:
-        print(tokens)
         decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
     except Exception as e:  # 捕捉不到 PyO3PanicException
         logger.error(f"{e} for {tokens} -> return 'null'")
@@ -69,6 +68,12 @@ def get_vocab(self, token_type="str"):
     return vocab
 def encode(self, *args, **kwargs):
     """
     add_special_token 是为了兼容 hf_tokenizer
@@ -84,3 +89,4 @@ Encoding.encode = encode
 Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab

         "namereplace"
     """
     try:
         decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
     except Exception as e:  # 捕捉不到 PyO3PanicException
         logger.error(f"{e} for {tokens} -> return 'null'")
     return vocab
+@property
+def vocab_size(self):
+    """Returns vocab size"""
+    # Reads the encoding's n_vocab attribute; this property is attached to
+    # Encoding by the assignment at the end of this module.
+    return self.n_vocab
 def encode(self, *args, **kwargs):
     """
     add_special_token 是为了兼容 hf_tokenizer
 Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab
+Encoding.vocab_size = vocab_size

tokenizer/tokenizer_patcher.py ADDED Viewed

	@@ -0,0 +1,5 @@

+def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
+    if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
+        tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)

util.py CHANGED Viewed

@@ -1,13 +1,12 @@
 import gradio as gr
 import json
-import socket
 import pandas as pd
 import config
 from vocab import load_tokener
 from utils.zh_util import iter_vocab
 from utils.log_util import logger
 from functools import lru_cache
-from urllib.parse import urlparse, parse_qs
 @lru_cache
@@ -83,8 +82,16 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
 @lru_cache
 def basic_count(tokenizer_type):
     tokenizer = load_tokener(tokenizer_type)
-    stats = iter_vocab(tokenizer, tokenizer_type)
-    return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
 @lru_cache
@@ -110,8 +117,9 @@ def get_overlap_token_size(tokenizer_type_1, tokenizer_type_2):
     return overlap_token_size, overlap_token_size
-default_user_input = """Replace this text in the input field to see how tokenization works
-华为发布Mate60手机
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama"
 # default_tokenizer_type_2 = "internlm_chat_7b"
@@ -147,6 +155,9 @@ def on_load(url_params, request: gr.Request):
     return text, tokenizer_type_1, tokenizer_type_2
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'

 import gradio as gr
 import json
 import pandas as pd
 import config
 from vocab import load_tokener
 from utils.zh_util import iter_vocab
 from utils.log_util import logger
+from utils.compress_rate_util import tokenize_corpus, unit_convertor
 from functools import lru_cache
 @lru_cache
 @lru_cache
 def basic_count(tokenizer_type):
+    # Returns a pair: the tokenizer's vocab_size and the "中文token数"
+    # (number of Chinese tokens) entry of iter_vocab's stats, rendered as a string.
+    # lru_cache memoizes the result per tokenizer_type.
     tokenizer = load_tokener(tokenizer_type)
+    stats = iter_vocab(tokenizer)
+    return tokenizer.vocab_size, f'{stats["中文token数"]}'
+    # Previous output format (single-character / multi-character Chinese token counts):
+    # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'
+def get_compress_rate(tokenizer_type, all_corpus, unit):
+    corpus_name = all_corpus[0]
+    tokenizer = load_tokener(tokenizer_type)
+    compress_rate_stats = tokenize_corpus(tokenizer, corpus_name)
+    compress_rate = unit_convertor(compress_rate_stats, unit)
+    return compress_rate
 @lru_cache
     return overlap_token_size, overlap_token_size
+default_user_input = """Replace this text in the input field to see how tokenization works.
+Buenos días!
+华为发布Mate60手机。
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama"
 # default_tokenizer_type_2 = "internlm_chat_7b"
     return text, tokenizer_type_1, tokenizer_type_2
+def compress_rate_unit_change(unit):
+    # Builds two identical gr.update objects that set the component label to
+    # "Compress Rate: <unit>". The trailing comma is redundant: the two
+    # values already form a tuple.
+    # NOTE(review): the only hookup visible in the app.py diff is commented out.
+    return gr.update(label=f"Compress Rate: {unit}"), gr.update(label=f"Compress Rate: {unit}"),
 def test_coding():
     bytes1 = b'\xe4\xb8\xad'
     print(bytes1)  # b'\xe4\xb8\xad'

utils/compress_rate_util.py CHANGED Viewed

@@ -1,7 +1,181 @@
 """
 中文数据：clue superclue
 英文数据：glue cnn_dailymail gigaword
-"""

 """
 中文数据：clue superclue
 英文数据：glue cnn_dailymail gigaword
+代码数据:
+数字：
+## 参考
+- https://github.com/baichuan-inc/Baichuan-7B  记录了不同分词器的压缩率
+  - 指标：猜测是 n_tokens/n_chars  (baichuan小，说明百川token少，压缩率高)
+  - Baichuan 0.73; llama 1.31;
+- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md  记录了不同分词器的压缩率
+  - 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) ，
+  - Qwen-7B 在很多语言上都有较高的压缩率 (high compression rate)
+  - 中文： llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9;  qwen7b 0.95
+  - 英文：
+  - 指标：猜测是 n_tokens / n_tokens_xlmR
+- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134
+  - 有压缩率的计算方式
+  - https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer
+  - 记录了不同分词器的压缩率
+  - 指标：
+- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24
+  - 有压缩率的计算方式 = {n_chars} / {n_tokens}
+  -
+- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354
+  - 这个可能不是
+- https://github.com/bojone/bytepiece/blob/main/README_en.md
+  - "bytes/token": the average number of bytes per token
+- Getting the most out of your tokenizer for pre-training and domain adaptation 👍
+  - 定义：
+    - NSL: 两个分词器的编码长度 比例，通常以 llama为基准
+    - average number of bytes per token. {n_bytes} / {n_tokens}
+  - higher compression rate  --
+- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py
+  - 定义：{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0，用 % 来表示。也有>1的情况。
+    -
+    - {Compressed Size} 指的是？
+  - 这里的压缩指的是 模型参数相关的。
+"""
+import json
+import os
+import pandas as pd
+from datasets import load_dataset
+from utils.log_util import logger
+from vocab import load_tokener
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
def get_n_bytes_of_string(string_text):
    """Return the number of bytes ``string_text`` occupies once encoded as UTF-8."""
    return len(string_text.encode("utf-8"))
def unit_convertor(stat, unit):
    """Turn raw corpus counters into a compress-rate figure.

    :param stat: mapping holding the counters ``n_tokens``, ``n_chars`` and ``n_bytes``.
    :param unit: one of ``"n_tokens/n_bytes"``, ``"n_chars/n_tokens"``, ``"n_tokens/n_chars"``,
        ``"g_bytes/b_tokens"``, ``"t_bytes/t_tokens"`` or ``"b_tokens/g_bytes"``.
    :return: the requested ratio, rounded to two decimals.
    :raises ValueError: if ``unit`` is not one of the names above.
    """
    n_tokens = stat["n_tokens"]
    n_chars = stat["n_chars"]
    n_bytes = stat["n_bytes"]

    # Token counts use decimal multiples (billion / trillion) ...
    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)

    # ... while byte counts use binary multiples (MiB / GiB / TiB).
    n_bytes_in_mb = n_bytes / (1024 * 1024)
    n_bytes_in_gb = n_bytes_in_mb / 1024
    n_bytes_in_tb = n_bytes_in_gb / 1024

    if unit == "n_tokens/n_bytes":
        value = n_tokens / n_bytes
    elif unit == "n_chars/n_tokens":  # important: average number of characters per token
        value = n_chars / n_tokens
    elif unit == "n_tokens/n_chars":  # how many tokens one character costs
        value = n_tokens / n_chars
    elif unit == "g_bytes/b_tokens":
        value = n_bytes_in_gb / n_tokens_in_billion
    elif unit == "t_bytes/t_tokens":  # important
        value = n_bytes_in_tb / n_tokens_in_trillion
    elif unit == "b_tokens/g_bytes":
        value = n_tokens_in_billion / n_bytes_in_gb
    else:
        # Raising a plain string is itself a TypeError in Python 3; raise a real exception.
        raise ValueError(f"measure not support: {unit!r}")
    return round(value, 2)
+all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]
def pprint(stats):
    """Log a markdown table with one row per tokenizer.

    Each row holds the tokenizer name, its ``vocab_size`` and, for every unit in
    ``all_units`` that is not already a key of the tokenizer's stat dict, the value
    computed by ``unit_convertor``.

    :param stats: mapping of tokenizer name -> stat dict.
    :return: None
    """
    rows = []
    for tokenizer_name, stat in stats.items():
        row = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
        row.update({unit: unit_convertor(stat, unit) for unit in all_units if unit not in stat})
        rows.append(row)
    logger.info(pd.DataFrame(rows).to_markdown(index=False))
    return
+cache = {}
def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
    """
    Count the tokens, characters and UTF-8 bytes ``tokenizer`` produces on one sample corpus.

    Tokenizing is slow, so the result is cached on its own: first in the module-level
    ``cache`` dict, then in one JSON file per (tokenizer, corpus) pair. The corpus is only
    loaded and tokenized when both lookups miss.

    :param tokenizer: object exposing ``alias``, ``vocab_size`` and ``encode(text)``.
    :param lang: corpus name; every ``"cc100-"`` substring is removed before the name is
        used as the dataset config and as part of the cache key.
    :param cache_dir: directory of the JSON cache, resolved relative to the parent of this
        file's directory; created when missing.
    :return: dict with the keys ``vocab_size``, ``n_bytes``, ``n_tokens`` and ``n_chars``.
    """

    def _tokenize(tokenizer, dataset):
        # Accumulate the three counters over the "text" field of every dataset item.
        n_tokens = 0
        n_chars = 0
        n_bytes = 0
        for item in dataset:
            text = item["text"]
            n_bytes += get_n_bytes_of_string(text)
            n_chars += len(text)
            encodings = tokenizer.encode(text)
            n_tokens += len(encodings)
        stat = {
            "vocab_size": tokenizer.vocab_size,
            "n_bytes": n_bytes,
            "n_tokens": n_tokens,
            "n_chars": n_chars,
        }
        return stat

    tokenizer_name = tokenizer.alias
    lang = lang.replace("cc100-", "")
    cache_id = f"{tokenizer_name}.{lang}"

    # L1: in-memory cache
    if cache_id in cache:
        logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    # L2: file cache
    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
    if os.path.exists(cache_path):
        logger.info(f"loading {cache_id} from file cache")
        # Context manager: the handle is closed as soon as the JSON is parsed.
        with open(cache_path, "r", encoding="utf-8") as f_in:
            stat = json.load(f_in)
        cache[cache_id] = stat
        return stat

    # tokenize corpus
    dataset = load_dataset("eson/cc100-samples", lang, split="train")
    stat = _tokenize(tokenizer, dataset)
    logger.info(f"saving {cache_id} to {cache_path}")
    # Context manager: guarantees the cache file is flushed and closed before returning.
    with open(cache_path, "w", encoding="utf-8") as f_out:
        json.dump(stat, f_out)
    logger.info(f"saving {cache_id} to in-memory cache")
    cache[cache_id] = stat
    return stat
def main():
    """Tokenize the "en" and "zh-Hans" corpora with the llama-family tokenizers and log a table per corpus."""
    from vocab import all_tokenizers  # only needed when switching the run to every tokenizer

    stats = {}
    for lang in ("en", "zh-Hans"):
        print("###" * 10 + lang)
        stats.update(
            (tokenizer_name, tokenize_corpus(load_tokener(tokenizer_name), lang))
            for tokenizer_name in ("llama", "llama2", "llama3")
        )
        pprint(stats)

utils/digit_util.py CHANGED Viewed

	@@ -0,0 +1,6 @@

+"""
+qwen segments numbers by single digits.
+"""

utils/text_util.py CHANGED Viewed

@@ -1,9 +1,7 @@
-def is_chinese(uchar):
     """
     https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
     re.compile("([\u4E00-\u9FD5]+)", re.U)
@@ -11,18 +9,33 @@ def is_chinese(uchar):
     return u'\u4e00' <= uchar <= u'\u9fa5'
-def has_chinese(text):
     """ contains Chinese characters """
-    return any(is_chinese(ch) for ch in text)
 def get_zh_count(text):
-    return sum([is_chinese(uchar) for uchar in text])
-def is_all_chinese(text):
-    return all(is_chinese(char) for char in text)
 def get_digit_count(text):
@@ -31,3 +44,34 @@ def get_digit_count(text):
         if char in "0123456789":
             digit_count += 1
     return digit_count

+from zhon.hanzi import punctuation as zh_punc
+def is_zh_char(uchar):
     """
     https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
     re.compile("([\u4E00-\u9FD5]+)", re.U)
     return u'\u4e00' <= uchar <= u'\u9fa5'
def has_zh(text):
    """Report whether at least one character of ``text`` satisfies ``is_zh_char``."""
    for char in text:
        if is_zh_char(char):
            return True
    return False
 def get_zh_count(text):
+    return sum([is_zh_char(uchar) for uchar in text])
def is_all_zh(text):
    """Report whether no character of ``text`` fails ``is_zh_char`` (so an empty string gives True)."""
    for char in text:
        if not is_zh_char(char):
            return False
    return True
def is_all_en(text):
    """Report whether the UTF-8 bytes of ``text`` are non-empty and all ASCII letters."""
    encoded = text.encode('utf-8')
    return encoded.isalpha()
def is_digit_char(uchar):
    """Report whether ``uchar`` is exactly one ASCII digit (``0``-``9``).

    The length check matters: ``in`` on a string is a substring test, so without it the
    empty string and runs such as ``"12"`` would also be accepted.
    """
    return len(uchar) == 1 and uchar in "0123456789"
def has_digit(text):
    """Report whether at least one character of ``text`` satisfies ``is_digit_char``."""
    for char in text:
        if is_digit_char(char):
            return True
    return False
def is_all_digit(text):
    """Report whether no character of ``text`` fails ``is_digit_char`` (so an empty string gives True)."""
    for char in text:
        if not is_digit_char(char):
            return False
    return True
 def get_digit_count(text):
         if char in "0123456789":
             digit_count += 1
     return digit_count
def has_zh_punc(text):
    """
    Report whether ``text`` contains at least one character found in ``zh_punc``.
    """
    for char in text:
        if char in zh_punc:
            return True
    return False
def is_space_char(uchar):
    """
    Report whether ``uchar`` is whitespace, i.e. ``str.strip`` reduces it to nothing.

    This is the same test ``get_space_count`` applies per character. Invisible characters
    that Python does not treat as whitespace (e.g. U+200B ZERO WIDTH SPACE, see
    https://emptycharacter.com/) are not counted as space.
    """
    return len(uchar.strip()) == 0


def has_space(text):
    """Report whether at least one character of ``text`` is whitespace."""
    return any(len(char.strip()) == 0 for char in text)


def is_all_space(text):
    """Report whether every character of ``text`` is whitespace (an empty string gives True)."""
    return all(len(char.strip()) == 0 for char in text)
def get_space_count(text):
    """Count the characters of ``text`` that ``str.strip`` reduces to an empty string."""
    return sum(1 for char in text if not char.strip())

utils/zh_util.py CHANGED Viewed

@@ -4,15 +4,18 @@ TODO: 繁体、简体、语种、
 import os
 import json
 from collections import Counter
-from utils.text_util import is_chinese, get_zh_count, get_digit_count
-from zhon.hanzi import punctuation as zh_punc
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
-             is_chinese(line.strip())]
 def zh_iterator():
     for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
         yield (chr(idx))
@@ -28,7 +31,11 @@ def get_coding_length(tokenizer, vocab, filter=None):
             continue
         if filter is not None and filter(word):
             continue
-        tokens = tokenizer.encode(word)
         all_length.append(len(tokens))
         # if len(tokens.ids) > 1:
         # if len(tokens) > 3:
@@ -39,21 +46,6 @@ def get_coding_length(tokenizer, vocab, filter=None):
     return dist_length, mean_length
-def has_zh_punc(text):
-    """
-    是否包含中文标点
-    """
-    return any(ch in zh_punc for ch in text)
-def get_space_count(text):
-    space_count = 0
-    for char in text:
-        if len(char.strip()) == 0:
-            space_count += 1
-    return space_count
 def remove_special_char():
     """
@@ -67,13 +59,39 @@ def remove_special_char():
 cache = {}
-def iter_vocab(tokenizer, name="", from_cache=True):
     if from_cache and name in cache:
         return cache[name]
-    f_out = open(name + "_vocab.jsonl", "w", encoding="utf-8")
-    zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
     # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
@@ -81,56 +99,89 @@ def iter_vocab(tokenizer, name="", from_cache=True):
     all_single_zh_tokens = set()
     zh_symbol_count = 0
     for token_id in range(tokenizer.vocab_size):
         decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
         # tokenizer.convert_tokens_to_string(tokens)
         if token is None:  # 有些词典有空的id（不连续）
             continue
         if isinstance(token, bytes):
             token = token.decode("utf-8", errors="ignore")
         digit_count = get_digit_count(decode_str)
-        zh_count = get_zh_count(decode_str)
         space_count = get_space_count(decode_str)
-        f_out.write(json.dumps(
             {"id": token_id,
              "token": token,
              "token_decode": decode_str,
              "token_len": len(decode_str),
-             "zh_count": zh_count,
-             "space_count": space_count,
-             "digit_count": digit_count,
              "zh_symbol_count": zh_symbol_count,
              },
-            ensure_ascii=False) + "\n"
-                    )
-        if zh_count >= 1:
-            zh_token_count["total"] += 1
-            if zh_count > 1:
-                zh_token_count["中文多字"] += 1
-            else:
-                zh_token_count["中文单字"] += 1
-                all_single_zh_tokens.add(decode_str.strip().replace("#", ""))
     #
-    dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_chinese(k))
     # TODO: 繁体字，简体字
-    zh_token_count["中文单字-去重后"] = len(all_single_zh_tokens)
     result = {
         "name": name,
         "impl": str(tokenizer.__class__),
         "vocab_size": tokenizer.vocab_size,
-        "中文汉字数": zh_token_count,
         "中文标点数": zh_symbol_count,
         "中文汉字编码长度均值": mean_length,
         "中文汉字编码长度分布": json.dumps(dist_length),
     }
     cache[name] = result
     return result
@@ -140,9 +191,14 @@ if __name__ == "__main__":
     # test_coding_length(zh_punc)
     # test_coding_length(zh_iterator())
-    from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
     # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
     # from vocab.baichuan2 import tokenizer;  name="baichuan2"
-    # from vocab.gpt_4 import tokenizer; name="gpt4"
     print(iter_vocab(tokenizer, name=name))

 import os
 import json
 from collections import Counter
+from utils.log_util import logger
+from utils.text_util import is_zh_char, is_all_zh, has_zh, is_all_digit, has_digit, get_zh_count, get_digit_count, get_space_count
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 zh_tokens = [line.strip() for line in open(os.path.join(CURRENT_DIR, "vocab.jd.txt.v2"), "r", encoding="utf-8") if
+             is_zh_char(line.strip())]
def to_unicode(text):
    """Spell out every character of ``text`` as a backslash-u escape with upper-case hex (at least four digits)."""
    escaped = []
    for char in text:
        escaped.append(r'\u{:04X}'.format(ord(char)))
    return ''.join(escaped)
 def zh_iterator():
     for idx in range(ord(u'\u4e00'), ord(u'\u9fa5')):
         yield (chr(idx))
             continue
         if filter is not None and filter(word):
             continue
+        try:
+            tokens = tokenizer.encode(word)
+        except Exception as e:
+            print(e)
         all_length.append(len(tokens))
         # if len(tokens.ids) > 1:
         # if len(tokens) > 3:
     return dist_length, mean_length
 def remove_special_char():
     """
 cache = {}
def iter_vocab(tokenizer, from_cache=True, cache_dir="stats/iter_vocab"):
    """
    Walk over every id of the tokenizer's vocabulary, tag each token by character class and
    return summary statistics.

    The summary, followed by one JSON record per token, is also written to
    ``<cache_dir>/<tokenizer.alias>.vocab.jsonl``. The scan is fast, so results are only
    memoised in the module-level ``cache`` dict; the file is never read back here.

    :param tokenizer: object exposing ``alias``, ``vocab_size``, ``decode`` and
        ``convert_ids_to_tokens``; it is also handed to ``get_coding_length``.
    :param from_cache: when true, return the in-memory result stored under the tokenizer's alias.
    :param cache_dir: output directory, resolved relative to the parent of this file's directory;
        created when missing.
    :return: dict of summary statistics; the keys are kept verbatim because callers index by them.
    """
    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)

    name = tokenizer.alias

    # L1 cache
    if from_cache and name in cache:
        logger.info(f"load {name} from cache")
        return cache[name]

    # L2 (file) cache: not recommended, the scan is cheap enough to redo.

    has_zh_tokens = []
    all_zh_tokens = []
    has_digit_tokens = []
    all_digit_tokens = []
    all_space_tokens = []

    zh_symbol_count = 0
    buffer = []
    for token_id in range(tokenizer.vocab_size):
        decode_str = tokenizer.decode([token_id], skip_special_tokens=False)
        token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
        tags = []

        if token is None:  # some vocabularies have unused (non-contiguous) ids
            continue
        if isinstance(token, bytes):
            token = token.decode("utf-8", errors="ignore")

        # The "all_*" checks are vacuously true for an empty string, so an empty decoding
        # is required to be non-empty before it is classified as all-Chinese / all-digit.
        if decode_str and is_all_zh(decode_str):
            tags.append("all_zh")
            all_zh_tokens.append(decode_str)
        elif has_zh(decode_str):
            tags.append("has_zh")
            has_zh_tokens.append(decode_str)

        if decode_str and is_all_digit(decode_str):
            tags.append("all_digit")
            all_digit_tokens.append(decode_str)
        elif has_digit(decode_str):
            tags.append("has_digit")
            has_digit_tokens.append(decode_str)

        # A token is "pure space" when every one of its characters counts as space.
        space_count = get_space_count(decode_str)
        if decode_str and space_count == len(decode_str):
            all_space_tokens.append(decode_str)

        zh_count = get_zh_count(decode_str)

        buffer.append(json.dumps(
            {"id": token_id,
             "token": token,
             "token_decode": decode_str,
             "token_dumps": json.dumps(token),
             "token_unicode": to_unicode(token),
             "token_len": len(decode_str),
             "zh_count": zh_count,  # number of Chinese characters in the decoded token
             "tags": tags,
             "zh_symbol_count": zh_symbol_count,
             "": "",
             },
            ensure_ascii=False) + "\n")

    dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))

    # TODO: traditional vs. simplified characters
    result = {
        "name": name,
        "impl": str(tokenizer.__class__),
        "vocab_size": tokenizer.vocab_size,
        # NOTE(review): counts only tokens that mix Chinese with other characters, because
        # all-Chinese tokens take the "all_zh" branch above — confirm this is intended.
        "中文token数": len(has_zh_tokens),
        "中文token的平均长度": None,
        "纯中文token的平均长度": None,
        "中文标点数": zh_symbol_count,
        "中文汉字编码长度均值": mean_length,
        "中文汉字编码长度分布": json.dumps(dist_length),
        # Token counts, not the per-character counters of whichever token came last.
        "纯数字token数": len(all_digit_tokens),
        "纯数字token的平均长度": None,
        "纯中文token数": None,
        "纯space的token数": len(all_space_tokens),
        "纯space的token的平均长度": None,
    }
    out_path = os.path.join(cache_dir, f"{name}.vocab.jsonl")
    logger.info(f"saving vocab to {out_path}")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
        for line in buffer:
            f_out.write(line)
    cache[name] = result
    return result
     # test_coding_length(zh_punc)
     # test_coding_length(zh_iterator())
+    # from vocab.chatglm2_6b import tokenizer; name = "chatglm2_6b"
     # from vocab.chatglm_6b import tokenizer; name="chatglm_6b"
     # from vocab.baichuan2 import tokenizer;  name="baichuan2"
+    from vocab.gpt_4 import tokenizer; name="gpt4"
+    # from vocab.gpt2 import tokenizer; name="gpt2"
+    # from vocab.qwen1_5_14b_chat import tokenizer; name="qwen1_5_14b_chat"
+    # from vocab.gpt_nexo_20b import tokenizer; name="gpt_nexo_20b"
+    # from vocab.fastchat_t5_3b import tokenizer; name="fastchat_t5_3b"
     print(iter_vocab(tokenizer, name=name))

vocab/README.md CHANGED Viewed

@@ -36,6 +36,14 @@ chatglm
 bloom
 ## bert
 ```
@@ -87,10 +95,40 @@ https://github.com/pytorch/fairseq/blob/master/tests/test_noising.py#L37
 - 类似的还有：moss
 ## 空格、tab、换行
 ## reversible and lossless
-It's reversible and lossless, so you can convert tokens back into the original text

 bloom
+## 最小词典
+mobilenet
+## ss
 ## bert
 ```
 - 类似的还有：moss
+### Ġ是什么
+It's a feature of byte-level BPE (an encoded space character).
+Ġ 表示空格，有的版本用Ä代替Ġ。
+```sh
+What's up with the tokenizer?
+# BPE后
+['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
+# 经过vocab.json编码后
+[ 2061,   338,  510,    351,    262,    11241,    7509,   30]
+# 经过dict.txt编码后（fairseq特有）
+[           其他数字                                         ]
+```
+<>
+疑问：up会加Ġ，为什么what不加Ġ，因为有个pre
+- https://github.com/pytorch/fairseq/issues/1716
+- https://github.com/huggingface/transformers/issues/1083
 ## 空格、tab、换行
 ## reversible and lossless
+It's reversible and lossless, so you can convert tokens back into the original text
+## diff

vocab/__init__.py CHANGED Viewed

@@ -70,7 +70,8 @@ uniq_tokenizers = [
     ""
 ]
-# TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 all_tokenizers = [
     ##### bert 系列
     ("bert_base_cased", "", "bert"),
@@ -101,6 +102,7 @@ all_tokenizers = [
     ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"),  # '中文单字': 700, '中文多字': 0
     ("llama2", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"),  #
     ("chinese_llama2", "", "sentencepiece"),  #
     # ("chinese_alpaca_lora_7b",  # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
@@ -154,7 +156,7 @@ all_tokenizers = [
     ("phi_2",),
     ("solar_10_7b",),
     ("mobilebert_uncased",),
-    ("mobilenet_v2",),
     ("switch_c_2048",),
     ("byt5_small",),
     ("mt5_large",),
@@ -168,7 +170,12 @@ all_tokenizers = [
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
-    ("zephyr_7b_beta",)
 ]
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
@@ -234,6 +241,7 @@ class TokenizerImpl(Enum):
 def load_tokener(model_name):
     tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
     return tokenizer

     ""
 ]
+# format: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
+# TODO: append link and description to the end of dropdown button.
 all_tokenizers = [
     ##### bert 系列
     ("bert_base_cased", "", "bert"),
     ("llama", "", "sentencepiece", "llama use single digits and thus uses 4 tokens to encode the number 1000"),  # '中文单字': 700, '中文多字': 0
     ("llama2", "", "sentencepiece"),
+    ("llama3", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"),  #
     ("chinese_llama2", "", "sentencepiece"),  #
     # ("chinese_alpaca_lora_7b",  # 中文Alpaca模型在上述中文LLaMA模型的基础上进一步使用了指令数据进行精调。
     ("phi_2",),
     ("solar_10_7b",),
     ("mobilebert_uncased",),
+    # ("mobilenet_v2",),  # error
     ("switch_c_2048",),
     ("byt5_small",),
     ("mt5_large",),
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
+    ("zephyr_7b_beta",),
+    ("jamba_v0_1", ),
+    ("dbrx_instruct", ),
+    ("grok_1",),
+    # ("claude",),
 ]
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
 def load_tokener(model_name):
     tokenizer = importlib.import_module("." + model_name, 'vocab').tokenizer
+    tokenizer.alias = model_name
     return tokenizer

vocab/bert_base_chinese/test_zh_coding_len.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from collections import Counter
 from transformers import AutoTokenizer
 from data_sample.oov_base import jd_vocab_tokens
-from utils.text_util import is_chinese, has_chinese
 from zhon.hanzi import punctuation as zh_punc
@@ -55,7 +55,7 @@ def iter_vocab():
     zh_symbol_count = 0
     for idx, word in enumerate(vocab):
-        if has_chinese(decode_str):
             zh_token_count += 1
             f_out.write("%d\t%s\t中文汉字\n" % (idx, decode_str))
         elif has_zh_char(decode_str):

 from collections import Counter
 from transformers import AutoTokenizer
 from data_sample.oov_base import jd_vocab_tokens
+from utils.text_util import is_zh_char, has_zh
 from zhon.hanzi import punctuation as zh_punc
     zh_symbol_count = 0
     for idx, word in enumerate(vocab):
+        if has_zh(decode_str):
             zh_token_count += 1
             f_out.write("%d\t%s\t中文汉字\n" % (idx, decode_str))
         elif has_zh_char(decode_str):

vocab/bloom/test_zh_coding_len.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from collections import Counter
 from transformers import AutoTokenizer, BloomTokenizerFast
 from data_sample.oov_base import jd_vocab_tokens
-from utils.text_util import is_chinese
 from zhon.hanzi import punctuation as zh_punc
 # tokenizer = AutoTokenizer.from_pretrained("tokenizer")

 from collections import Counter
 from transformers import AutoTokenizer, BloomTokenizerFast
 from data_sample.oov_base import jd_vocab_tokens
+from utils.text_util import is_zh_char
 from zhon.hanzi import punctuation as zh_punc
 # tokenizer = AutoTokenizer.from_pretrained("tokenizer")

vocab/bloomz_6b4_zh/__init__.py CHANGED Viewed

@@ -7,5 +7,3 @@ TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
-# vocab_size = len(tokenizer.get_vocab())
-# vocab_size = tokenizer.vocab_size


7
8	tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
9

vocab/glm/test_tokenizer.py CHANGED Viewed

@@ -3,7 +3,7 @@
 默认采用：GLMGPT2Tokenizer
 """
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
 tokens_id = [3856, 11030]

 默认采用：GLMGPT2Tokenizer
 """
+from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
 tokens_id = [3856, 11030]

vocab/glm_chinese/__init__.py CHANGED Viewed

@@ -26,5 +26,26 @@ tokenizer.vocab_size = tokenizer.num_tokens
 # vocab_size = len(tokenizer.get_vocab())
 # vocab_size = tokenizer.vocab_size

def get_vocab(self, token_type="str"):
    """Return the vocabulary as a dict mapping token -> id.

    Ids whose token is ``None`` are left out. An id whose lookup raises is reported
    (with the id and the error) and skipped, so one bad entry does not abort the scan.

    :param token_type: accepted but not used by this implementation.
    :return: dict built from ``convert_ids_to_tokens`` over ``range(self.vocab_size)``.
    """
    vocab = {}
    for i in range(self.vocab_size):
        try:
            token_byte = self.convert_ids_to_tokens([i])[0]
            if token_byte is None:
                continue
            # token_str = token_byte.decode("utf-8")
            vocab[token_byte] = i
        except Exception as e:  # e.g. id 773 raised UnicodeDecodeError
            print(f"exception: skip token id {i}: {e!r}")
    return vocab
+ChineseSPTokenizer.get_vocab = get_vocab
 # vocab_size = len(tokenizer.get_vocab())
 # vocab_size = tokenizer.vocab_size

vocab/glm_chinese/test.py CHANGED Viewed

@@ -1,4 +1,7 @@
-from glm_chinese import tokenizer
-print(tokenizer.decode([20]))

+from vocab.glm_chinese import tokenizer
+print(tokenizer.decode([20]))
+vocab = tokenizer.get_vocab()
+print(vocab)

vocab/gpt2/README.md CHANGED Viewed

@@ -40,42 +40,21 @@ byte-level BPE
 - [vocab.json](https://huggingface.co/gpt2-large/resolve/main/vocab.json): 50257个kv-pair. https://huggingface.co/gpt2/resolve/main/vocab.json
 - [merges.txt](https://huggingface.co/gpt2-large/resolve/main/merges.txt): 50001行，https://huggingface.co/gpt2/resolve/main/merges.txt
   - merges.txts是否包含所有的组合？https://github.com/huggingface/transformers/issues/4777
-### fairseq = 官方
-- vocab.bpe：50001行
-- encoder.json: 50257个kv-pair
-- dict.txt: 50260行  是纯数字的，是由fairseq-preprocess生成的  https://github.com/pytorch/fairseq/issues/1186
-- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
-- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
-- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
-# 相关疑问
-### Ġ是什么
-It's a feature of byte-level BPE(an encoded space character).
-Ġ 表示空格，有的版本用Ä代替Ġ。
-```
-What's up with the tokenizer?
-# BPE后
-['What', "'s", 'Ġup', 'Ġwith', 'Ġthe', 'Ġtoken', 'izer', '?']
-# 经过vocab.json编码后
-[ 2061,   338,  510,    351,    262,    11241,    7509,   30]
-# 经过dict.txt编码后（fairseq特有）
-[           其他数字                                         ]
-```
-疑问：up会加Ġ，为什么what不加Ġ
-- https://github.com/pytorch/fairseq/issues/1716
-- https://github.com/huggingface/transformers/issues/1083

 - [vocab.json](https://huggingface.co/gpt2-large/resolve/main/vocab.json): 50257个kv-pair. https://huggingface.co/gpt2/resolve/main/vocab.json
 - [merges.txt](https://huggingface.co/gpt2-large/resolve/main/merges.txt): 50001行，https://huggingface.co/gpt2/resolve/main/merges.txt
   - merges.txts是否包含所有的组合？https://github.com/huggingface/transformers/issues/4777
+- [tokenizer.json](https://huggingface.co/openai-community/gpt2-large/blob/main/tokenizer.json)
+  - 这个是给
+词典加载 https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py
+### fairseq = 官方
+- [vocab.bpe](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe)：50001行
+  - 等于 hf的 `merges.txt`
+- [encoder.json](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json): 50257个kv-pair
+  - 等于 hf的 `vocab.json`
+- [dict.txt](https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt): 50260行  这是词频，是由fairseq-preprocess生成的  https://github.com/pytorch/fairseq/issues/1186
+词典加载 https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/tokenization_gpt2.py

vocab/gpt_35_turbo/__init__.py CHANGED Viewed

@@ -6,7 +6,6 @@ import tiktoken
 import tokenizer.tiktoken_patch
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
-tokenizer.vocab_size = tokenizer.n_vocab
 tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
 tokenizer.reversible = True  # It's reversible and lossless, so you can convert tokens back into the original text

 import tokenizer.tiktoken_patch
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 tokenizer.comments = "tiktoken is a fast BPE tokeniser for use with OpenAI's models. There are 16 tokens KeyError"
 tokenizer.reversible = True  # It's reversible and lossless, so you can convert tokens back into the original text

vocab/gpt_35_turbo/decode_test.py CHANGED Viewed

@@ -9,5 +9,12 @@ encoding = tokenizer.encode(text)
 print(tokenizer.decode([6744]))
 print(tokenizer.convert_ids_to_tokens([6744]))
-print(tokenizer.decode([100256]))
-print(tokenizer.convert_ids_to_tokens([100256]))

 print(tokenizer.decode([6744]))
 print(tokenizer.convert_ids_to_tokens([6744]))
+print(tokenizer.decode([100256]))   # 是没有这个token吗？
+print(tokenizer.convert_ids_to_tokens([100256]))
+print(tokenizer.decode([100262]))
+print(tokenizer.convert_ids_to_tokens([100262]))
+print(tokenizer.decode([100273]))
+print(tokenizer.convert_ids_to_tokens([100273]))

vocab/gpt_35_turbo/test_tiktoken.py CHANGED Viewed

@@ -9,15 +9,18 @@ https://github.com/openai/tiktoken
 import json
 import tiktoken
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 text = "你好，请告诉我聚乙烯是什么"
 # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
-encoding = tokenizer.encode(text)
 decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
 print(encoding)
 print(decoding_bytes)
 # for token in tokens:
 #     token_str = encoding.decode([token])

 import json
 import tiktoken
+# from tokenizer import tiktoken_patch
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
 text = "你好，请告诉我聚乙烯是什么"
 # text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
+text = "'<|endoftext|>"
+encoding = tokenizer.encode(text, allowed_special="all")
 decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
 print(encoding)
 print(decoding_bytes)
+# 100256
 # for token in tokens:
 #     token_str = encoding.decode([token])

vocab/gpt_35_turbo/vocab.jsonl CHANGED Viewed

@@ -99964,3 +99964,314 @@
 {"id": 99963, "token": "\" Geg\""}
 {"id": 99964, "token": "\"\\tdto\""}
 {"id": 99965, "token": "\".defaultValue\""}

 {"id": 99963, "token": "\" Geg\""}
 {"id": 99964, "token": "\"\\tdto\""}
 {"id": 99965, "token": "\".defaultValue\""}
+{"id": 99966, "token": "\" Kami\""}
+{"id": 99967, "token": "\" ASE\""}
+{"id": 99968, "token": "\"optimized\""}
+{"id": 99969, "token": "\" \\ud3ec\""}
+{"id": 99970, "token": "\" originates\""}
+{"id": 99971, "token": "\"errMsg\""}
+{"id": 99972, "token": "\" espa\\u00e7o\""}
+{"id": 99973, "token": "\"(SYS\""}
+{"id": 99974, "token": "\" McB\""}
+{"id": 99975, "token": "\"dance\""}
+{"id": 99976, "token": "\"_detected\""}
+{"id": 99977, "token": "\" fr\\u00fc\""}
+{"id": 99978, "token": "\"\\t\\t    \\t\\t\""}
+{"id": 99979, "token": "\"<Date\""}
+{"id": 99980, "token": "\"(comb\""}
+{"id": 99981, "token": "\" Decide\""}
+{"id": 99982, "token": "\"\\\\Field\""}
+{"id": 99983, "token": "\" Proposed\""}
+{"id": 99984, "token": "\"Rib\""}
+{"id": 99985, "token": "\" dislikes\""}
+{"id": 99986, "token": "\" Wien\""}
+{"id": 99987, "token": "\"\\tDocument\""}
+{"id": 99988, "token": "\" traf\""}
+{"id": 99989, "token": "\" storia\""}
+{"id": 99990, "token": "\" Tells\""}
+{"id": 99991, "token": "\"')==\""}
+{"id": 99992, "token": "\"Cri\""}
+{"id": 99993, "token": "\"(VALUE\""}
+{"id": 99994, "token": "\" Burnett\""}
+{"id": 99995, "token": "\",void\""}
+{"id": 99996, "token": "\" danh\""}
+{"id": 99997, "token": "\" ccp\""}
+{"id": 99998, "token": "\"Blockchain\""}
+{"id": 99999, "token": "\":\\\"-\\\"`\\n\""}
+{"id": 100000, "token": "\"IClient\""}
+{"id": 100001, "token": "\"ISODE\""}
+{"id": 100002, "token": "\"Issuer\""}
+{"id": 100003, "token": "\")}\\r\\n\""}
+{"id": 100004, "token": "\",but\""}
+{"id": 100005, "token": "\" Uph\""}
+{"id": 100006, "token": "\"(Sub\""}
+{"id": 100007, "token": "\" t\\u00e9l\\u00e9phone\""}
+{"id": 100008, "token": "\" onDataChange\""}
+{"id": 100009, "token": "\" marshaller\""}
+{"id": 100010, "token": "\"-analytics\""}
+{"id": 100011, "token": "\",content\""}
+{"id": 100012, "token": "\" debacle\""}
+{"id": 100013, "token": "\"_ValueChanged\""}
+{"id": 100014, "token": "\" fauna\""}
+{"id": 100015, "token": "\" #=>\""}
+{"id": 100016, "token": "\" foyer\""}
+{"id": 100017, "token": "\"'utilisation\""}
+{"id": 100018, "token": "\" M\\u00fcller\""}
+{"id": 100019, "token": "\" Fetish\""}
+{"id": 100020, "token": "\" defaultManager\""}
+{"id": 100021, "token": "\" backtrack\""}
+{"id": 100022, "token": "\"Bah\""}
+{"id": 100023, "token": "\"Explicit\""}
+{"id": 100024, "token": "\"_ASCII\""}
+{"id": 100025, "token": "\" mActivity\""}
+{"id": 100026, "token": "\"(Msg\""}
+{"id": 100027, "token": "\" \\uac8c\""}
+{"id": 100028, "token": "\" TERMS\""}
+{"id": 100029, "token": "\" Angie\""}
+{"id": 100030, "token": "\"HSV\""}
+{"id": 100031, "token": "\" Mosque\""}
+{"id": 100032, "token": "\".Names\""}
+{"id": 100033, "token": "\"\\ud2bc\""}
+{"id": 100034, "token": "\"reste\""}
+{"id": 100035, "token": "\"_parms\""}
+{"id": 100036, "token": "\" gaping\""}
+{"id": 100037, "token": "\" cropping\""}
+{"id": 100038, "token": "\"DataFrame\""}
+{"id": 100039, "token": "\" responsiveness\""}
+{"id": 100040, "token": "\"_undo\""}
+{"id": 100041, "token": "\"_tran\""}
+{"id": 100042, "token": "\".terminate\""}
+{"id": 100043, "token": "\" italiane\""}
+{"id": 100044, "token": "\" walkthrough\""}
+{"id": 100045, "token": "\" attractiveness\""}
+{"id": 100046, "token": "\"\\u0434\\u0435\""}
+{"id": 100047, "token": "\"_STS\""}
+{"id": 100048, "token": "\"_learn\""}
+{"id": 100049, "token": "\" chocolates\""}
+{"id": 100050, "token": "\"ierarchical\""}
+{"id": 100051, "token": "\"-thinking\""}
+{"id": 100052, "token": "\" )))\""}
+{"id": 100053, "token": "\"ishments\""}
+{"id": 100054, "token": "\".Logf\""}
+{"id": 100055, "token": "\" TMZ\""}
+{"id": 100056, "token": "\" Canary\""}
+{"id": 100057, "token": "\"foil\""}
+{"id": 100058, "token": "\" Vaccine\""}
+{"id": 100059, "token": "\".vx\""}
+{"id": 100060, "token": "\" Surround\""}
+{"id": 100061, "token": "\"Intermediate\""}
+{"id": 100062, "token": "\" iov\""}
+{"id": 100063, "token": "\"vais\""}
+{"id": 100064, "token": "\"';\\\";\\n\""}
+{"id": 100065, "token": "\"\\uff5e\\n\\n\""}
+{"id": 100066, "token": "\"\\u9001\\u6599\""}
+{"id": 100067, "token": "\"\\u2026it\""}
+{"id": 100068, "token": "\"Seats\""}
+{"id": 100069, "token": "\"Clar\""}
+{"id": 100070, "token": "\"Wars\""}
+{"id": 100071, "token": "\" Hutchinson\""}
+{"id": 100072, "token": "\" Hasan\""}
+{"id": 100073, "token": "\"!')\\n\\n\""}
+{"id": 100074, "token": "\" Richie\""}
+{"id": 100075, "token": "\"cheiden\""}
+{"id": 100076, "token": "\"($('\""}
+{"id": 100077, "token": "\"York\""}
+{"id": 100078, "token": "\" lids\""}
+{"id": 100079, "token": "\" alphanumeric\""}
+{"id": 100080, "token": "\" Glock\""}
+{"id": 100081, "token": "\".shapes\""}
+{"id": 100082, "token": "\" sparking\""}
+{"id": 100083, "token": "\"_epsilon\""}
+{"id": 100084, "token": "\"uplicated\""}
+{"id": 100085, "token": "\".dirty\""}
+{"id": 100086, "token": "\"])==\""}
+{"id": 100087, "token": "\" \\uc704\\uce58\""}
+{"id": 100088, "token": "\" scn\""}
+{"id": 100089, "token": "\" /****************************************************************\""}
+{"id": 100090, "token": "\"_PREVIEW\""}
+{"id": 100091, "token": "\"_HC\""}
+{"id": 100092, "token": "\"ielding\""}
+{"id": 100093, "token": "\"fgets\""}
+{"id": 100094, "token": "\" Addison\""}
+{"id": 100095, "token": "\" productService\""}
+{"id": 100096, "token": "\"-figure\""}
+{"id": 100097, "token": "\"(retval\""}
+{"id": 100098, "token": "\"zano\""}
+{"id": 100099, "token": "\" autob\""}
+{"id": 100100, "token": "\"\\tsd\""}
+{"id": 100101, "token": "\"_numer\""}
+{"id": 100102, "token": "\" SetLastError\""}
+{"id": 100103, "token": "\" Fior\""}
+{"id": 100104, "token": "\"ificance\""}
+{"id": 100105, "token": "\"Untitled\""}
+{"id": 100106, "token": "\" infield\""}
+{"id": 100107, "token": "\" {}));\\n\""}
+{"id": 100108, "token": "\" spac\""}
+{"id": 100109, "token": "\" rookies\""}
+{"id": 100110, "token": "\"(describing\""}
+{"id": 100111, "token": "\"ngen\""}
+{"id": 100112, "token": "\"\\u0bbf\\ufffd\""}
+{"id": 100113, "token": "\".rdf\""}
+{"id": 100114, "token": "\".Mutex\""}
+{"id": 100115, "token": "\" kneeling\""}
+{"id": 100116, "token": "\" QE\""}
+{"id": 100117, "token": "\"setMax\""}
+{"id": 100118, "token": "\"ReadStream\""}
+{"id": 100119, "token": "\" ventas\""}
+{"id": 100120, "token": "\"sut\""}
+{"id": 100121, "token": "\"cmpeq\""}
+{"id": 100122, "token": "\".WriteAllText\""}
+{"id": 100123, "token": "\" Experienced\""}
+{"id": 100124, "token": "\"$__\""}
+{"id": 100125, "token": "\" kaum\""}
+{"id": 100126, "token": "\" LIS\""}
+{"id": 100127, "token": "\" documentos\""}
+{"id": 100128, "token": "\"_HEALTH\""}
+{"id": 100129, "token": "\"icontains\""}
+{"id": 100130, "token": "\" artisans\""}
+{"id": 100131, "token": "\"OWNER\""}
+{"id": 100132, "token": "\" blinked\""}
+{"id": 100133, "token": "\"getDisplay\""}
+{"id": 100134, "token": "\" toen\""}
+{"id": 100135, "token": "\" rowNum\""}
+{"id": 100136, "token": "\" avril\""}
+{"id": 100137, "token": "\" invis\""}
+{"id": 100138, "token": "\" Kear\""}
+{"id": 100139, "token": "\"toBeInTheDocument\""}
+{"id": 100140, "token": "\"apur\""}
+{"id": 100141, "token": "\" racked\""}
+{"id": 100142, "token": "\" McMaster\""}
+{"id": 100143, "token": "\"_ATTRIB\""}
+{"id": 100144, "token": "\"Haz\""}
+{"id": 100145, "token": "\" factura\""}
+{"id": 100146, "token": "\"/ts\""}
+{"id": 100147, "token": "\" \\u0440\\u0430\\u0437\\u043c\\u0435\\u0440\""}
+{"id": 100148, "token": "\" zf\""}
+{"id": 100149, "token": "\" shortfall\""}
+{"id": 100150, "token": "\".fasta\""}
+{"id": 100151, "token": "\" CONSTANT\""}
+{"id": 100152, "token": "\".managed\""}
+{"id": 100153, "token": "\"gems\""}
+{"id": 100154, "token": "\"SharedPointer\""}
+{"id": 100155, "token": "\" blurry\""}
+{"id": 100156, "token": "\"brightness\""}
+{"id": 100157, "token": "\"(components\""}
+{"id": 100158, "token": "\" ...\\\"\\n\\n\""}
+{"id": 100159, "token": "\"SELL\""}
+{"id": 100160, "token": "\" Illustrator\""}
+{"id": 100161, "token": "\".getChannel\""}
+{"id": 100162, "token": "\" trouv\\u00e9\""}
+{"id": 100163, "token": "\"ysters\""}
+{"id": 100164, "token": "\" vois\""}
+{"id": 100165, "token": "\" Linden\""}
+{"id": 100166, "token": "\" emojis\""}
+{"id": 100167, "token": "\" brawl\""}
+{"id": 100168, "token": "\" MSR\""}
+{"id": 100169, "token": "\" Elo\""}
+{"id": 100170, "token": "\" Croatian\""}
+{"id": 100171, "token": "\"PopupMenu\""}
+{"id": 100172, "token": "\"Lewis\""}
+{"id": 100173, "token": "\".JWT\""}
+{"id": 100174, "token": "\" astonished\""}
+{"id": 100175, "token": "\"Bush\""}
+{"id": 100176, "token": "\"(itemId\""}
+{"id": 100177, "token": "\" detachment\""}
+{"id": 100178, "token": "\" Encore\""}
+{"id": 100179, "token": "\"\\u5c14\""}
+{"id": 100180, "token": "\" rekl\""}
+{"id": 100181, "token": "\" cram\""}
+{"id": 100182, "token": "\")$/\""}
+{"id": 100183, "token": "\".getHost\""}
+{"id": 100184, "token": "\"_recommend\""}
+{"id": 100185, "token": "\"-HT\""}
+{"id": 100186, "token": "\"_calibration\""}
+{"id": 100187, "token": "\"Authenticate\""}
+{"id": 100188, "token": "\".firebaseapp\""}
+{"id": 100189, "token": "\"UNIX\""}
+{"id": 100190, "token": "\"\\tCamera\""}
+{"id": 100191, "token": "\" HEAP\""}
+{"id": 100192, "token": "\"Ideal\""}
+{"id": 100193, "token": "\".office\""}
+{"id": 100194, "token": "\" goofy\""}
+{"id": 100195, "token": "\"(Symbol\""}
+{"id": 100196, "token": "\" jouer\""}
+{"id": 100197, "token": "\"_partitions\""}
+{"id": 100198, "token": "\" rapidement\""}
+{"id": 100199, "token": "\" GNUNET\""}
+{"id": 100200, "token": "\"idUser\""}
+{"id": 100201, "token": "\" supervise\""}
+{"id": 100202, "token": "\"(Contact\""}
+{"id": 100203, "token": "\"AWN\""}
+{"id": 100204, "token": "\"\\u3058\""}
+{"id": 100205, "token": "\" naam\""}
+{"id": 100206, "token": "\" aust\""}
+{"id": 100207, "token": "\"\\u5728\\u7ebf\""}
+{"id": 100208, "token": "\"_softmax\""}
+{"id": 100209, "token": "\"AllowAnonymous\""}
+{"id": 100210, "token": "\"ammable\""}
+{"id": 100211, "token": "\"ROUTE\""}
+{"id": 100212, "token": "\"*D\""}
+{"id": 100213, "token": "\" aden\""}
+{"id": 100214, "token": "\" Cristina\""}
+{"id": 100215, "token": "\" Cristiano\""}
+{"id": 100216, "token": "\" bloodstream\""}
+{"id": 100217, "token": "\"subclass\""}
+{"id": 100218, "token": "\"_persona\""}
+{"id": 100219, "token": "\"CHILD\""}
+{"id": 100220, "token": "\"-know\""}
+{"id": 100221, "token": "\" navigationOptions\""}
+{"id": 100222, "token": "\" Zukunft\""}
+{"id": 100223, "token": "\" Pixar\""}
+{"id": 100224, "token": "\"Tyler\""}
+{"id": 100225, "token": "\" underworld\""}
+{"id": 100226, "token": "\" sincerity\""}
+{"id": 100227, "token": "\" dispenser\""}
+{"id": 100228, "token": "\" kter\""}
+{"id": 100229, "token": "\"idders\""}
+{"id": 100230, "token": "\".addNode\""}
+{"id": 100231, "token": "\"-checked\""}
+{"id": 100232, "token": "\" keyst\""}
+{"id": 100233, "token": "\" WTO\""}
+{"id": 100234, "token": "\".signals\""}
+{"id": 100235, "token": "\" adventurer\""}
+{"id": 100236, "token": "\" Pang\""}
+{"id": 100237, "token": "\"\\\\R\""}
+{"id": 100238, "token": "\"=pos\""}
+{"id": 100239, "token": "\" dispensaries\""}
+{"id": 100240, "token": "\" Closet\""}
+{"id": 100241, "token": "\"(\\\"{\\\\\\\"\""}
+{"id": 100242, "token": "\"ideon\""}
+{"id": 100243, "token": "\" n\\u00e9cessaire\""}
+{"id": 100244, "token": "\"()\\\"\\n\""}
+{"id": 100245, "token": "\"_RECEIVED\""}
+{"id": 100246, "token": "\" r\\u00e9sultats\""}
+{"id": 100247, "token": "\" moden\""}
+{"id": 100248, "token": "\" Icelandic\""}
+{"id": 100249, "token": "\";d\""}
+{"id": 100250, "token": "\".allowed\""}
+{"id": 100251, "token": "\"(newUser\""}
+{"id": 100252, "token": "\" merciless\""}
+{"id": 100253, "token": "\".WaitFor\""}
+{"id": 100254, "token": "\" daycare\""}
+{"id": 100255, "token": "\" Conveyor\""}
+{"id": 100256, "token": "\"null\""}
+{"id": 100257, "token": "\"<|endoftext|>\""}
+{"id": 100258, "token": "\"<|fim_prefix|>\""}
+{"id": 100259, "token": "\"<|fim_middle|>\""}
+{"id": 100260, "token": "\"<|fim_suffix|>\""}
+{"id": 100261, "token": "\"null\""}
+{"id": 100262, "token": "\"null\""}
+{"id": 100263, "token": "\"null\""}
+{"id": 100264, "token": "\"null\""}
+{"id": 100265, "token": "\"null\""}
+{"id": 100266, "token": "\"null\""}
+{"id": 100267, "token": "\"null\""}
+{"id": 100268, "token": "\"null\""}
+{"id": 100269, "token": "\"null\""}
+{"id": 100270, "token": "\"null\""}
+{"id": 100271, "token": "\"null\""}
+{"id": 100272, "token": "\"null\""}
+{"id": 100273, "token": "\"null\""}
+{"id": 100274, "token": "\"null\""}
+{"id": 100275, "token": "\"null\""}
+{"id": 100276, "token": "\"<|endofprompt|>\""}

vocab/gpt_nexo_20b/README.md CHANGED Viewed

@@ -18,11 +18,13 @@ self.padded_vocab_size = 50304
 padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
 ## 词典
 见 convert_vocab_to_txt.py
-```
 {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"}	中
 # 多个符号拼接在一起的
@@ -30,8 +32,16 @@ padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
 # ss
 ```
 ## special_tokens
 https://huggingface.co/EleutherAI/gpt-neox-20b/blob/main/special_tokens_map.json
@@ -83,4 +93,7 @@ gpt-neox是在800G英文数据集上训练的，为啥词典支持中文？因
       "ard less",

 padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
 ## 词典
 见 convert_vocab_to_txt.py
+```sh
 {"id": 13609, "token": "\u00e4\u00b8\u0143", "token_decode": "\u4e2d"}	中
 # 多个符号拼接在一起的
 # ss
+# 基本字节
+(\u0021-\u007E) + (\u00A1-\u0143)
 ```
 ## special_tokens
 https://huggingface.co/EleutherAI/gpt-neox-20b/blob/main/special_tokens_map.json
       "ard less",
+## hf格式
+https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main

vocab/gpt_nexo_20b/test_tokenizer.py CHANGED Viewed

@@ -12,17 +12,60 @@ print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_to
 vocab = tokenizer.get_vocab()
 def test_single_token():
     """
     单个字符的编码（一个字符可能会编码成多个id）
     """
-    for word in "发大厦三分赛中国解决方法黑白侗鸩，。！？；":
         encoding = tokenizer.encode(word)
         for token_id in encoding.ids:
             decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �，对应 "\ufffd"
             token = tokenizer.id_to_token(token_id)
-            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
 def test_long_token():
@@ -53,6 +96,7 @@ def test_encode():
         print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
-test_single_token()
 # test_long_token()
 # test_encode()

 vocab = tokenizer.get_vocab()
+def to_unicode(text):
+    return ''.join(r'\u{:04X}'.format(ord(chr)) for chr in text)
+def is_UTF_8(str):
+    remain = 0  # 剩余byte数
+    for x in range(len(str)):
+        if remain == 0:
+            if (ord(str[x]) & 0x80) == 0x00:
+                remain = 0
+            elif (ord(str[x]) & 0xE0) == 0xC0:
+                remain = 1
+            elif (ord(str[x]) & 0xF0) == 0xE0:
+                remain = 2
+            elif (ord(str[x]) & 0xF8) == 0xF0:
+                remain = 3
+            else:
+                return False
+        else:
+            if not ((ord(str[x]) & 0xC0) == 0x80):
+                return False
+            remain = remain - 1
+    if remain == 0:  # 最后如果remain不等于零，可能没有匹配完整
+        return True
+    else:
+        return False
+def test_reverse():
+    f_out = open("reverse.jsonl", "w", encoding="utf-8")
+    for token_id in range(tokenizer.get_vocab_size(with_added_tokens=False)):
+        token = tokenizer.id_to_token(token_id)
+        print(token_id, is_UTF_8(token))
+        if "Ġ" in token:
+            continue
+        encoding = tokenizer.encode(token)
+        if len(encoding.ids) > 1 or encoding.ids[0] != token_id:
+            f_out.write(json.dumps({"id": token_id, "token": token, "encoding": encoding.ids, "is_utf8": is_UTF_8(token), "isalpha": token.isalpha()}) + "\n")
 def test_single_token():
     """
     单个字符的编码（一个字符可能会编码成多个id）
     """
+    for word in "发大厦三分赛中国解决方法黑白侗鸩，。！？；ĠABC":
         encoding = tokenizer.encode(word)
         for token_id in encoding.ids:
             decode_str = tokenizer.decode([token_id])  # 特殊字符解码后会统一变成 �，对应 "\ufffd"
             token = tokenizer.id_to_token(token_id)
+            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token), token.encode("utf-8"), bytes(token, "utf-8"), to_unicode(token))
 def test_long_token():
         print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
+test_reverse()
+# test_single_token()
 # test_long_token()
 # test_encode()

vocab/gpt_nexo_20b/tokenzier_hf/README.md DELETED Viewed

@@ -1,6 +0,0 @@
-## hf格式
-https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main

vocab/jamba_v0_1/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""
+Jamba-v0.1
+"""
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

vocab/kplug/__init__.py CHANGED Viewed

@@ -2,4 +2,4 @@
 from transformers import BertTokenizer
 tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
-print(tokenizer)


2	from transformers import BertTokenizer
3
4	tokenizer = BertTokenizer.from_pretrained("eson/kplug-base-encoder")
5	+

vocab/llama/gpt_neox/get_oov_zh_tokens.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from utils.zh_util import is_chinese
 from transformers import LlamaTokenizer
 llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
@@ -14,7 +14,7 @@ for token, token_id in vocab.items():
     # token = token.strip("Ġ")
     if len(token) < 1:
         continue
-    if is_chinese(token[0]):
         if token not in llama_vocab:
             f_out.write(token + "\n")

+from utils.zh_util import is_zh_char
 from transformers import LlamaTokenizer
 llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
     # token = token.strip("Ġ")
     if len(token) < 1:
         continue
+    if is_zh_char(token[0]):
         if token not in llama_vocab:
             f_out.write(token + "\n")

vocab/llama3/Meta-Llama-3-70B/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "bos_token": "<|begin_of_text|>",
+  "eos_token": "<|end_of_text|>"
+}

vocab/llama3/Meta-Llama-3-70B/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ac333c83e2d107910928928b5912d8ade91594d08c7c73c4606d05c032d7632
+size 9084463

vocab/llama3/Meta-Llama-3-70B/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,2062 @@

+{
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<|reserved_special_token_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "<|reserved_special_token_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128011": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128012": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128013": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128014": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128015": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128016": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128017": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128018": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128019": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128020": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128021": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128022": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128023": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128024": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128025": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128026": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128027": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128028": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128029": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128030": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128031": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128032": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128033": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128034": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128035": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128036": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128037": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128038": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128039": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128040": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128041": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128042": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128043": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128044": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128045": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128046": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128047": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128048": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128049": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128050": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128051": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128052": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128053": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128054": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128055": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128056": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128057": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128058": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128059": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128060": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128061": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128062": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128063": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128064": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128065": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128066": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128067": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128068": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128069": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128070": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128071": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128072": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128073": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128074": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128075": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128076": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128077": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128078": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128079": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128080": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128081": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128082": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128083": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128084": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128085": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128086": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128087": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128088": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128089": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128090": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128091": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128092": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128093": {
+      "content": "<|reserved_special_token_88|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128094": {
+      "content": "<|reserved_special_token_89|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128095": {
+      "content": "<|reserved_special_token_90|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128096": {
+      "content": "<|reserved_special_token_91|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128097": {
+      "content": "<|reserved_special_token_92|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128098": {
+      "content": "<|reserved_special_token_93|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128099": {
+      "content": "<|reserved_special_token_94|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128100": {
+      "content": "<|reserved_special_token_95|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128101": {
+      "content": "<|reserved_special_token_96|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128102": {
+      "content": "<|reserved_special_token_97|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128103": {
+      "content": "<|reserved_special_token_98|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128104": {
+      "content": "<|reserved_special_token_99|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128105": {
+      "content": "<|reserved_special_token_100|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128106": {
+      "content": "<|reserved_special_token_101|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128107": {
+      "content": "<|reserved_special_token_102|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128108": {
+      "content": "<|reserved_special_token_103|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128109": {
+      "content": "<|reserved_special_token_104|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128110": {
+      "content": "<|reserved_special_token_105|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128111": {
+      "content": "<|reserved_special_token_106|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128112": {
+      "content": "<|reserved_special_token_107|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128113": {
+      "content": "<|reserved_special_token_108|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128114": {
+      "content": "<|reserved_special_token_109|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128115": {
+      "content": "<|reserved_special_token_110|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128116": {
+      "content": "<|reserved_special_token_111|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128117": {
+      "content": "<|reserved_special_token_112|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128118": {
+      "content": "<|reserved_special_token_113|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128119": {
+      "content": "<|reserved_special_token_114|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128120": {
+      "content": "<|reserved_special_token_115|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128121": {
+      "content": "<|reserved_special_token_116|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128122": {
+      "content": "<|reserved_special_token_117|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128123": {
+      "content": "<|reserved_special_token_118|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128124": {
+      "content": "<|reserved_special_token_119|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128125": {
+      "content": "<|reserved_special_token_120|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128126": {
+      "content": "<|reserved_special_token_121|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128127": {
+      "content": "<|reserved_special_token_122|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128128": {
+      "content": "<|reserved_special_token_123|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128129": {
+      "content": "<|reserved_special_token_124|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128130": {
+      "content": "<|reserved_special_token_125|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128131": {
+      "content": "<|reserved_special_token_126|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128132": {
+      "content": "<|reserved_special_token_127|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128133": {
+      "content": "<|reserved_special_token_128|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128134": {
+      "content": "<|reserved_special_token_129|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128135": {
+      "content": "<|reserved_special_token_130|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128136": {
+      "content": "<|reserved_special_token_131|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128137": {
+      "content": "<|reserved_special_token_132|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128138": {
+      "content": "<|reserved_special_token_133|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128139": {
+      "content": "<|reserved_special_token_134|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128140": {
+      "content": "<|reserved_special_token_135|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128141": {
+      "content": "<|reserved_special_token_136|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128142": {
+      "content": "<|reserved_special_token_137|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128143": {
+      "content": "<|reserved_special_token_138|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128144": {
+      "content": "<|reserved_special_token_139|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128145": {
+      "content": "<|reserved_special_token_140|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128146": {
+      "content": "<|reserved_special_token_141|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128147": {
+      "content": "<|reserved_special_token_142|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128148": {
+      "content": "<|reserved_special_token_143|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128149": {
+      "content": "<|reserved_special_token_144|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128150": {
+      "content": "<|reserved_special_token_145|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128151": {
+      "content": "<|reserved_special_token_146|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128152": {
+      "content": "<|reserved_special_token_147|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128153": {
+      "content": "<|reserved_special_token_148|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128154": {
+      "content": "<|reserved_special_token_149|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128155": {
+      "content": "<|reserved_special_token_150|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128156": {
+      "content": "<|reserved_special_token_151|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128157": {
+      "content": "<|reserved_special_token_152|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128158": {
+      "content": "<|reserved_special_token_153|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128159": {
+      "content": "<|reserved_special_token_154|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128160": {
+      "content": "<|reserved_special_token_155|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128161": {
+      "content": "<|reserved_special_token_156|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128162": {
+      "content": "<|reserved_special_token_157|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128163": {
+      "content": "<|reserved_special_token_158|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128164": {
+      "content": "<|reserved_special_token_159|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128165": {
+      "content": "<|reserved_special_token_160|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128166": {
+      "content": "<|reserved_special_token_161|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128167": {
+      "content": "<|reserved_special_token_162|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128168": {
+      "content": "<|reserved_special_token_163|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128169": {
+      "content": "<|reserved_special_token_164|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128170": {
+      "content": "<|reserved_special_token_165|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128171": {
+      "content": "<|reserved_special_token_166|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128172": {
+      "content": "<|reserved_special_token_167|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128173": {
+      "content": "<|reserved_special_token_168|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128174": {
+      "content": "<|reserved_special_token_169|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128175": {
+      "content": "<|reserved_special_token_170|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128176": {
+      "content": "<|reserved_special_token_171|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128177": {
+      "content": "<|reserved_special_token_172|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128178": {
+      "content": "<|reserved_special_token_173|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128179": {
+      "content": "<|reserved_special_token_174|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128180": {
+      "content": "<|reserved_special_token_175|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128181": {
+      "content": "<|reserved_special_token_176|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128182": {
+      "content": "<|reserved_special_token_177|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128183": {
+      "content": "<|reserved_special_token_178|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128184": {
+      "content": "<|reserved_special_token_179|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128185": {
+      "content": "<|reserved_special_token_180|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128186": {
+      "content": "<|reserved_special_token_181|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128187": {
+      "content": "<|reserved_special_token_182|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128188": {
+      "content": "<|reserved_special_token_183|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128189": {
+      "content": "<|reserved_special_token_184|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128190": {
+      "content": "<|reserved_special_token_185|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128191": {
+      "content": "<|reserved_special_token_186|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128192": {
+      "content": "<|reserved_special_token_187|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128193": {
+      "content": "<|reserved_special_token_188|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128194": {
+      "content": "<|reserved_special_token_189|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128195": {
+      "content": "<|reserved_special_token_190|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128196": {
+      "content": "<|reserved_special_token_191|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128197": {
+      "content": "<|reserved_special_token_192|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128198": {
+      "content": "<|reserved_special_token_193|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128199": {
+      "content": "<|reserved_special_token_194|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128200": {
+      "content": "<|reserved_special_token_195|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128201": {
+      "content": "<|reserved_special_token_196|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128202": {
+      "content": "<|reserved_special_token_197|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128203": {
+      "content": "<|reserved_special_token_198|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128204": {
+      "content": "<|reserved_special_token_199|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128205": {
+      "content": "<|reserved_special_token_200|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128206": {
+      "content": "<|reserved_special_token_201|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128207": {
+      "content": "<|reserved_special_token_202|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128208": {
+      "content": "<|reserved_special_token_203|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128209": {
+      "content": "<|reserved_special_token_204|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128210": {
+      "content": "<|reserved_special_token_205|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128211": {
+      "content": "<|reserved_special_token_206|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128212": {
+      "content": "<|reserved_special_token_207|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128213": {
+      "content": "<|reserved_special_token_208|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128214": {
+      "content": "<|reserved_special_token_209|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128215": {
+      "content": "<|reserved_special_token_210|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128216": {
+      "content": "<|reserved_special_token_211|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128217": {
+      "content": "<|reserved_special_token_212|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128218": {
+      "content": "<|reserved_special_token_213|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128219": {
+      "content": "<|reserved_special_token_214|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128220": {
+      "content": "<|reserved_special_token_215|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128221": {
+      "content": "<|reserved_special_token_216|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128222": {
+      "content": "<|reserved_special_token_217|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128223": {
+      "content": "<|reserved_special_token_218|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128224": {
+      "content": "<|reserved_special_token_219|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128225": {
+      "content": "<|reserved_special_token_220|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128226": {
+      "content": "<|reserved_special_token_221|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128227": {
+      "content": "<|reserved_special_token_222|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128228": {
+      "content": "<|reserved_special_token_223|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128229": {
+      "content": "<|reserved_special_token_224|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128230": {
+      "content": "<|reserved_special_token_225|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128231": {
+      "content": "<|reserved_special_token_226|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128232": {
+      "content": "<|reserved_special_token_227|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128233": {
+      "content": "<|reserved_special_token_228|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128234": {
+      "content": "<|reserved_special_token_229|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128235": {
+      "content": "<|reserved_special_token_230|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128236": {
+      "content": "<|reserved_special_token_231|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128237": {
+      "content": "<|reserved_special_token_232|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128238": {
+      "content": "<|reserved_special_token_233|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128239": {
+      "content": "<|reserved_special_token_234|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128240": {
+      "content": "<|reserved_special_token_235|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128241": {
+      "content": "<|reserved_special_token_236|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128242": {
+      "content": "<|reserved_special_token_237|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128243": {
+      "content": "<|reserved_special_token_238|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128244": {
+      "content": "<|reserved_special_token_239|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<|reserved_special_token_240|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128246": {
+      "content": "<|reserved_special_token_241|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128247": {
+      "content": "<|reserved_special_token_242|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128248": {
+      "content": "<|reserved_special_token_243|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128249": {
+      "content": "<|reserved_special_token_244|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128250": {
+      "content": "<|reserved_special_token_245|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128251": {
+      "content": "<|reserved_special_token_246|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128252": {
+      "content": "<|reserved_special_token_247|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128253": {
+      "content": "<|reserved_special_token_248|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128254": {
+      "content": "<|reserved_special_token_249|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128255": {
+      "content": "<|reserved_special_token_250|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}

vocab/llama3/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
+from transformers import AutoTokenizer
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Meta-Llama-3-70B")
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)

vocab/mobilenet_v2/__init__.py CHANGED Viewed

@@ -7,6 +7,10 @@
   File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
     raise KeyError(key)
 KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
 """
 from transformers import AutoTokenizer

   File "/home/user/.local/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 748, in __getitem__
     raise KeyError(key)
 KeyError: <class 'transformers.models.mobilenet_v2.configuration_mobilenet_v2.MobileNetV2Config'>
+## how to fix?
 """
 from transformers import AutoTokenizer

vocab/moss/test_zh_coding_len.py CHANGED Viewed

@@ -16,7 +16,7 @@
 from collections import Counter
 from transformers import AutoTokenizer
 from data_sample.oov_base import jd_vocab_tokens
-from utils.text_util import is_chinese, has_chinese
 from zhon.hanzi import punctuation as zh_punc
 tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
@@ -56,7 +56,7 @@ def iter_vocab():
     zh_symbol_count = 0
     for idx in range(len(vocab)):
         decode_str = tokenizer.decode([idx])
-        if has_chinese(decode_str):
             zh_token_count["total"] += 1
             if len(decode_str.strip()) > 1:
                 zh_token_count["中文多字"] += 1

 from collections import Counter
 from transformers import AutoTokenizer
 from data_sample.oov_base import jd_vocab_tokens
+from utils.text_util import is_zh_char, has_zh
 from zhon.hanzi import punctuation as zh_punc
 tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)
     zh_symbol_count = 0
     for idx in range(len(vocab)):
         decode_str = tokenizer.decode([idx])
+        if has_zh(decode_str):
             zh_token_count["total"] += 1
             if len(decode_str.strip()) > 1:
                 zh_token_count["中文多字"] += 1