fix chatglm; new feature: add_special_tokens
- app.py +1 -1
- config.py +2 -3
- examples.py +3 -3
- js/onload.js +1 -1
- util.py +5 -1
- utils/compress_rate_util.py +9 -0
- utils/speed_util.py +3 -0
- vocab/chatglm_6b/chatglm_6b/tokenization_chatglm.py +2 -1
- vocab/chatglm_6b/test_chatglm.py +2 -1
- vocab/gpt_35_turbo/README.md +8 -0
- vocab/gpt_35_turbo/__init__.py +15 -4
- vocab/gpt_35_turbo/decode_test.py +8 -1
- vocab/gpt_35_turbo/test_tiktoken.py +3 -1
app.py
CHANGED
@@ -59,7 +59,7 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
     gr.Markdown("## Input Text")
     dropdown_examples = gr.Dropdown(
         # ["空格测试", "标点测试", "符号测试", "数字测试"],
-        ["
+        ["space", "punctuation", "symbol", "number"],
         value="Examples",
         type="index",
         show_label=False,
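The change keeps `type="index"` on the dropdown, so the callback receives the position of the selected label rather than the label string, and the new English labels map to example texts by position. A minimal, self-contained sketch of that pattern (the `EXAMPLE_TEXTS` list and `fill_example` handler are illustrative, not the app's actual code):

```python
import gradio as gr

# Illustrative example texts, in the same order as the dropdown labels.
EXAMPLE_TEXTS = [
    "spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
    "punctuation: ,.:/?+=\",。!?;【】〔〕〖〗",
    "symbol: 🦙❤❥웃유♋☮",
    "number: (10086 + 98) = 100184",
]

def fill_example(index):
    # Because type="index", Gradio passes the selected position as an int.
    return EXAMPLE_TEXTS[index]

with gr.Blocks() as demo:
    dropdown_examples = gr.Dropdown(
        ["space", "punctuation", "symbol", "number"],
        type="index",
        show_label=False,
    )
    user_input = gr.Textbox(label="Input Text")
    dropdown_examples.change(fill_example, inputs=dropdown_examples, outputs=user_input)

demo.launch()
```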
config.py
CHANGED
@@ -1,3 +1,2 @@
-
-
-USE_REMOTE = False
+USE_REMOTE = False
+ADD_SPECIAL_TOKEN = False
examples.py
CHANGED
@@ -2,9 +2,9 @@ examples = {
     "en": [
         ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"],  # chatglm has blank_n,
         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-        ["
-        ["
-        ["
+        ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
+        ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
+        ["number: (10086 + 98) = 100184", "baichuan", "llama"]
     ]
     ,
     "zh": [
js/onload.js
CHANGED
@@ -3,7 +3,7 @@ function() {
     //$("footer a")["href"] = "https://github.com/xu-song/tokenizer-arena/issues"
     //$("footer a").childNodes[0].textContent ="Send Feedback"
 
-    document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback";
+    document.querySelectorAll("footer a")[0].childNodes[0].textContent ="Send Feedback"; // 🤔Reporting Issues
     document.querySelectorAll("footer a")[0].href = "https://github.com/xu-song/tokenizer-arena/issues";
 
     // download button
util.py
CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import json
 import socket
 import pandas as pd
+import config
 from vocab import load_tokener
 from utils.zh_util import iter_vocab
 from utils.log_util import logger
@@ -16,7 +17,10 @@ def tokenize(text, tokenizer_type, color_num=5):
     logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_type}, ensure_ascii=False))
     pos_tokens = []
     tokenizer = load_tokener(tokenizer_type)
-
+    if config.ADD_SPECIAL_TOKEN:
+        encoding = tokenizer.encode(text, add_special_tokens=True)
+    else:
+        encoding = tokenizer.encode(text, add_special_tokens=False)
 
     table = []
 
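For Hugging Face tokenizers, `add_special_tokens` controls whether markers such as BOS/EOS (or [CLS]/[SEP]) are added around the encoded ids, which is what the new `ADD_SPECIAL_TOKEN` switch toggles. A minimal sketch of the difference, assuming any Hugging Face tokenizer (the model name below is only an example):

```python
from transformers import AutoTokenizer

# Example model; any tokenizer that defines special tokens behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
text = "good job"

ids_with = tokenizer.encode(text, add_special_tokens=True)      # wrapped in [CLS] ... [SEP]
ids_without = tokenizer.encode(text, add_special_tokens=False)  # subword ids only

print(tokenizer.convert_ids_to_tokens(ids_with))
print(tokenizer.convert_ids_to_tokens(ids_without))
```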
utils/compress_rate_util.py
ADDED
@@ -0,0 +1,9 @@
+"""
+
+
+Chinese data
+English data:
+
+
+
+"""
utils/speed_util.py
ADDED
@@ -0,0 +1,3 @@
+"""
+Tokenization speed
+"""
vocab/chatglm_6b/chatglm_6b/tokenization_chatglm.py
CHANGED
@@ -195,6 +195,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             padding_side="left",
             **kwargs
     ) -> None:
+        self.sp_tokenizer = SPTokenizer(vocab_file)
         super().__init__(
             do_lower_case=do_lower_case,
             remove_space=remove_space,
@@ -212,7 +213,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         self.mask_token = mask_token
         self.gMASK_token = gmask_token
 
-
+
 
         """ Initialisation """
 
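The substantive fix is constructing `SPTokenizer` before `super().__init__()` runs. Recent transformers releases have `PreTrainedTokenizer.__init__` consult the subclass's `vocab_size`/`get_vocab` while registering special tokens, so `sp_tokenizer` must already exist at that point; this reasoning is inferred from the diff rather than stated in the commit. A toy sketch of the ordering constraint (stand-in classes, not the real ChatGLM or transformers code):

```python
class BaseTokenizer:
    """Stand-in for PreTrainedTokenizer: its __init__ already needs the vocab."""
    def __init__(self):
        print("base __init__ sees vocab_size =", self.vocab_size)


class ToyChatGLMTokenizer(BaseTokenizer):
    def __init__(self, vocab):
        self.sp_tokenizer = vocab   # assign the backend first ...
        super().__init__()          # ... because the base __init__ reads vocab_size

    @property
    def vocab_size(self):
        return len(self.sp_tokenizer)


ToyChatGLMTokenizer({"<unk>": 0, "hello": 1})
# Swapping the two lines in ToyChatGLMTokenizer.__init__ reproduces the
# AttributeError this commit works around.
```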
vocab/chatglm_6b/test_chatglm.py
CHANGED
@@ -33,7 +33,7 @@ from transformers import AutoTokenizer
 
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 # tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained("chatglm_6b/", trust_remote_code=True)
 
 
 def encode_text(text):
@@ -105,6 +105,7 @@ def test_tokens():
 
 
 test_tokens()
+encode_text("good job d的 算法")
 
 # tokenizer.sp_tokenizer.text_tokenizer.convert_token_to_id(x) + tokenizer.sp_tokenizer.num_image_tokens
 
vocab/gpt_35_turbo/README.md
CHANGED
@@ -24,6 +24,14 @@ special_token
 {"id": 100276, "token": "<|endofprompt|>", "token_decode": "<|endofprompt|>", "token_len": 15, "zh_count": 0, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
 ```
 
+Chinese characters + symbols
+```
+{"id": 39045, "token": ",请", "token_decode": ",请", "token_len": 2, "zh_count": 1, "space_count": 0, "digit_count": 0, "zh_symbol_count": 0}
+```
+
+
+
+
 ## Vocabulary file
 
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -1,4 +1,6 @@
-
+"""
+,请
+"""
 
 import tiktoken
 from tiktoken import Encoding
@@ -22,17 +24,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
         decode_str = "null"
     return decode_str
 
+
 def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
     """
     Why doesn't tiktoken provide this method?
     """
     try:
-        return
+        return self.decode_tokens_bytes(tokens)
     except:
         # Why return None? See zh_util.py
         # 16 free ids: 100256, 100261-100275
         return [None for token in tokens]
 
+
 def get_vocab(self, token_type="str"):
     """Returns vocab as a dict
     :param token_type: ["str", "byte"]
@@ -59,10 +63,17 @@ def get_vocab(self, token_type="str"):
     return vocab
 
 
+def encode(self, *args, **kwargs):
+    """
+    add_special_token is for compatibility with hf_tokenizer
+    """
+    kwargs.pop("add_special_token", None)
+    return self._encode(*args, **kwargs)
+
 
 # tiktoken patch
+Encoding._encode = Encoding.encode
+Encoding.encode = encode
 Encoding.decode = decode
 Encoding.convert_ids_to_tokens = convert_ids_to_tokens
 Encoding.get_vocab = get_vocab
-
-
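The patch wraps tiktoken's `Encoding.encode` so it tolerates the Hugging Face-style keyword that util.py now passes; tiktoken's own `encode` has no such parameter and would otherwise raise a `TypeError`. A minimal sketch of the same monkey-patch pattern (names like `encode_compat` and `_original_encode` are mine; it also drops the plural spelling `add_special_tokens`, which is the keyword util.py actually sends):

```python
import tiktoken
from tiktoken import Encoding


def encode_compat(self, *args, **kwargs):
    # Drop HF-style keywords that tiktoken's encode() does not accept,
    # then delegate to the original method.
    kwargs.pop("add_special_tokens", None)
    kwargs.pop("add_special_token", None)
    return self._original_encode(*args, **kwargs)


Encoding._original_encode = Encoding.encode
Encoding.encode = encode_compat

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(tokenizer.encode("hello world", add_special_tokens=False))  # works after the patch
```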
vocab/gpt_35_turbo/decode_test.py
CHANGED
@@ -1,6 +1,13 @@
 
 from vocab.gpt_35_turbo import tokenizer
 
-print(tokenizer.decode([100256]))
 
+text = "你好,请告诉我聚乙烯是什么"
+encoding = tokenizer.encode(text)
+
+
+print(tokenizer.decode([6744]))
+print(tokenizer.convert_ids_to_tokens([6744]))
+
+print(tokenizer.decode([100256]))
 print(tokenizer.convert_ids_to_tokens([100256]))
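Id 100256 is one of the ids left unassigned in cl100k_base (per the comment in `__init__.py`), which is why the patched `decode`/`convert_ids_to_tokens` need a fallback. A small sketch of the underlying behaviour with plain tiktoken (6744 is just an arbitrary in-vocabulary id):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

print(enc.decode([6744]))                  # an assigned id decodes normally
try:
    enc.decode_single_token_bytes(100256)  # unassigned id
except KeyError:
    print("null")                          # the patched decode falls back like this
```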
vocab/gpt_35_turbo/test_tiktoken.py
CHANGED
@@ -12,7 +12,9 @@ import tiktoken
 
 
 tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
-
+text = "你好,请告诉我聚乙烯是什么"
+# text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
+encoding = tokenizer.encode(text)
 decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
 print(encoding)
 print(decoding_bytes)
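`decode_tokens_bytes` is used here because it returns the raw bytes of each token: cl100k_base is a byte-level BPE, so a single multi-byte UTF-8 character can be split across tokens, and the per-token bytes are not necessarily valid UTF-8 on their own. A short sketch of what that looks like (the sample text is arbitrary):

```python
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
ids = enc.encode("聚乙烯")
for token_id, token_bytes in zip(ids, enc.decode_tokens_bytes(ids)):
    # errors="replace" shows � wherever a token holds a partial character
    print(token_id, token_bytes, token_bytes.decode("utf-8", errors="replace"))
```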