update

- README.md +1 -26
- config.py +1 -1
- utils/compress_rate_util.py +12 -9
README.md
CHANGED
@@ -9,34 +9,9 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
-##
-
-
-## TODO
-
-
-- Search bar
-
-
-
-
-## Statistics
-
-
-## vocabsize
-
-
-- Increasing it improves the compression rate; the side effect is higher compute and memory cost (getting the most out of your tokenizer for pre-training and …)
-
-
-
-https://huggingface.co/spaces/yenniejun/tokenizers-languages
-
-
-
-## Compress Rate
+## 压缩率 Compress Rate
 
 
 On the [cc-100](https://huggingface.co/datasets/cc100) dataset, 10,000 samples are taken per language to measure the compression rate of different tokenizers. The compression-rate metric is `g_bytes/b_tokens`.
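Note: `g_bytes/b_tokens` reads as gigabytes of raw text per billion tokens, which is numerically the same as average bytes per token. A minimal sketch of the computation, assuming a list of UTF-8 strings and any Hugging Face tokenizer (`texts`, `tokenizer`, and the function name are illustrative placeholders, not this repo's API):

```python
def g_bytes_per_b_tokens(texts, tokenizer):
    # Gigabytes of raw UTF-8 text per billion tokens; a higher value means
    # each token covers more bytes, i.e. better compression on this corpus.
    n_bytes = sum(len(t.encode("utf-8")) for t in texts)
    n_tokens = sum(len(tokenizer.encode(t)) for t in texts)
    return (n_bytes / 1e9) / (n_tokens / 1e9)  # == n_bytes / n_tokens
```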
config.py
CHANGED
@@ -17,4 +17,4 @@ Buenos días!
 华为发布Mate60手机。
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "
+default_tokenizer_type_2 = "gpt_4"
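The two defaults read like the initial selections of a two-pane tokenizer comparison; a hedged sketch of how they might be consumed (the Gradio dropdowns and the `all_tokenizers` registry are assumptions based on the repo layout, not part of this diff):

```python
import gradio as gr

from config import default_tokenizer_type_1, default_tokenizer_type_2
from vocab import all_tokenizers  # assumed: list of tokenizer names

# One dropdown per comparison pane, seeded with the config defaults,
# so the app would open comparing llama3 against gpt_4.
with gr.Blocks() as demo:
    tokenizer_1 = gr.Dropdown(choices=all_tokenizers, value=default_tokenizer_type_1)
    tokenizer_2 = gr.Dropdown(choices=all_tokenizers, value=default_tokenizer_type_2)
```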
utils/compress_rate_util.py
CHANGED
@@ -41,6 +41,7 @@
 
 import json
 import os
+import sys
 import pandas as pd
 from datasets import load_dataset
 from utils.log_util import logger
@@ -94,13 +95,12 @@ def pprint(stats):
         if unit not in stat:
             columns[unit] = unit_convertor(stat, unit)
         else:
-
+            logger.error(f"unit {unit} not support")
 
         table.append(columns)
     df = pd.DataFrame(table)
     # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
-    logger.info("\n{df.to_markdown(index=False)}")
-    return
+    logger.info(f"\n{df.to_markdown(index=False)}")
 
 
 cache = {}
@@ -163,17 +163,20 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
 
 def main():
     from vocab import all_tokenizers
+    if len(sys.argv) == 3:
+        tokenizers = [sys.argv[1]]
+        corpuses = [sys.argv[2]]
+    else:
+        tokenizers = all_tokenizers
+        corpuses = ["en", "zh-Hans"]
+
     stats = {}
-    for lang in 
+    for lang in corpuses:
         print("###" * 10 + lang)
-
-        # for tokenizer_name in ['llama', 'llama2', 'llama3']:
-        for tokenizer_name in all_tokenizers:
+        for tokenizer_name in tokenizers:
             tokenizer = load_tokener(tokenizer_name)
             stat = tokenize_corpus(tokenizer, lang)
-            # ["qwen1_5_14b_chat", "gpt_35_turbo",]:
             stats[tokenizer_name] = stat
-
     pprint(stats)
 
 
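With the new `sys.argv` branch, `main()` can presumably target a single tokenizer/corpus pair or fall back to a full sweep. A self-contained sketch of the same dispatch pattern, with the implied invocations (the script path comes from the diff; the commands themselves are assumptions, not documented usage):

```python
import sys

def parse_targets(argv, all_tokenizers):
    # Exactly two extra arguments select one (tokenizer, corpus) pair;
    # anything else falls back to sweeping every tokenizer over the
    # default corpora, mirroring the updated main().
    if len(argv) == 3:
        return [argv[1]], [argv[2]]
    return all_tokenizers, ["en", "zh-Hans"]

if __name__ == "__main__":
    # e.g.  python utils/compress_rate_util.py llama3 en   (single pair)
    #       python utils/compress_rate_util.py             (full sweep)
    tokenizers, corpuses = parse_targets(sys.argv, ["llama3", "gpt_4"])
    print(tokenizers, corpuses)
```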