update

- README.md +1 -26
- config.py +1 -1
- utils/compress_rate_util.py +12 -9
README.md
CHANGED
@@ -9,34 +9,9 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
-##
-
-
-## TODO
-
-
-- Search bar
-
-
-
-
-## Statistics
-
-
-## vocabsize
-
-
-- Increasing it improves the compression rate; the side effect is higher compute and memory cost (getting the most out of your tokenizer for pre-training and …)
-
-
-
-https://huggingface.co/spaces/yenniejun/tokenizers-languages
-
-
-
-## Compress Rate
+## 压缩率 Compress Rate
 
 
 On the [cc-100](https://huggingface.co/datasets/cc100) dataset, 10,000 samples are taken per language to measure the compression rate of different tokenizers. The compression-rate metric is `g_bytes/b_tokens`.
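Note: `g_bytes/b_tokens` reads as gigabytes of raw text per billion tokens, which is numerically the same as average bytes per token. A minimal sketch of the computation, assuming a list of UTF-8 strings and any Hugging Face tokenizer (`texts`, `tokenizer`, and the function name are illustrative placeholders, not this repo's API):

```python
def g_bytes_per_b_tokens(texts, tokenizer):
    # Gigabytes of raw UTF-8 text per billion tokens; a higher value means
    # each token covers more bytes, i.e. better compression on this corpus.
    n_bytes = sum(len(t.encode("utf-8")) for t in texts)
    n_tokens = sum(len(tokenizer.encode(t)) for t in texts)
    return (n_bytes / 1e9) / (n_tokens / 1e9)  # == n_bytes / n_tokens
```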
config.py
CHANGED
@@ -17,4 +17,4 @@ Buenos días!
 华为发布Mate60手机。
 ラグビーワールドカップ2023フランス"""
 default_tokenizer_type_1 = "llama3"
-default_tokenizer_type_2 = "
+default_tokenizer_type_2 = "gpt_4"
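The two defaults read like the initial selections of a two-pane tokenizer comparison; a hedged sketch of how they might be consumed (the Gradio dropdowns and the `all_tokenizers` registry are assumptions based on the repo layout, not part of this diff):

```python
import gradio as gr

from config import default_tokenizer_type_1, default_tokenizer_type_2
from vocab import all_tokenizers  # assumed: list of tokenizer names

# One dropdown per comparison pane, seeded with the config defaults,
# so the app would open comparing llama3 against gpt_4.
with gr.Blocks() as demo:
    tokenizer_1 = gr.Dropdown(choices=all_tokenizers, value=default_tokenizer_type_1)
    tokenizer_2 = gr.Dropdown(choices=all_tokenizers, value=default_tokenizer_type_2)
```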
utils/compress_rate_util.py
CHANGED
@@ -41,6 +41,7 @@
 
 import json
 import os
+import sys
 import pandas as pd
 from datasets import load_dataset
 from utils.log_util import logger
@@ -94,13 +95,12 @@ def pprint(stats):
         if unit not in stat:
             columns[unit] = unit_convertor(stat, unit)
         else:
-
+            logger.error(f"unit {unit} not support")
 
         table.append(columns)
     df = pd.DataFrame(table)
     # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
-    logger.info("\n{df.to_markdown(index=False)}")
-    return
+    logger.info(f"\n{df.to_markdown(index=False)}")
 
 
 cache = {}
@@ -163,17 +163,20 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
 
 def main():
     from vocab import all_tokenizers
+    if len(sys.argv) == 3:
+        tokenizers = [sys.argv[1]]
+        corpuses = [sys.argv[2]]
+    else:
+        tokenizers = all_tokenizers
+        corpuses = ["en", "zh-Hans"]
+
     stats = {}
-    for lang in 
+    for lang in corpuses:
         print("###" * 10 + lang)
-
-        # for tokenizer_name in ['llama', 'llama2', 'llama3']:
-        for tokenizer_name in all_tokenizers:
+        for tokenizer_name in tokenizers:
             tokenizer = load_tokener(tokenizer_name)
             stat = tokenize_corpus(tokenizer, lang)
-            # ["qwen1_5_14b_chat", "gpt_35_turbo",]:
             stats[tokenizer_name] = stat
-
     pprint(stats)
 
 
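With the new `sys.argv` branch, `main()` can presumably target a single tokenizer/corpus pair or fall back to a full sweep. A self-contained sketch of the same dispatch pattern, with the implied invocations (the script path comes from the diff; the commands themselves are assumptions, not documented usage):

```python
import sys

def parse_targets(argv, all_tokenizers):
    # Exactly two extra arguments select one (tokenizer, corpus) pair;
    # anything else falls back to sweeping every tokenizer over the
    # default corpora, mirroring the updated main().
    if len(argv) == 3:
        return [argv[1]], [argv[2]]
    return all_tokenizers, ["en", "zh-Hans"]

if __name__ == "__main__":
    # e.g.  python utils/compress_rate_util.py llama3 en   (single pair)
    #       python utils/compress_rate_util.py             (full sweep)
    tokenizers, corpuses = parse_targets(sys.argv, ["llama3", "gpt_4"])
    print(tokenizers, corpuses)
```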