update
Browse files
- app.py +7 -7
- requirements.txt +1 -0
- vocab/chatglm_6b/__init__.py +3 -0
- vocab/llama2/README.md +0 -0
app.py CHANGED
@@ -40,7 +40,7 @@ example_text = """Replace this text in the input field to see how tokenization w
 
 # llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
 examples = [
-    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n,
+    # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n,
     ["标点测试:,。!?;", "baichuan_7b", "llama"],
     ["符号测试:🦙", "baichuan_7b", "llama"],
     ["中文测试:🦙", "baichuan_7b", "llama"],
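Note on the whitespace example being commented out: ChatGLM-6B's icetk vocabulary has dedicated blank tokens that fold a run of n spaces into a single `<|blank_n|>` piece, so a space-count comparison against llama is misleading. A minimal sketch to observe this, assuming the `THUDM/chatglm-6b` checkpoint and icetk are available:

```python
from transformers import AutoTokenizer

# Assumed checkpoint name; the Space itself keeps a local copy under vocab/chatglm_6b.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

ids = tokenizer.encode("空格测试: 2个空格  8个空格        ")
print(tokenizer.convert_ids_to_tokens(ids))
# Expected: each run of spaces collapses into one <|blank_n|> token rather than
# n separate space tokens, which is what made the side-by-side example confusing.
```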
@@ -83,10 +83,10 @@ def tokenize(text, tokenizer_type, color_num=5):
         return
 
 
-
+    # ⭐
     table.append(
         {"TokenID": token_id,
-         "
+         "Token": token_str,  # the UTF-8-decoded string; why do some show as <0xE7>, and what does that mean? e.g. llama
          "Text": decode_text,  #
          # "Bytes": token_bytes,  # bytes values get decoded into strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still displays as "中"; hence str(token_bytes)
          "Bytes": str(token_bytes),
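Two comments in this hunk are worth unpacking. Tokens rendered as `<0xE7>` are SentencePiece byte-fallback pieces: llama's vocab reserves 256 entries `<0x00>`..`<0xFF>`, and any text that has no whole piece in the vocab falls back to its raw UTF-8 bytes. The gradio quirk is separate: a `bytes` cell is decoded before rendering, so only `str(token_bytes)` keeps the escape sequence visible. A sketch of both, with `huggyllama/llama-7b` as an assumed checkpoint name:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # assumed checkpoint

ids = tokenizer.encode("🦙", add_special_tokens=False)
print(tokenizer.convert_ids_to_tokens(ids))
# e.g. ['▁', '<0xF0>', '<0x9F>', '<0xA6>', '<0x99>']: 🦙 is not in the vocab,
# so it is split into its four UTF-8 bytes, each mapped to a byte-fallback piece.

token_bytes = "中".encode("utf-8")
# Handing token_bytes itself to the Dataframe gets it decoded back to "中";
# str(token_bytes) freezes it as the literal text b'\xe4\xb8\xad' first.
print(str(token_bytes))  # b'\xe4\xb8\xad'
```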
@@ -212,13 +212,13 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
-            headers=["TokenID", "Byte", "Text"],
-            datatype=["str", "str", "str"],
+            # headers=["TokenID", "Byte", "Text"],
+            # datatype=["str", "str", "str"],
             # elem_classes="space-show",  # adding this css to the whole Dataframe has no effect, so cell-wrap is modified directly
         )
         output_table_2 = gr.Dataframe(
-            headers=["TokenID", "Token", "Text"],
-            datatype=["str", "str", "str"],
+            # headers=["TokenID", "Token", "Text"],
+            # datatype=["str", "str", "str"],
         )
 
     tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
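On the `space-show` / `cell-wrap` comment: the goal is to keep leading and trailing spaces visible inside table cells. A hedged sketch of that kind of workaround (the `cell-wrap` class name is taken from the comment; treating it as gradio's internal cell element is an assumption, as is the exact CSS rule):

```python
import gradio as gr

# Assumption: gradio renders each Dataframe cell inside an element with the
# class "cell-wrap", so the rule targets that class directly instead of a
# custom elem_classes value on the component.
css = """
.cell-wrap { white-space: pre-wrap !important; }
"""

with gr.Blocks(css=css) as demo:
    table = gr.Dataframe(headers=["TokenID", "Token", "Text"])

demo.launch()
```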
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 transformers>=4.21.1
 sentencepiece
 tiktoken
+icetk
 torch
vocab/chatglm_6b/__init__.py CHANGED
@@ -1,3 +1,6 @@
+"""
+Depends on icetk
+"""
 
 import os
 from transformers import AutoTokenizer
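The new docstring records a real import-time constraint, and it is why `icetk` was also added to requirements.txt: ChatGLM-6B's remote tokenizer code imports icetk when the tokenizer is constructed, so a missing dependency only surfaces at load time. A sketch of the failure mode this guards against (the exact load call is an assumption about what this `__init__` does):

```python
from transformers import AutoTokenizer

try:
    # Assumed call; the actual __init__ may point at a local vocab directory.
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
except ImportError as exc:
    # Without icetk installed, the remote tokenizer code fails right here.
    raise ImportError("ChatGLM-6B's tokenizer requires `pip install icetk`") from exc
```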
vocab/llama2/README.md ADDED
File without changes