xu-song committed
Commit 1ee0570
1 Parent(s): 428b731
app.py CHANGED
@@ -40,7 +40,7 @@ example_text = """Replace this text in the input field to see how tokenization w
 
 # llama chatglm_6b gpt_nexo_20b baichuan baichuan_7b
 examples = [
-    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
+    # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n
     ["标点测试:,。!?;", "baichuan_7b", "llama"],
     ["符号测试:🦙", "baichuan_7b", "llama"],
     ["中文测试:🦙", "baichuan_7b", "llama"],
@@ -83,10 +83,10 @@ def tokenize(text, tokenizer_type, color_num=5):
         return
 
 
-
+    # ⭐
     table.append(
         {"TokenID": token_id,
-         "Token": token_str,  # the string after UTF-8 decoding; why do some show up as <0xE7>, and what does that mean? e.g. llama
+         "Token": token_str,  # the string after UTF-8 decoding; why do some show up as <0xE7>, and what does that mean? e.g. llama
          "Text": decode_text,  #
          # "Bytes": token_bytes,  # bytes values get decoded back to strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still displays as "中"; hence str(token_bytes)
          "Bytes": str(token_bytes),
@@ -212,13 +212,13 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
         output_table_1 = gr.Dataframe(
-            headers=["TokenID", "Byte", "Text"],
-            datatype=["str", "str", "str"],
+            # headers=["TokenID", "Byte", "Text"],
+            # datatype=["str", "str", "str"],
             # elem_classes="space-show",  # adding this css to the whole Dataframe has no effect, so cell-wrap is modified directly instead
         )
         output_table_2 = gr.Dataframe(
-            headers=["TokenID", "Token", "Text"],
-            datatype=["str", "str", "str"],
+            # headers=["TokenID", "Token", "Text"],
+            # datatype=["str", "str", "str"],
         )
 
     tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
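The str(token_bytes) workaround documented in the hunk above can be reproduced without gradio; the point, per the original comment, is that only the str() form survives the frontend's rendering untouched:

```python
# Why the table stores str(token_bytes): gradio decodes a raw bytes cell back
# to text, so b'\xe4\xb8\xad' would render as "中" again; its str() form is an
# ordinary string and therefore displays verbatim.
token_bytes = "中".encode("utf-8")
print(token_bytes)        # b'\xe4\xb8\xad' (a bytes object)
print(str(token_bytes))   # the same characters, but now a plain str shown as-is
```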
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 transformers>=4.21.1
 sentencepiece
 tiktoken
+icetk
 torch
vocab/chatglm_6b/__init__.py CHANGED
@@ -1,3 +1,6 @@
+"""
+Depends on icetk
+"""
 
 import os
 from transformers import AutoTokenizer
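The new module docstring and the icetk entry in requirements.txt belong together: ChatGLM-6B's trust_remote_code tokenizer imports icetk when it is loaded, so the package must be installed up front. A sketch of the failure the dependency prevents (repo id is an assumption, as above):

```python
from transformers import AutoTokenizer

# Without `pip install icetk`, this call fails while importing the remote
# tokenizer module, roughly: ModuleNotFoundError: No module named 'icetk'.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
```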
vocab/llama2/README.md ADDED
File without changes