add zephyr
- vocab/__init__.py +3 -2
- vocab/zephyr_7b_beta/__init__.py +5 -0
vocab/__init__.py CHANGED
@@ -70,7 +70,7 @@ uniq_tokenizers = [
     ""
 ]
 
-# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
+# TODO: alias/abbr, description, hf_path, tokenizer_class/type, comments, Organization
 all_tokenizers = [
     ##### bert series
     ("bert_base_cased", "", "bert"),
@@ -99,7 +99,7 @@ all_tokenizers = [
     ("chatyuan_large_v2", "", "sentencepiece"),
     ("prompt_clue", "", "sentencepiece"),
 
-    ("llama", "", "sentencepiece"),  # '中文单字': 700, '中文多字': 0
+    ("llama", "", "sentencepiece", "llama uses single digits and thus uses 4 tokens to encode the number 1000"),  # '中文单字': 700, '中文多字': 0
     ("llama2", "", "sentencepiece"),
     ("chinese_llama", "", "sentencepiece"),  #
     ("chinese_llama2", "", "sentencepiece"),  #
@@ -168,6 +168,7 @@ all_tokenizers = [
     ("gemma_7b",),
     ("olmo_7b",),
     ("aya_101",),
+    ("zephyr_7b_beta",)
 ]
 
 all_tokenizers = [tokenizer[0] for tokenizer in all_tokenizers]
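The new description field on the llama entry records a tokenizer property that is easy to verify: the Llama SentencePiece vocabulary splits numbers into individual digits, so "1000" encodes as four digit tokens. A minimal check, assuming access to a Llama-family checkpoint (the repo id below is an assumption; any Llama tokenizer behaves the same way):

    from transformers import AutoTokenizer

    # Assumed checkpoint; substitute any Llama-family tokenizer you have access to.
    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

    pieces = tok.tokenize("1000")
    print(len(pieces), pieces)
    # Digits come out one per token; depending on the tokenizer you may also
    # see a leading '▁' word-boundary piece before the first digit.

Note that the entries in all_tokenizers are tuples of varying length (name only, name/alias/type, or with a fourth description field as above); the closing comprehension keeps only tokenizer[0], the name, so the extra fields are inert metadata for now.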
vocab/zephyr_7b_beta/__init__.py ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
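Because the new module loads the tokenizer at import time, it can be used the same way as the other vocab subpackages. A usage sketch (the sample text is illustrative):

    from vocab.zephyr_7b_beta import tokenizer

    # Zephyr-7B-beta is fine-tuned from Mistral-7B, so this resolves to the
    # ~32k-piece Mistral SentencePiece vocabulary.
    print(len(tokenizer))

    pieces = tokenizer.tokenize("Hello, world!")
    print(len(pieces), pieces)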