add more tokenizer
- vocab/__init__.py +10 -1
- vocab/falcon_7b/__init__.py +4 -0
- vocab/fastchat_t5_3b/__init__.py +3 -0
- vocab/flan_t5_base/__init__.py +3 -0
- vocab/pko_t5_large/__init__.py +3 -0
- vocab/t5/__init__.py +0 -7
- vocab/t5_base/__init__.py +8 -0
- vocab/t5_large/__init__.py +8 -0
- vocab/t5_small/__init__.py +8 -0
vocab/__init__.py
CHANGED
@@ -96,8 +96,8 @@ all_tokenizers = [
     # "alpaca_7b",
     "baichuan",
     "baichuan2",
-    "qwen",
     "internlm_chat_7b",
+    "falcon_7b",
     "falcon_180b",
     # "goat",
 
@@ -109,9 +109,18 @@ all_tokenizers = [
     "skywork_13b_base",
     "skywork_13b_math",
     "mistral",
+    "t5_small",
+    "t5_base",
+    "t5_large",
+    "flan_t5_base",
+    "fastchat_t5_3b",
+    "pko_t5_large",
+
 
 ]
 
+all_tokenizers = sorted(all_tokenizers)
+
 class TokenizerType(Enum):
     """
     - https://huggingface.co/docs/transformers/tokenizer_summary
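After this change, each name in `all_tokenizers` matches a `vocab` submodule that exposes a module-level `tokenizer` object (see the files added below). A minimal sketch of how such a registry could be consumed; the `load_tokenizer` helper is hypothetical and not part of this commit:

```python
# Hypothetical helper (not in the commit): resolve a registry name to its tokenizer,
# assuming every entry in all_tokenizers is a vocab submodule with a module-level
# `tokenizer` attribute, as in the files added below.
import importlib

def load_tokenizer(name: str):
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

# Example usage: tok = load_tokenizer("t5_base"); print(len(tok))
```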
vocab/falcon_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
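The module-level `tokenizer` above is a regular `transformers` tokenizer, so it can be exercised directly. An illustrative sketch (not part of the commit; exact ids and tokens depend on the downloaded checkpoint):

```python
# Illustrative usage of the falcon-7b tokenizer loaded above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)

text = "Hello world"
print(tokenizer.tokenize(text))   # surface tokens
print(tokenizer.encode(text))     # token ids
print(len(tokenizer))             # effective vocabulary size
```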
vocab/fastchat_t5_3b/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
vocab/flan_t5_base/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", trust_remote_code=True)
vocab/pko_t5_large/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("paust/pko-t5-large", trust_remote_code=True)
vocab/t5/__init__.py
DELETED
@@ -1,7 +0,0 @@
-"""
-
-
-SentencePiece
-"""
-
-
vocab/t5_base/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-base
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-base", trust_remote_code=True)
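The deleted `vocab/t5/__init__.py` only noted that T5 uses SentencePiece; that detail still applies to the concrete checkpoints added here. A quick, hedged illustration (exact token splits may vary with the transformers version):

```python
# T5 tokenizers are SentencePiece-based; word boundaries appear as the "▁" marker.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")
print(tokenizer.tokenize("Hello world"))  # e.g. ['▁Hello', '▁world']
```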
vocab/t5_large/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-large
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-large", trust_remote_code=True)
vocab/t5_small/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-small
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-small", trust_remote_code=True)
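Taken together, the commit registers the vanilla T5 checkpoints plus instruction-tuned (flan-t5-base, fastchat-t5-3b) and Korean (pko-t5-large) variants, alongside falcon-7b. A hedged sketch for comparing their vocabularies; sizes are not quoted here because they depend on the downloaded checkpoints:

```python
# Illustrative comparison of the tokenizers added in this commit.
from transformers import AutoTokenizer

added = [
    "t5-small",
    "t5-base",
    "t5-large",
    "google/flan-t5-base",
    "lmsys/fastchat-t5-3b-v1.0",
    "paust/pko-t5-large",
    "tiiuae/falcon-7b",
]

for name in added:
    tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    print(f"{name}: {len(tok)} tokens")
```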