fix moss
vocab/moss/moss-moon-003-sft/tokenization_moss.py
CHANGED
@@ -146,6 +146,11 @@ class MossTokenizer(PreTrainedTokenizer):
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        self.add_bos_token = add_bos_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+
         super().__init__(
             errors=errors,
             unk_token=unk_token,
@@ -156,10 +161,7 @@ class MossTokenizer(PreTrainedTokenizer):
             add_bos_token=add_bos_token,
             **kwargs,
         )
-        self.add_bos_token = add_bos_token
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
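The change moves the vocabulary loading and the add_bos_token assignment ahead of the super().__init__() call. The commit itself gives no rationale; a likely reason (an assumption, not stated in the commit) is that recent transformers releases register special tokens inside PreTrainedTokenizer.__init__ via self.get_vocab(), which fails if self.encoder has not been populated yet. A minimal sketch of the resulting __init__ ordering, with the rest of the class and most constructor arguments omitted:

    # Sketch of the __init__ ordering after this commit; unrelated methods and
    # most constructor arguments are omitted for brevity.
    import json

    from transformers import PreTrainedTokenizer
    from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


    class MossTokenizer(PreTrainedTokenizer):
        def __init__(self, vocab_file, errors="replace", unk_token="<|endoftext|>",
                     add_bos_token=False, **kwargs):
            self.add_bos_token = add_bos_token

            # Populate self.encoder *before* the parent constructor runs, since
            # the parent may call self.get_vocab() while registering special tokens.
            with open(vocab_file, encoding="utf-8") as vocab_handle:
                self.encoder = json.load(vocab_handle)

            super().__init__(
                errors=errors,
                unk_token=unk_token,
                add_bos_token=add_bos_token,
                **kwargs,
            )

            self.decoder = {v: k for k, v in self.encoder.items()}
            self.errors = errors  # how to handle errors in decoding
            self.byte_encoder = bytes_to_unicode()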