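# Adapt the ChineseSPTokenizer from the local tokenization module so that
# instances expose `decode` and `get_vocab` methods and a `vocab_size`
# attribute (patched onto the class below).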
from .tokenization import ChineseSPTokenizer, make_tokenizer
def DecodeIds(self, Ids, type_token=False):
    # Wrap the original DecodeIds so a failing id sequence yields "" instead
    # of raising; attached to the class as `decode` below.
    try:
        decode_str = self.DecodeIds(Ids, type_token=type_token)
    except Exception as e:
        print("WARNING", Ids, e)
        decode_str = ""
    return decode_str


ChineseSPTokenizer.decode = DecodeIds
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)
# Expose the token count under the vocab_size name as well.
tokenizer.vocab_size = tokenizer.num_tokens
def get_vocab(self, token_type="str"):
    """Return the vocabulary as a dict mapping token to id."""
    vocab = {}
    for i in range(self.vocab_size):
        try:
            token_byte = self.convert_ids_to_tokens([i])[0]
            if token_byte is None:
                continue
            # token_str = token_byte.decode("utf-8")
            vocab[token_byte] = i
        except Exception as e:  # 773 UnicodeDecodeError
            print("exception", i, e)
    return vocab


ChineseSPTokenizer.get_vocab = get_vocab
# vocab_size = len(tokenizer.get_vocab())
# vocab_size = tokenizer.vocab_size
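
# Minimal usage sketch (a non-authoritative example; assumes tokenizer.model
# sits in the working directory, as passed to make_tokenizer above, and the
# ids below are purely illustrative):
if __name__ == "__main__":
    vocab = tokenizer.get_vocab()
    print("recovered vocab entries:", len(vocab), "of", tokenizer.vocab_size)
    # decode falls back to "" whenever the underlying DecodeIds raises
    print(tokenizer.decode(list(range(5))))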