from .tokenization import ChineseSPTokenizer, make_tokenizer


def safe_decode(self, ids, type_token=False):
    """Decode ids to text, returning "" instead of raising on malformed ids."""
    try:
        decode_str = self.DecodeIds(ids, type_token=type_token)
    except Exception as e:
        print("WARNING: failed to decode", ids, e)
        decode_str = ""
    return decode_str

# Monkey-patch: expose the guarded decoder as the tokenizer's `decode` method.
ChineseSPTokenizer.decode = safe_decode


# Build the tokenizer from the pretrained SentencePiece model "tokenizer.model";
# the corpus argument is None because nothing is trained here.
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)

# Expose a HuggingFace-style vocab_size attribute.
tokenizer.vocab_size = tokenizer.num_tokens
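
# Usage sketch for the patched decoder (assumption: the GLM-style EncodeAsIds
# API, which returns a Tokenization object whose `tokenization` attribute holds
# the id list; adapt if the encoder interface differs). _demo_decode is a
# hypothetical helper; call it manually to exercise the guarded decode path.
def _demo_decode():
    ids = tokenizer.EncodeAsIds("你好，世界").tokenization
    print(tokenizer.decode(ids))  # malformed ids print a WARNING and yield ""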




def get_vocab(self, token_type="str"):
    """Return the vocabulary as a {token: id} dict.

    Ids that convert to None or that raise during conversion are skipped.
    """
    vocab = {}
    for i in range(self.vocab_size):
        try:
            token = self.convert_ids_to_tokens([i])[0]
            if token is None:
                continue
            vocab[token] = i
        except Exception as e:  # e.g. id 773 raises UnicodeDecodeError
            print("WARNING: failed to convert id", i, e)
    return vocab


# Monkey-patch get_vocab onto the tokenizer class.
ChineseSPTokenizer.get_vocab = get_vocab

# Either expression below would give the vocabulary size; the first counts only
# successfully converted tokens, the second uses the raw token count.
# vocab_size = len(tokenizer.get_vocab())
# vocab_size = tokenizer.vocab_size
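
# Usage sketch: build the {token: id} map via the patched get_vocab and compare
# its size with the declared vocab_size (the two can differ if some ids fail to
# convert). _demo_vocab is a hypothetical helper; call it manually.
def _demo_vocab():
    vocab = tokenizer.get_vocab()
    print(len(vocab), tokenizer.vocab_size)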