|
""" 封装 sentencepiece.SentencePieceProcessor,以便符合transformers中的tokenizer标准 |
|
|
|
## reference |
|
|
|
|
|
## usage |
|
|
|
- grok |
|
|
|
""" |
|
|
|
import sentencepiece as spm |
|
from transformers import PreTrainedTokenizer |
|
|
|
|
|
class SPTokenizerWrapper(PreTrainedTokenizer):

    """Adapter exposing a raw SentencePiece model through the HF tokenizer API.

    ## impl in PreTrainedTokenizer
    - convert_ids_to_tokens

    Notes:
    - ``encode``/``decode`` bypass the usual HF pre/post-processing and call
      SentencePiece directly; transformers-specific kwargs are accepted but
      ignored so callers written against the HF API keep working.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model stored at ``vocab_file``.

        Extra ``**kwargs`` (e.g. special-token configuration) are forwarded
        to ``PreTrainedTokenizer.__init__``.
        """
        self.vocab_file = vocab_file
        # The sp model must exist *before* super().__init__() runs, because
        # the base constructor may query vocab_size / get_vocab.
        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict mapping token string -> id."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _tokenize(self, text):
        """Split ``text`` into SentencePiece sub-word pieces (list of str).

        Required by ``PreTrainedTokenizer``: without it the inherited
        ``tokenize`` / ``__call__`` path raises ``NotImplementedError``.
        """
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def encode(self, *args, **kwargs):
        """Encode text to ids via SentencePiece directly.

        ``add_special_tokens`` / ``allowed_special`` are HF/tiktoken-style
        kwargs that SentencePiece does not understand; drop them silently.
        """
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode ids back to text via SentencePiece directly.

        ``skip_special_tokens`` is an HF-only kwarg; drop it silently.
        """
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)
|
|
|
|
|
|
|
|