File size: 1,950 Bytes

b71846a

from transformers import BertTokenizerFast
import os

class MiniSunTokenizer:
    """Thin wrapper around a Hugging Face ``BertTokenizerFast``.

    The wrapped tokenizer is available as ``self.tokenizer``; the methods
    below forward to it with this project's defaults.
    """

    def __init__(self, vocab_file=None):
        """Build the underlying tokenizer.

        Args:
            vocab_file: Optional path to a vocabulary file. When truthy, a
                tokenizer is built from it with lower-casing disabled;
                otherwise the pretrained ``bert-base-uncased`` tokenizer
                is loaded.
        """
        # You can use BERT's tokenizer or any custom vocabulary tokenizer
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            # Default BERT tokenizer without a specific vocab file
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Special tokens exposed as plain attributes (customizable)
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'

    def tokenize(self, text):
        """Split ``text`` into tokens using the wrapped tokenizer."""
        return self.tokenizer.tokenize(text)

    def encode(self, text, max_length=512, padding=True, truncation=True,
               return_tensors='tf'):
        """Convert ``text`` into input IDs and an attention mask.

        Args:
            text: The text to encode.
            max_length: Length limit forwarded to the tokenizer; also the
                padded length when ``padding`` is truthy.
            padding: Truthy pads to ``max_length``; falsy disables padding.
            truncation: Forwarded unchanged to the tokenizer.
            return_tensors: Output container requested from the tokenizer.
                Defaults to ``'tf'`` so existing callers are unaffected;
                pass another value the tokenizer supports (or ``None``)
                when TensorFlow is not available.

        Returns:
            A ``(input_ids, attention_mask)`` tuple taken from the
            tokenizer's output.
        """
        # Calling the tokenizer directly is the documented replacement for
        # the deprecated ``encode_plus``.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors=return_tensors,
        )
        return encoded['input_ids'], encoded['attention_mask']

    def decode(self, token_ids):
        """Turn ``token_ids`` back into text, dropping special tokens."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        """Save the wrapped tokenizer to ``save_directory``.

        The directory is created first if it does not already exist.
        """
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

# Example usage of the tokenizer: encode one sentence and show the result.
# NOTE(review): these statements live at module top level, so they also run
# whenever this file is imported; consider moving them under an
# `if __name__ == "__main__":` guard once no importer relies on these names.
text = "Hello, this is a test sentence for MiniSun model."

tokenizer = MiniSunTokenizer()
input_ids, attention_mask = tokenizer.encode(text, max_length=20)
print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)