finnstrom3693
/

mini-sun-init-bert-tf-110m

Model card Files Files and versions Community

finnstrom3693 committed on Oct 1

Commit

0f489e7

•

1 Parent(s): 8251115

long tokenizing process

Files changed (1) hide show

tokenizer_make3.py +0 -82

tokenizer_make3.py DELETED Viewed

@@ -1,82 +0,0 @@
-# @title Model Tokenizer
-!pip install tensorflow_text
-from transformers import TFBertTokenizer
-import os
-import tensorflow as tf
-class MiniSunTokenizer:
-    def __init__(self, vocab_file=None):
-        if vocab_file:
-            self.tokenizer = TFBertTokenizer(vocab_file=vocab_file, do_lower_case=False)
-        else:
-            self.tokenizer = TFBertTokenizer.from_pretrained('bert-base-uncased')
-        # Define special tokens
-        self.pad_token = '[PAD]'
-        self.unk_token = '[UNK]'
-        self.cls_token = '[CLS]'
-        self.sep_token = '[SEP]'
-        self.mask_token = '[MASK]'
-        self.eos_token = '[EOS]'
-    def encode(self, text, max_length=512, padding=True, truncation=True):
-        """
-        Encodes the input text (string or batch of strings).
-        It automatically detects if the input is a batch or a single sentence.
-        """
-        if isinstance(text, list):  # If batch of texts, call batch_encode_plus
-            return self._encode_batch(text, max_length, padding, truncation)
-        else:  # Single text input
-            return self._encode_single(text, max_length, padding, truncation)
-    def _encode_single(self, text, max_length=512, padding=True, truncation=True):
-        # Encode a single string
-        encoded = self.tokenizer.encode_plus(
-            text,
-            add_special_tokens=True,
-            max_length=max_length,
-            padding='max_length' if padding else False,
-            truncation=truncation,
-            return_attention_mask=True,
-            return_tensors='np'
-        )
-        return {
-            'input_ids': encoded['input_ids'],
-            'attention_mask': encoded['attention_mask']
-        }
-    def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
-        # Encode a batch of strings
-        encoded_batch = self.tokenizer.batch_encode_plus(
-            texts,
-            add_special_tokens=True,
-            max_length=max_length,
-            padding='max_length' if padding else False,
-            truncation=truncation,
-            return_attention_mask=True,
-            return_tensors='np'
-        )
-        return {
-            'input_ids': encoded_batch['input_ids'],
-            'attention_mask': encoded_batch['attention_mask']
-        }
-    def decode(self, token_ids):
-        # Decodes token IDs back into text
-        return self.tokenizer.decode(token_ids, skip_special_tokens=True)
-    def save_pretrained(self, save_directory):
-        # Save the tokenizer in Hugging Face format
-        os.makedirs(save_directory, exist_ok=True)
-        self.tokenizer.save_pretrained(save_directory)
-    def __call__(self, text, *args, **kwargs):
-        """
-        This allows the tokenizer object to be called directly like `tokenizer(text)`.
-        It will automatically detect if the input is a batch or a single sentence.
-        """
-        return self.encode(text, *args, **kwargs)
-# Example usage: no vocab_file is passed, so __init__ takes the from_pretrained('bert-base-uncased') branch
-tokenizer = MiniSunTokenizer()