finnstrom3693 committed on
Commit
0f489e7
1 Parent(s): 8251115

long tokenizing process

Files changed (1)
  1. tokenizer_make3.py +0 -82
tokenizer_make3.py DELETED
@@ -1,82 +0,0 @@
- # @title Model Tokenizer
- !pip install transformers
- # BertTokenizerFast supports encode_plus/batch_encode_plus/decode/save_pretrained;
- # the in-graph TFBertTokenizer does not expose these methods, so it is the wrong class here.
- from transformers import BertTokenizerFast
- import os
-
- class MiniSunTokenizer:
-     def __init__(self, vocab_file=None):
-         # Load from a custom vocab file if given, otherwise fall back to bert-base-uncased
-         if vocab_file:
-             self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
-         else:
-             self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
-
-         # Define special tokens
-         self.pad_token = '[PAD]'
-         self.unk_token = '[UNK]'
-         self.cls_token = '[CLS]'
-         self.sep_token = '[SEP]'
-         self.mask_token = '[MASK]'
-         self.eos_token = '[EOS]'
-
-     def encode(self, text, max_length=512, padding=True, truncation=True):
-         """
-         Encodes the input text (a string or a batch of strings).
-         Automatically detects whether the input is a batch or a single sentence.
-         """
-         if isinstance(text, list):  # batch of texts
-             return self._encode_batch(text, max_length, padding, truncation)
-         else:  # single text input
-             return self._encode_single(text, max_length, padding, truncation)
-
-     def _encode_single(self, text, max_length=512, padding=True, truncation=True):
-         # Encode a single string
-         encoded = self.tokenizer.encode_plus(
-             text,
-             add_special_tokens=True,
-             max_length=max_length,
-             padding='max_length' if padding else False,
-             truncation=truncation,
-             return_attention_mask=True,
-             return_tensors='np'
-         )
-         return {
-             'input_ids': encoded['input_ids'],
-             'attention_mask': encoded['attention_mask']
-         }
-
-     def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
-         # Encode a batch of strings
-         encoded_batch = self.tokenizer.batch_encode_plus(
-             texts,
-             add_special_tokens=True,
-             max_length=max_length,
-             padding='max_length' if padding else False,
-             truncation=truncation,
-             return_attention_mask=True,
-             return_tensors='np'
-         )
-         return {
-             'input_ids': encoded_batch['input_ids'],
-             'attention_mask': encoded_batch['attention_mask']
-         }
-
-     def decode(self, token_ids):
-         # Decodes token IDs back into text
-         return self.tokenizer.decode(token_ids, skip_special_tokens=True)
-
-     def save_pretrained(self, save_directory):
-         # Saves the tokenizer in Hugging Face format
-         os.makedirs(save_directory, exist_ok=True)
-         self.tokenizer.save_pretrained(save_directory)
-
-     def __call__(self, text, *args, **kwargs):
-         """
-         Allows the tokenizer object to be called directly, e.g. `tokenizer(text)`.
-         Automatically detects whether the input is a batch or a single sentence.
-         """
-         return self.encode(text, *args, **kwargs)
-
-
- # Example usage of the tokenizer
- tokenizer = MiniSunTokenizer()
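
The deleted script breaks off right at the start of its usage example. A minimal sketch of how MiniSunTokenizer would be exercised, assuming the BertTokenizerFast backend used above (the sample sentences are illustrative):

tokenizer = MiniSunTokenizer()

# A single sentence is padded to max_length, giving arrays of shape (1, 512)
single = tokenizer('The quick brown fox jumps over the lazy dog.')
print(single['input_ids'].shape, single['attention_mask'].shape)

# A list of sentences is encoded as a batch, here with shape (2, 32)
batch = tokenizer(['First sentence.', 'Second sentence.'], max_length=32)
print(batch['input_ids'].shape)

# Round-trip one sequence back to text; special tokens like [CLS]/[SEP]/[PAD] are stripped
print(tokenizer.decode(batch['input_ids'][0]))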