from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
import argparse


parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
parser.add_argument('output_file_path', type=str, help='Path where the trained tokenizer JSON will be saved')

# Parse the arguments
args = parser.parse_args()

# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Use the byte level pre-tokenizer
tokenizer.pre_tokenizer = ByteLevel()

# Customize training with a BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Path to the file(s) for training the tokenizer
files = [args.dataset_file_path]

# Train the tokenizer
tokenizer.train(files, trainer)

# Optionally, customize post-processing to add special tokens. This must be done
# after training so that token_to_id can resolve the special token IDs.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Save the tokenizer
tokenizer.save(args.output_file_path)
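
# A minimal usage sketch (assumption: the saved file is a standard tokenizers
# JSON file, loadable with Tokenizer.from_file). Uncomment to check that the
# trained tokenizer encodes a sample sentence as expected.
#
# loaded = Tokenizer.from_file(args.output_file_path)
# encoding = loaded.encode("Hello, world!")
# print(encoding.tokens)  # byte-level subword tokens wrapped with [CLS]/[SEP]
# print(encoding.ids)     # corresponding token IDs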