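"""Train a byte-level BPE tokenizer with the Hugging Face `tokenizers`
library and save it to a single JSON file."""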
import argparse

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
parser = argparse.ArgumentParser(description='Train a BPE tokenizer on a text corpus.')
parser.add_argument('dataset_file_path', type=str, help='Path to the plain-text dataset file')
parser.add_argument('output_file_path', type=str, help='Path where the trained tokenizer JSON will be saved')
# Parse the arguments
args = parser.parse_args()
# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Pre-tokenize at the byte level so any input string can be represented
# without out-of-vocabulary characters
tokenizer.pre_tokenizer = ByteLevel()
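# Optionally pair the byte-level pre-tokenizer with its matching decoder so
# decoded text round-trips without the byte-level markers (e.g. "Ġ"). This is
# not required for training, only for clean decoding later.
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
tokenizer.decoder = ByteLevelDecoder()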
# Configure the trainer; BpeTrainer also accepts vocab_size and min_frequency
# if you need to control the size of the learned vocabulary
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# Path to the file(s) for training the tokenizer
files = [args.dataset_file_path]
# Train the tokenizer
tokenizer.train(files, trainer)
# Customize post-processing so every encoding is wrapped with [CLS]/[SEP]
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
# Save the full tokenizer (model, pre-tokenizer, post-processor) as one JSON file
tokenizer.save(args.output_file_path)
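
# Optional sanity check (a minimal sketch, not part of training itself):
# reload the saved tokenizer and encode a sample sentence to confirm the
# [CLS]/[SEP] template is applied as expected.
loaded = Tokenizer.from_file(args.output_file_path)
print(loaded.encode("A quick test sentence.").tokens)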