import argparse

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing

parser = argparse.ArgumentParser(description='Train a BPE tokenizer.')
parser.add_argument('dataset_file_path', type=str, help='Path to the plain-text training corpus')
parser.add_argument('output_file_path', type=str, help='Path where the trained tokenizer (a JSON file) will be saved')
# Parse the arguments
args = parser.parse_args()
# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Use the byte-level pre-tokenizer: input is split over a fixed 256-symbol
# byte alphabet, so any UTF-8 string can be encoded without unknown tokens
tokenizer.pre_tokenizer = ByteLevel()
# Customize training with a BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
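# (BpeTrainer also accepts options such as vocab_size and min_frequency to
# bound the merge vocabulary; this script keeps the library defaults.)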
# Path to the file(s) for training the tokenizer
files = [args.dataset_file_path]
# Train the tokenizer
tokenizer.train(files, trainer)
# Optionally, you can customize the post-processing to add special tokens
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
# Save the tokenizer
tokenizer.save(args.output_file_path)
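
# A minimal usage sketch; the script name (train_tokenizer.py) and file names
# below are hypothetical examples:
#
#   python train_tokenizer.py corpus.txt tokenizer.json
#
# The saved tokenizer can then be reloaded and applied:
#
#   from tokenizers import Tokenizer
#   tok = Tokenizer.from_file("tokenizer.json")
#   enc = tok.encode("Hello, world!")
#   print(enc.tokens)  # byte-level pieces wrapped in [CLS] ... [SEP]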