# Train a byte-level BPE tokenizer on a text dataset and save it to disk.
"""Train a byte-level BPE tokenizer on a text dataset and save it to disk.

Usage:
    python train_tokenizer.py <dataset_file_path> <output_file_path>
"""
import argparse

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer


def main() -> None:
    """Parse CLI arguments, train the BPE tokenizer, and save it to disk."""
    parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
    parser.add_argument('dataset_file_path', type=str,
                        help='Path to the dataset file')
    parser.add_argument('output_file_path', type=str,
                        help='Path to file containing trained tokenizer weights')
    args = parser.parse_args()

    # Initialize a BPE model with an explicit unknown token.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Byte-level pre-tokenization splits input into raw-byte pieces.
    tokenizer.pre_tokenizer = ByteLevel()

    # Reserve BERT-style special tokens so the trainer assigns them fixed ids.
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )

    # Train on the dataset file(s) supplied on the command line.
    tokenizer.train([args.dataset_file_path], trainer)

    # Post-processing is configured AFTER training so token_to_id can
    # resolve the special-token ids assigned by the trainer above.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

    # Persist the trained tokenizer (vocab, merges, and processing config).
    tokenizer.save(args.output_file_path)


if __name__ == "__main__":
    main()