# Train a byte-level BPE tokenizer on a text corpus and save it to disk.
"""Train a byte-level BPE tokenizer on a text corpus and save it to disk.

Usage:
    python train_tokenizer.py DATASET_FILE OUTPUT_FILE
"""
import argparse

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer


def main() -> None:
    """Parse CLI arguments, train the BPE tokenizer, and save it.

    Reads the training corpus from ``dataset_file_path`` and writes the
    trained tokenizer (vocabulary, merges, and configuration) as a single
    JSON file to ``output_file_path``.
    """
    parser = argparse.ArgumentParser(description='Train BPE Tokenizer.')
    parser.add_argument('dataset_file_path', type=str,
                        help='Path to the dataset file')
    parser.add_argument('output_file_path', type=str,
                        help='Path to file containing trained tokenizer weights')
    args = parser.parse_args()

    # BPE model with an explicit unknown token; byte-level pre-tokenization
    # splits text into byte units before the merge rules are learned.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = ByteLevel()

    # Reserve the special tokens so they receive fixed ids in the vocabulary.
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )

    # Train the tokenizer on the dataset file(s).
    tokenizer.train([args.dataset_file_path], trainer)

    # Post-process encoded sequences with [CLS]/[SEP] wrappers. The ids must
    # be looked up AFTER training, once the special tokens exist in the vocab.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

    # Persist the fully-configured tokenizer to disk.
    tokenizer.save(args.output_file_path)


if __name__ == "__main__":
    main()