James Barry committed
Commit de57501 · 1 Parent(s): 51dbdd6

Create train_gpt2_tokenizer.py

Files changed (1)
  train_gpt2_tokenizer.py +24 -0
train_gpt2_tokenizer.py ADDED
@@ -0,0 +1,24 @@
+ from datasets import load_dataset
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+
+ # Load the Irish (unshuffled_deduplicated_ga) split of the OSCAR corpus
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_ga", split="train")
+
+ # Instantiate a byte-level BPE tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+
+ def batch_iterator(batch_size=1000):
+     for i in range(0, len(dataset), batch_size):
+         yield dataset[i : i + batch_size]["text"]
+
+ # Customized training: GPT-2-sized vocabulary with RoBERTa-style special tokens
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50257, min_frequency=2, special_tokens=[
+     "<s>",
+     "<pad>",
+     "</s>",
+     "<unk>",
+     "<mask>",
+ ])
+
+ # Save the trained tokenizer to disk (the target directory must already exist)
+ tokenizer.save("./irish-gpt2/tokenizer.json")
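
Once training finishes, the saved tokenizer.json can be loaded back for a quick sanity check, or wrapped as a fast GPT-2 tokenizer for use with transformers. A minimal sketch, assuming the tokenizers and transformers libraries are installed and the ./irish-gpt2/ directory written by the script above exists; the sample sentence is just an illustrative Irish greeting:

from tokenizers import Tokenizer
from transformers import GPT2TokenizerFast

# Reload the raw tokenizer from the JSON file written by the script
tok = Tokenizer.from_file("./irish-gpt2/tokenizer.json")
print(tok.encode("Dia dhuit, a chara!").tokens)

# Wrap the same file as a fast GPT-2 tokenizer for transformers pipelines
hf_tok = GPT2TokenizerFast(tokenizer_file="./irish-gpt2/tokenizer.json")
print(hf_tok("Dia dhuit, a chara!")["input_ids"])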