alvin committed on
Commit
08d39dc
·
1 Parent(s): 07fc755

refactor tokenizer related files with eos token

Browse files
Files changed (4) hide show
  1. merges.txt +0 -1
  2. replace_token_script.py +80 -0
  3. tokenizer.json +0 -0
  4. vocab.json +0 -0
merges.txt CHANGED
@@ -49994,4 +49994,3 @@ st ep
49994
  ĠUm p
49995
  ĠKet ersediaan
49996
  ĠMon key
49997
- ĠSIPI LIS
 
49994
  ĠUm p
49995
  ĠKet ersediaan
49996
  ĠMon key
 
replace_token_script.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Replace the final vocabulary entry with the "<|endoftext|>" token.

This script was used to replace the final index of tokenizer.json and
vocab.json with the "<|endoftext|>" token. It also drops the last merge rule
from tokenizer.json and the last line from merges.txt.

All paths are relative, so run it from the directory that contains
tokenizer.json, vocab.json, config.json and merges.txt. The files are
rewritten in place.
"""

import json

tokenizer_path = 'tokenizer.json'
model_config_path = 'config.json'
vocab_path = 'vocab.json'
merges_path = 'merges.txt'

eos = '<|endoftext|>'


def load_json(path):
    """Return the parsed JSON content of *path*."""
    # The vocabulary contains non-ASCII symbols (e.g. "Ġ"); never rely on the
    # locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def dump_json(data, path):
    """Serialize *data* as JSON into *path*, overwriting it."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f)


def replace_final_token(vocab, final_index, new_token):
    """Rename the key at position *final_index* of *vocab* to *new_token*.

    The position refers to the dict's insertion order. The value stored under
    the old key is kept and moved to *new_token*. The dict is modified in
    place.

    Returns the key that was found at *final_index*. If that key already is
    *new_token* the dict is left untouched, so running the replacement twice
    does not delete the entry.

    Raises IndexError if *vocab* has no entry at *final_index*.
    """
    old_token = list(vocab)[final_index]
    if old_token != new_token:
        vocab[new_token] = vocab.pop(old_token)
    return old_token


def drop_last_line(path):
    """Remove the last line of the text file *path* and return the new last line.

    Raises IndexError if the file has no line left afterwards.
    """
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()

    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines[:-1])

    # Read the file back so the reported line is what is really on disk.
    with open(path, encoding="utf-8") as f:
        newlines = f.readlines()

    return newlines[-1]


def main():
    """Apply the token replacement to the tokenizer files in the working directory."""
    vocab_data = load_json(vocab_path)
    tokenizer_data = load_json(tokenizer_path)
    model_config = load_json(model_config_path)

    model_vocab_size = model_config['vocab_size']
    tokenizer_vocab = tokenizer_data['model']['vocab']
    final_index = model_vocab_size - 1

    #retrieve the key of final index
    old_key_final_index_tokenizer = list(tokenizer_vocab)[final_index]
    old_key_final_index_vocab = list(vocab_data)[final_index]
    old_key_final_index_vocab_min2 = list(vocab_data)[final_index - 1]
    old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][-1]

    print(f"old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
    print(f"old_key_final_index_vocab = {old_key_final_index_vocab}")
    print(f"old_key_final_index_vocab_min2 = {old_key_final_index_vocab_min2}")
    print(f"old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")

    # A second run would otherwise strip one more merge rule from the files.
    if old_key_final_index_tokenizer == eos and old_key_final_index_vocab == eos:
        print(f"final index already holds {eos}; nothing to do")
        return

    #readjust added_tokens 'id' to model_vocab_size
    # NOTE(review): the id is set to vocab_size while the replaced entry sits
    # at position vocab_size - 1 -- confirm the two are meant to differ.
    tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size

    #replace old key with new key
    replace_final_token(tokenizer_vocab, final_index, eos)
    replace_final_token(vocab_data, final_index, eos)

    #drop the final merge rule
    tokenizer_data['model']['merges'] = tokenizer_data['model']['merges'][:-1]

    #check updated key
    old_key_final_index_tokenizer = list(tokenizer_vocab)[final_index]
    old_key_final_index_vocab = list(vocab_data)[final_index]
    old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][-1]

    print(len(tokenizer_data['model']['merges']))
    print()
    print(f"updated old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
    print(f"updated old_key_final_index_vocab = {old_key_final_index_vocab}")
    print(f"updated old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")

    dump_json(tokenizer_data, tokenizer_path)
    dump_json(vocab_data, vocab_path)

    last_line = drop_last_line(merges_path)
    print(f"newlines[len(newlines) - 1] = {last_line}")


if __name__ == "__main__":
    main()
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff