tianxie-sf and rooa committed
Commit 6d7139b
1 Parent(s): 1a0f468

Update tokenization_xgen.py (#28)


- Update tokenization_xgen.py (be8c5e0d5bdff8cf8b9df596a138dbb4e1382ab2)


Co-authored-by: Hiroaki Hayashi <[email protected]>

Files changed (1)
  1. tokenization_xgen.py +12 -0
tokenization_xgen.py CHANGED
@@ -60,9 +60,18 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
         ]
         return fim_tokens
 
+    def include_additional_tokens():
+        tokens = []
+        tokens += [f"<dummy_{i}>" for i in range(4)]
+        tokens.append("<sep>")  # 50317
+        tokens.append("<eom>")  # 50318
+        tokens += [f"<mask_{i}>" for i in reversed(range(1, 51199-50318+1))]
+        return tokens
+
     add_whitespaces = include_whitespace(n_min=2, n_max=32)
     add_tabs = include_tabs(n_min=2, n_max=10)
     fim_tokens = include_fim_tokens()
+    additional_tokens = include_additional_tokens()
 
     tokenizer = tiktoken.get_encoding(base)
 
@@ -82,6 +91,9 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
     for sp in fim_tokens:
         special_tokens[sp] = idx
         idx += 1
+    for sp in additional_tokens:
+        special_tokens[sp] = idx
+        idx += 1
 
     if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
         special_tokens[pad_token] = idx
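
For reference, a minimal sketch (not part of the commit) of how special tokens like the ones added above end up in a tiktoken encoding. The real file builds the ID map incrementally via `idx` on top of the whitespace, tab, and FIM tokens; this sketch instead assigns the two IDs documented in the diff's comments directly, and the encoding name is made up:

```python
import tiktoken

base = tiktoken.get_encoding("gpt2")

# Hypothetical subset of the new tokens; IDs mirror the diff's
# inline comments (<sep> -> 50317, <eom> -> 50318).
extra_special = {"<sep>": 50317, "<eom>": 50318}

# Rebuild an encoding whose special-token table includes the extras,
# keeping the base pattern and mergeable ranks unchanged.
enc = tiktoken.Encoding(
    name="gpt2-xgen-sketch",  # made-up name for this example
    pat_str=base._pat_str,
    mergeable_ranks=base._mergeable_ranks,
    special_tokens={**base._special_tokens, **extra_special},
)

print(enc.encode("<sep>", allowed_special={"<sep>"}))  # [50317]
```

Following the diff's own arithmetic: with `<eom>` at ID 50318, `reversed(range(1, 51199-50318+1))` yields 881 mask tokens, `<mask_881>` down to `<mask_1>`, so `<mask_1>` lands at ID 51199 at the top of the extended vocabulary.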