# NOTE(review): the three lines below appear to be Hugging Face page metadata
# (author avatar text / commit message / commit hash) accidentally captured
# into the source; kept as comments so the module parses.
# xu-song's picture
# update
# 751936e
"""
tokenizer类型:HFTokenizer
## Run
## 来源
- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer
"""
import json
import ftfy
from gpt_nexo_20b.tokenizer import build_tokenizer
class Encoder(object):
    """Tokenize text with a tokenizer shared as per-process state.

    The tokenizer is stored on the class (not the instance) so that, when
    used with ``multiprocessing.Pool(initializer=encoder.initializer)``,
    each worker process builds the tokenizer once and every subsequent
    ``encode`` call in that process reuses it.
    """

    def __init__(self, args):
        # args must carry the fields build_tokenizer reads, plus `ftfy`.
        self.args = args

    def initializer(self):
        # Use the Encoder class itself as a container for global data.
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        """Return the list of token ids for `text`.

        When ``args.ftfy`` is set, repair mojibake/Unicode oddities with
        ftfy before tokenizing.
        """
        if self.args.ftfy:
            text = ftfy.fix_text(text)
        # Fixed: dropped the unused `ids = {}` local, a leftover from the
        # upstream preprocess_data.py which tokenizes multiple jsonl keys.
        return Encoder.tokenizer.tokenize(text)
class HFConfig:
    """Arguments for building the NeoX-20B HuggingFace tokenizer.

    Mirrors https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml:
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    (What is jsonl_keys for?)
    """

    def __init__(self):
        # Every field build_tokenizer / the preprocessing pipeline reads.
        defaults = {
            "append_eod": True,
            "ftfy": False,
            "keep_empty": False,
            "log_interval": 100,
            "make_vocab_size_divisible_by": 128,
            "model_parallel_size": 1,
            "padded_vocab_size": 50304,
            "rank": 0,
            "tokenizer_type": "HFTokenizer",
            "vocab_file": "20B_tokenizer.json",
        }
        for name, value in defaults.items():
            setattr(self, name, value)
class GPTConfig:
    """Arguments for the default GPT-2 BPE tokenizer setup.

    Mirrors https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml:
        "vocab-file": "data/gpt2-vocab.json",
        "merge-file": "data/gpt2-merges.txt",
        "tokenizer_type": Default = GPT2BPETokenizer  (the default)
    """

    def __init__(self):
        self.input, self.merge_file, self.workers = (
            "./data/enwik8/enwik8.zip",
            "./data/gpt2-merges.txt",
            1,
        )
class BERTConfig:
    """Placeholder for a BERT tokenizer configuration.

    (Apparently not supported by this pipeline.)
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """
def test():
    """Smoke-test the HFTokenizer: encode a mixed zh/en string, print tokens."""
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    encoder = Encoder(args)
    encoder.initializer()
    # Expected ids include 13609 -> 中 and 23197 -> 国.
    sample = "中国\ngood job一个人去哪里"
    token_ids = encoder.encode(sample)
    print(token_ids)
    for token_id in token_ids:
        print(token_id, Encoder.tokenizer.detokenize([token_id]))
def convert_vocab():
    """Re-serialize 20B_tokenizer.json with readable (non-escaped) Unicode.

    Writes 20B_tokenizer.zh.json, pretty-printed with ensure_ascii=False so
    CJK tokens appear literally instead of as \\uXXXX escapes.
    """
    # Fixed: use context managers so both file handles are closed promptly;
    # the original passed bare open() results and relied on GC to close them.
    with open("20B_tokenizer.json", "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open("20B_tokenizer.zh.json", "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)
def dump_vocab():
    """Dump the tokenizer's vocabulary to 20B.vocab.txt, one token per line."""
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        f_out.writelines(f"{token}\n" for token in tokenizer.vocab)
"""
13609 中
23197 国
187
12311 good
2628 job
27896 一个
13484 人
44781 去
20833 �
105 �
42013 里
"""
if __name__ == "__main__":
test()
# convert_vocab()
# dump_vocab()