"""
Demo of the GPT-NeoX-20B tokenizer (tokenizer type: HFTokenizer).

## Run

Run this script directly; the `__main__` block at the bottom calls `test()`.

## Sources

- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer
"""

import json

import ftfy

from gpt_nexo_20b.tokenizer import build_tokenizer


class Encoder(object):
    """Adapted from the Encoder in gpt-neox's tools/preprocess_data.py."""

    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Build the tokenizer on the Encoder class so that, under
        # multiprocessing, each worker process constructs its own instance.
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        if self.args.ftfy:
            # Repair broken unicode / mojibake before tokenizing.
            text = ftfy.fix_text(text)
        text_ids = Encoder.tokenizer.tokenize(text)
        return text_ids


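# A minimal sketch (not in the original file) of how this Encoder pattern is
# typically driven, following the multiprocessing layout used in gpt-neox's
# tools/preprocess_data.py. The worker count and input lines are illustrative
# assumptions; only Encoder itself comes from this module.
def encode_lines_in_parallel(lines, args, workers=2):
    import multiprocessing

    encoder = Encoder(args)
    # Each worker runs initializer() once, so every process builds its own
    # tokenizer instead of trying to share one across process boundaries.
    with multiprocessing.Pool(workers, initializer=encoder.initializer) as pool:
        return pool.map(encoder.encode, lines)

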
class HFConfig:
    """
    Arguments that build_tokenizer() expects for the 20B HFTokenizer.

    Open question: what is jsonl_keys for? (In preprocess_data.py it selects
    which JSON fields of each input line get tokenized; the default is ["text"].)

    Corresponding config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """

    def __init__(self):
        self.append_eod = True
        self.ftfy = False
        self.keep_empty = False
        self.log_interval = 100
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.padded_vocab_size = 50304
        self.rank = 0
        self.tokenizer_type = 'HFTokenizer'
        self.vocab_file = '20B_tokenizer.json'


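# A minimal sketch (an assumption, not part of the original file) of how the
# append_eod flag in HFConfig is honored in gpt-neox's preprocess_data.py:
# the tokenizer's end-of-document id is appended after each document's ids.
def encode_with_eod(encoder, text):
    ids = Encoder.tokenizer.tokenize(text)
    if encoder.args.append_eod:
        # For HFTokenizer this is the id of the "<|endoftext|>" token.
        ids.append(Encoder.tokenizer.eod)
    return ids

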
class GPTConfig:
    """
    Corresponding config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
        "vocab-file": "data/gpt2-vocab.json",
        "merge-file": "data/gpt2-merges.txt",

    "tokenizer_type": Default = GPT2BPETokenizer  # the default value
    """

    def __init__(self):
        self.input = './data/enwik8/enwik8.zip'
        self.merge_file = './data/gpt2-merges.txt'
        self.workers = 1


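# Sketch of the fields a GPTConfig-style object would additionally need before
# it can be passed to build_tokenizer() for the default GPT2BPETokenizer. The
# values below are assumptions for illustration: vocab_file comes from the
# local_setup.yml snippet in the docstring above, the rest mirror HFConfig.
def make_gpt2_args():
    args = GPTConfig()
    args.tokenizer_type = 'GPT2BPETokenizer'
    args.vocab_file = './data/gpt2-vocab.json'
    args.append_eod = True
    args.ftfy = False
    args.rank = 0
    args.model_parallel_size = 1
    args.make_vocab_size_divisible_by = 128
    return args

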
class BERTConfig:
    """Apparently not supported here.

    "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
    "tokenizer_type": "HFTokenizer",
    """

    pass


def test():
    args = HFConfig()
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    encoder.initializer()

    tokens = encoder.encode("中国\ngood job一个人去哪里")

    print(tokens)
    # Decode each id on its own; byte-level pieces of a multi-byte character
    # show up as the replacement character (see the sample output below).
    for token in tokens:
        print(token, Encoder.tokenizer.detokenize([token]))


def convert_vocab():
    """Re-dump the tokenizer JSON with ensure_ascii=False so non-ASCII tokens stay readable."""
    with open("20B_tokenizer.json", "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open("20B_tokenizer.zh.json", "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)


def dump_vocab():
    """Write every token in the vocabulary to 20B.vocab.txt, one token per line."""
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        for token in tokenizer.vocab:
            f_out.write(token + "\n")


""" |
|
13609 中 |
|
23197 国 |
|
187 |
|
|
|
12311 good |
|
2628 job |
|
27896 一个 |
|
13484 人 |
|
44781 去 |
|
20833 � |
|
105 � |
|
42013 里 |
|
""" |
|
|
|
|
|
|
|
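# Sketch (assumption: ids 20833 and 105 are the two byte-level pieces of "哪"
# shown in the sample output above): decoding the byte tokens together recovers
# the character that per-token detokenize() renders as "�".
def decode_byte_pair_demo():
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(tokenizer.detokenize([20833, 105]))  # expected: 哪

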
if __name__ == "__main__":
    test()