import copy
import json

from tokenizers import Tokenizer
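

# Build a "mock" variant of the tokenizer file: every regular vocab entry's
# token string is replaced by its numeric ID (the ID becomes the new string
# key), added/special tokens are left untouched, and the result is written to
# a sibling ".mock.json" file.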
def export_mock_tokenizer():
    input_path = "20B_tokenizer_chinese.json"

    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    added_tokens = {token["id"] for token in tokenizer["added_tokens"]}

    # Re-key regular tokens by their ID; membership is checked against the
    # IDs of added tokens so special tokens keep their original strings.
    for k, v in copy.deepcopy(vocab).items():
        if v not in added_tokens:
            vocab[str(v)] = v
            vocab.pop(k)

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))


def mock2():
    pass
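

# Reload the exported mock file through the `tokenizers` library; this checks
# that the rewritten vocab still parses as a valid tokenizer.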
def load_mock_tokenizer():
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    print('')
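

# A small inspection helper, included as a sketch (`peek_mock_vocab` and the
# sample size are illustrative, not fixed by the script above): print the
# first few entries of the exported mock vocab to show that regular tokens are
# now keyed by their stringified IDs. Call it manually after
# export_mock_tokenizer() has produced the ".mock.json" file.
def peek_mock_vocab(path="20B_tokenizer_chinese.mock.json", n=5):
    with open(path, "r", encoding="utf-8") as f:
        vocab = json.load(f)["model"]["vocab"]
    for key, token_id in list(vocab.items())[:n]:
        print(repr(key), "->", token_id)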


if __name__ == "__main__":
    export_mock_tokenizer()
    load_mock_tokenizer()