import copy
import json

from tokenizers import Tokenizer


def export_mock_tokenizer():
    """Rewrite the vocab so each regular token's surface form is replaced
    by its id rendered as a string; added tokens are left untouched."""
    input_path = "20B_tokenizer_chinese.json"

    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    # Use a set for O(1) membership tests in the loop below.
    added_tokens = {token["id"] for token in tokenizer["added_tokens"]}

    # Iterate over a deep copy, since the dict is mutated while looping.
    for k, v in copy.deepcopy(vocab).items():
        if v not in added_tokens:
            vocab[str(v)] = v
            vocab.pop(k)

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
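

# A minimal toy illustration (assumed data, not taken from the real tokenizer
# file) of the vocab rewrite performed in export_mock_tokenizer() above.
def _demo_vocab_rewrite():
    vocab = {"hello": 123, "<|endoftext|>": 0}
    added_ids = {0}  # pretend id 0 is registered as an added token
    for k, v in copy.deepcopy(vocab).items():
        if v not in added_ids:
            vocab[str(v)] = v  # "hello" -> "123"
            vocab.pop(k)
    assert vocab == {"123": 123, "<|endoftext|>": 0}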


def mock2():
    pass  # placeholder, currently unused


def load_mock_tokenizer():
    # Sanity check: the rewritten file must still load as a valid tokenizer.
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    return tokenizer
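

# A sketch of how the mock tokenizer might be exercised once exported; the
# sample text and the _demo name are assumptions, but Tokenizer.encode() and
# Encoding.ids / Encoding.tokens are the standard `tokenizers` library API.
def _demo_mock_tokenizer():
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    encoding = tokenizer.encode("hello world")
    # Non-added tokens now render as id strings, e.g. "12092" instead of "hello".
    print(encoding.ids)
    print(encoding.tokens)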


if __name__ == "__main__":
    export_mock_tokenizer()
    load_mock_tokenizer()