import copy
import json

from tokenizers import Tokenizer


def export_mock_tokenizer():
    """Rewrite the vocab so each regular token's string key is replaced by
    its id rendered as a string, hiding the original token text while
    keeping the ids and overall file structure intact."""
    input_path = "20B_tokenizer_chinese.json"
    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    # Added (special) tokens keep their original string keys.
    added_token_ids = {token["id"] for token in tokenizer["added_tokens"]}

    # Iterate over a copy so the dict can be mutated during the loop.
    for token, token_id in copy.deepcopy(vocab).items():
        if token_id not in added_token_ids:
            # Pop before inserting so a token whose text already happens to
            # equal str(token_id) is not accidentally deleted.
            vocab.pop(token)
            vocab[str(token_id)] = token_id

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))


def load_mock_tokenizer():
    """Smoke test: the exported mock file must still load as a valid tokenizer."""
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    print(f"mock tokenizer loaded, vocab size = {tokenizer.get_vocab_size()}")
    return tokenizer


export_mock_tokenizer()
load_mock_tokenizer()
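
# --- Sanity check (illustrative, not part of the original script) ---
# A minimal sketch of how one might verify what export_mock_tokenizer()
# changed: the ids survive untouched, while each regular entry's key becomes
# its id rendered as a string. It assumes both JSON files are in the working
# directory; compare_vocabs() and load_vocab() are hypothetical helpers.


def compare_vocabs():
    def load_vocab(path):
        with open(path, encoding="utf-8") as f:
            return json.load(f)["model"]["vocab"]

    orig = load_vocab("20B_tokenizer_chinese.json")
    mock = load_vocab("20B_tokenizer_chinese.mock.json")

    # Same multiset of ids on both sides; only the keys were rewritten.
    assert sorted(orig.values()) == sorted(mock.values())

    # Count how many keys now hide the original token text.
    renamed = sum(1 for token_id in orig.values() if str(token_id) in mock)
    print(f"{renamed} of {len(orig)} vocab keys were replaced by id strings")


compare_vocabs()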