import json

# Build a "mock" tokenizer: replace every token string in the vocab with
# the decimal string of its id, leaving the ids themselves unchanged.
input_path = "20B_tokenizer_chinese.json"
with open(input_path, "r", encoding="utf-8") as f_in:
    tokenizer = json.load(f_in)

vocab = tokenizer["model"]["vocab"]
# Rebuild the vocab as {str(id): id}. Constructing a fresh dict (instead of
# popping keys while iterating over a copy) avoids clobbering entries when a
# token string such as "123" collides with another token's id.
tokenizer["model"]["vocab"] = {str(v): v for v in vocab.values()}

out_path = input_path.replace(".json", ".mock.json")
with open(out_path, "w", encoding="utf-8") as f_out:
    f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))
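
# Sanity check (an assumed usage step, not part of the original script):
# reload the mock file and verify every vocab key is now the stringified
# form of the id it maps to.
with open(out_path, "r", encoding="utf-8") as f_in:
    mock = json.load(f_in)

mock_vocab = mock["model"]["vocab"]
assert all(k == str(v) for k, v in mock_vocab.items())
print(f"{len(mock_vocab)} vocab entries rewritten, e.g. {list(mock_vocab.items())[:3]}")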