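The snippet below loads the vocabulary from 20B_tokenizer_chinese.v2.json, maps each suspect token id back to its token string, and prints "catch" whenever that string also appears inside a different (longer) vocabulary entry: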
import json

# Token ids under investigation.
error_tokens = [54611, 54612, 54613, 54614, 54615, 54616, 54617, 54618,
                54619, 54620, 54621, 54622, 54623, 54624, 54625, 54626,
                54627, 54628, 54629, 54630, 54631, 54632, 54633]

with open("20B_tokenizer_chinese.v2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

vocab = data["model"]["vocab"]                            # token string -> id
id2vocab = {idx: token for token, idx in vocab.items()}   # id -> token string

for token_id in error_tokens:
    token = id2vocab[token_id]
    # Flag the token if its string also occurs inside a different vocab entry.
    for tmp in vocab:
        if token in tmp and token != tmp:
            print("catch")
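Printing just "catch" does not say which entries matched or for which id. As a minimal variant under the same assumptions (same file on disk, same model.vocab JSON layout; containing_entries is a hypothetical helper, not part of any library), one could collect the matches per id:

import json

with open("20B_tokenizer_chinese.v2.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)["model"]["vocab"]   # token string -> id

id2vocab = {idx: token for token, idx in vocab.items()}

def containing_entries(token_id):
    # All vocab strings that properly contain this token's string.
    token = id2vocab[token_id]
    return [entry for entry in vocab if token in entry and token != entry]

for token_id in range(54611, 54634):            # the same ids as above
    matches = containing_entries(token_id)
    print(token_id, len(matches), matches[:3])  # show a few matches per id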