"""Collect Chinese tokens in the gpt_neox_chinese_v1 vocab that are missing
from the LLaMA vocab, and write them to append_zh.txt for later merging."""

from transformers import LlamaTokenizer

from utils.zh_util import is_chinese
from vocab.gpt_neox_chinese_v1 import tokenizer

llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
vocab = tokenizer.get_vocab()

with open("append_zh.txt", "w", encoding="utf-8") as f_out:
    for token_id in vocab.values():
        # Decode the id back to plain text; decoding also resolves
        # byte-level markers such as the GPT-2-style space prefix "Ġ".
        token = tokenizer.decode([token_id])
        # token = token.strip("Ġ")
        if len(token) < 1:
            continue
        # Keep tokens that start with a Chinese character and do not
        # already exist as a piece in the LLaMA vocab.
        if is_chinese(token[0]) and token not in llama_vocab:
            f_out.write(token + "\n")
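
# Note: utils.zh_util.is_chinese is imported above but its source is not
# part of this file. Below is a minimal sketch of what such a check
# typically does, assuming it tests membership in the CJK Unified
# Ideographs block (an assumption, not the actual implementation):
#
# def is_chinese(ch: str) -> bool:
#     """True if ch falls in the CJK Unified Ideographs range U+4E00-U+9FFF."""
#     return "\u4e00" <= ch <= "\u9fff"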