Spaces:
Running
Running
File size: 542 Bytes
0ce6477 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
"""Find Chinese tokens in the GPT-NeoX-Chinese vocab that LLaMA's vocab lacks.

Decodes every token id of the GPT-NeoX-Chinese tokenizer back to its surface
form, keeps tokens whose first character is Chinese, and writes those absent
from the LLaMA vocabulary to ``append_zh.txt`` (one token per line).
"""
from utils.zh_util import is_chinese
from transformers import LlamaTokenizer
from vocab.gpt_neox_chinese_v1 import tokenizer

# NOTE(review): assumes "../tokenizer" holds a saved LLaMA tokenizer — confirm path.
llama_vocab = LlamaTokenizer.from_pretrained("../tokenizer").get_vocab()
vocab = tokenizer.get_vocab()

# Context manager guarantees the output file is flushed and closed.
with open("append_zh.txt", "w", encoding="utf-8") as f_out:
    # Only the ids are needed: the raw vocab key is re-decoded to its surface
    # form (decoding strips byte-level markers such as "Ġ").
    for token_id in vocab.values():
        token = tokenizer.decode([token_id])
        if len(token) < 1:  # skip tokens that decode to the empty string
            continue
        # Keep tokens that begin with a Chinese character and are not already
        # present in the LLaMA vocabulary.
        if is_chinese(token[0]) and token not in llama_vocab:
            f_out.write(token + "\n")
|