mooncast / test /test_tokenizer.py
jzq11111's picture
Upload folder using huggingface_hub
a3e05e8 verified
import sys
sys.path.append('.')
from modules.tokenizer.tokenizer import get_tokenizer_and_extra_tokens
if __name__ == '__main__':
tokenizer, extra_tokens = get_tokenizer_and_extra_tokens()
assert tokenizer.encode("user") == [1495]
assert tokenizer.decode([1495]) == "user"
assert tokenizer.encode("0") == [501]
assert tokenizer.decode([501]) == "0"
assert tokenizer.encode("1") == [503]
assert tokenizer.decode([503]) == "1"
assert tokenizer.encode("assistant") == [110866]
assert tokenizer.decode([110866]) == "assistant"
assert tokenizer.encode("audio") == [26229]
assert tokenizer.decode([26229]) == "audio"
assert extra_tokens.msg_end == 260
assert extra_tokens.user_msg_start == 261
assert extra_tokens.assistant_msg_start == 262
assert extra_tokens.name_end == 272
assert extra_tokens.media_begin == 273
assert extra_tokens.media_content == 274
assert extra_tokens.media_end == 275
assert [tokenizer.convert_tokens_to_ids(i) for i in ['<0x0A>', '</s>', '[extra_id_0]']] == [14, 1, 260]
print("All tests passed!")