"""
https://huggingface.co/ClueAI/ChatYuan-large-v2
支持\n \t
- 英文编码很烂
为什么不直接编码\n \t,反而要过一套前处理和后处理?
"""
import json
from transformers import AutoTokenizer

def preprocess(text):
    """
    Escape \n and \t into the literal "\\n" / "\\t" pieces that exist in the
    vocabulary.
    """
    print("original text: ", text)
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    print("preprocessed text: ", text)
    return text

def postprocess(text):
    # Undo the escaping and map "%20" (URL-encoded space) back to a space.
    return text.replace("\\n", "\n").replace("\\t", "\t").replace('%20', ' ')
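
# Quick sanity check (illustrative sketch, not part of the model's own API):
# preprocess/postprocess should round-trip a plain string containing \n and \t.
sample = "a\nb\tc"
assert postprocess(preprocess(sample)) == sample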
model_dir = 'ChatYuan-large-v2'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
text = "中国\nabcde jump \tnice"
tokens = tokenizer.tokenize(text)
print(tokens)
# ['▁中国', '▁', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁n', 'ice']
print(tokenizer.tokenize(preprocess(text)))
# ['▁中国', '\\n', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁', '\\t', 'n', 'ice']
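
# End-to-end sketch (illustrative): encode the preprocessed text, then decode
# and postprocess to restore the real whitespace. Assumes default encode/decode
# settings; SentencePiece may still adjust spaces around the restored \n/\t.
ids = tokenizer.encode(preprocess(text))
restored = postprocess(tokenizer.decode(ids, skip_special_tokens=True))
print(restored)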
# Example token IDs (first sequence); decode them before they are overwritten
# by the second list below.
tokens = [12, 623, 5, 13409, 7, 51, 158, 5, 864, 93,
          3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56,
          4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456,
          84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38,
          6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6,
          7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34,
          12029, 22, 7, 51, 158, 5, 1]
print(tokenizer.decode(tokens))
# Example token IDs (second sequence).
tokens = [0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313,
          1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529,
          2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849,
          937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7,
          7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480,
          4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187,
          648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203,
          13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7,
          407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849,
          7, 7, 11726, 16, 11726, 7893, 42, 1]
print(tokenizer.decode(tokens))
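
# The raw decode still contains the literal "\\n"/"\\t" escapes; running it
# through postprocess restores real newlines/tabs (illustrative, using the
# helpers defined above).
print(postprocess(tokenizer.decode(tokens, skip_special_tokens=True)))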