# Spaces: Running
""" | |
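https://huggingface.co/ClueAI/ChatYuan-large-v2
Supports \n and \t.
- English text is encoded poorly.
Why not encode \n and \t directly, instead of going through a separate pre-processing and post-processing step?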
""" | |
import json | |
from transformers import AutoTokenizer | |
def preprocess(text):
    r"""Escape newlines and tabs as the literal two-character text ``\n`` / ``\t``.

    Each real newline character becomes a backslash followed by ``n`` and each
    real tab becomes a backslash followed by ``t``. The original note on this
    function said only "in the vocabulary" -- presumably the escaped forms are
    entries in the tokenizer vocabulary, while the raw control characters are
    not (TODO confirm against the tokenizer's vocab file).

    The input and the result are both printed for inspection.

    Args:
        text: The string to escape.

    Returns:
        ``text`` with every newline and tab replaced by its escaped form.
    """
    print("原文本: ", text)
    text = text.replace("\n", "\\n").replace("\t", "\\t")
    print("预处理后文本: ", text)
    return text
def postprocess(text):
    r"""Turn escaped ``\n`` / ``\t`` text back into real control characters.

    Replaces the two-character sequence backslash + ``n`` with a newline,
    backslash + ``t`` with a tab, and additionally the literal text ``%20``
    with a single space.

    Note that this is not a strict inverse of ``preprocess``: a backslash-n
    sequence or ``%20`` that was already present in the text is converted too.

    Args:
        text: The string to unescape.

    Returns:
        ``text`` with the three replacements applied, in that order.
    """
    return text.replace("\\n", "\n").replace("\\t", "\t").replace('%20', ' ')
# Path or identifier handed to AutoTokenizer.from_pretrained.
model_dir = 'ChatYuan-large-v2'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Sample containing a real newline and a real tab.
text = "中国\nabcde jump \tnice"

# Tokenize the raw text. In the recorded output below, the newline and tab do
# not show up as tokens of their own.
tokens = tokenizer.tokenize(text)
print(tokens)
# ['▁中国', '▁', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁n', 'ice']

# Tokenize after escaping. In the recorded output below, the escaped newline
# and tab each appear as a single token.
print(tokenizer.tokenize(preprocess(text)))
# ['▁中国', '\\n', 'ab', 'c', 'de', '▁', 'j', 'ump', '▁', '\\t', 'n', 'ice']

# An earlier sample of token ids. The original script assigned it to `tokens`
# and overwrote it before use, so it was never decoded; it is kept here under
# its own name for reference only.
first_sample_ids = [
    12, 623, 5, 13409, 7, 51, 158, 5, 864, 93,
    3, 1329, 14965, 3402, 188, 4, 7, 623, 5, 56,
    4464, 4, 7, 51, 158, 5, 1526, 158, 617, 1456,
    84, 1607, 10, 11442, 1456, 9938, 9, 12, 14, 38,
    6582, 2945, 2861, 3, 11779, 1074, 712, 1036, 167, 6,
    7, 623, 5, 9898, 513, 79, 26455, 489, 3, 34,
    12029, 22, 7, 51, 158, 5, 1,
]

# The token ids that are actually decoded and printed.
tokens = [
    0, 12, 14381, 10, 19849, 3, 7, 7, 34, 313,
    1344, 9017, 3, 276, 26455, 2258, 3, 578, 864, 529,
    2771, 874, 26455, 1442, 6, 7, 7, 26455, 9220, 19849,
    937, 16, 11726, 33, 11726, 52, 6, 7, 12, 7,
    7, 8353, 1036, 8093, 67, 276, 1036, 3338, 3, 480,
    4490, 30, 34, 1325, 6, 7, 2200, 53, 7321, 2187,
    648, 78, 7321, 2899, 25823, 6, 7, 2964, 3402, 1203,
    13, 537, 6, 7, 1660, 2795, 3402, 1203, 6, 7,
    407, 1802, 7, 7, 3095, 1477, 37, 7, 7, 19849,
    7, 7, 11726, 16, 11726, 7893, 42, 1,
]
print(tokenizer.decode(tokens))