|
""" |
|
## Dependency |
|
pip install emoji --upgrade |
|
|
|
## |
|
|
|
https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/bert_dict.py |
|
1. 更新langconv,新增: 余 吒 著 覆 |
|
2. |
|
3. 删除30个阿拉伯字母 (阿拉伯语从右向左书写) |
|
4. ok等字母 |
|
|
|
|
|
## TODO: |
|
1. ##~ 这样的词典可以删除,对应要修改tokenizer。 |
|
2. 是否要加入空格 [SPACE] 这样的特殊符号。 |
|
a) 还原问题: 比如 new balance这样的词汇,会被合并。 会吗? 分词后是 new bal ##ance --> new balance 也能完全还原啊。 |
|
b) 语义问题: 同时,在一定意义上也能起到语义隔离的作用,比如 "剑南春 水晶剑 52度 单瓶装高度白酒 750ml 口感浓香型" https://item.jd.com/100006659994.html |
|
[SEP] 也能work |
|
""" |
|
|
|
import codecs |
|
import sys |
|
import re |
|
from langconv import * |
|
import emoji |
|
|
|
|
|
|
|
|
|
# Compiled regex matching emoji sequences.
# NOTE(review): emoji.get_emoji_regexp() was removed in emoji>=2.0 — this
# script requires an older emoji release; confirm the pinned version.
emoji_regex = emoji.get_emoji_regexp()

# Hand-curated vocab entries deleted outright (layout/ad artifacts, stray
# symbol wordpieces, single latin letters, etc.).  Checked before every
# other rule in the main pass.
human_list = ['▲top', '▲topoct', '▲topmay', '▲topapr', '▲topmar', '▲topjun', '▲topdec', '▲topnov', '▲topaug', '▲topjul',
              '▲topjan', '▲topsep', '▲topfeb', '¥799', '¥2899', '~~', '~~~', '##~6', '##~10', '~10', '##~5', '~5',
              '##~20', '##~8', '##~17', '##~1', '~4', '##~3', '##~7', '~1', 'wedding', '×email', 'cp', '××', 'ok', 'a',
              'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
              'w', 'x', 'y', 'z', '##★', '##☆', '↓↓↓', '##●', '##♪', '▌♥', '##|',
              '##d', '##▲', '##o', '★★', '##→', '#a', '⋯⋯', '##▼', '##○', '★★★★★', '##∥', '##◆', '##ω', '★★★', '##c',
              '##s', '##e', '##p', '##■', '##↑', '##k', '##и', '◆◆', '##g', '##а', '±0', '##◎', '##─', '##r',
              '##>', '##t', '★★★★', '##│', '##n', '##l', '##=', '##y', '☆☆☆', '##i', '##↓', 'ˋ▽ˊ', '##v', '↓↓',
              '##f2016', '##q', '∟∣', '##я', '##←', '##◆◆', '##cm~', '##f', '##h', '##j', '##u', '##w',
              '##z']

# Bopomofo (zhuyin) phonetic symbols — dropped from the vocab.
# NOTE(review): 'ㆡ' appears twice in this list; harmless for membership
# tests, but probably a copy/paste slip.
zhuyin_char = ['ㄅ', 'ㄆ', 'ㆠ', 'ㄇ', 'ㄈ', 'ㄪ', 'ㄉ', 'ㄊ', 'ㄋ', 'ㆹ', 'ㄌ', 'ㄍ', 'ㄎ', 'ㆣ', 'ㄫ', 'ㄏ', 'ㆸ', 'ㄐ', 'ㄑ', 'ㆢ', 'ㄬ',
               'ㄒ', 'ㆺ', 'ㄓ', 'ㄔ', 'ㄕ', 'ㄖ', 'ㄗ', 'ㄘ', 'ㆡ', 'ㄙ', 'ㆡ', 'ㆪ', 'ㄨ', 'ㆫ', 'ㆨ', 'ㄩ', 'ㄚ', 'ㆩ', 'ㆦ', 'ㆧ', 'ㄛ',
               'ㄜ', 'ㄝ', 'ㆤ', 'ㆥ', 'ㄞ', 'ㆮ', 'ㄟ', 'ㄠ', 'ㆯ', 'ㄡ', 'ㆰ', 'ㆱ', 'ㆬ', 'ㄢ', 'ㄣ', 'ㄯ', 'ㄤ', 'ㆲ', 'ㄥ', 'ㆭ', 'ㄦ',
               'ㄭ']

# BERT special tokens — always kept verbatim.
special_token = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '<S>', '<T>']

# Extra Japanese kana tokens to drop.
# NOTE(review): ordinary katakana should already be caught by the kana-range
# regex earlier in the chain; verify whether these are lookalike codepoints.
japan_chars = ['イ', 'ク', 'シ', 'ス', 'ト', 'ノ', 'フ', 'ラ', 'ル', 'ン']

# Korean jamo to drop — these codepoints fall outside the Hangul-syllable
# range ([\uac00-\ud7ff]) used by the regex branch, hence the explicit list.
korean_chars = ['ᄀ', 'ᄁ', 'ᄂ', 'ᄃ', 'ᄅ', 'ᄆ', 'ᄇ', 'ᄈ', 'ᄉ', 'ᄋ', 'ᄌ', 'ᄎ', 'ᄏ', 'ᄐ', 'ᄑ', 'ᄒ', 'ᅡ', 'ᅢ', 'ᅣ', 'ᅥ', 'ᅦ',
                'ᅧ', 'ᅨ', 'ᅩ', 'ᅪ', 'ᅬ', 'ᅭ', 'ᅮ', 'ᅯ', 'ᅲ', 'ᅳ', 'ᅴ', 'ᅵ', 'ᆨ', 'ᆫ', 'ᆯ', 'ᆷ', 'ᆸ', 'ᆺ', 'ᆻ', 'ᆼ', 'ᗜ']

# Typographic punctuation spliced into the output right after the '"' token.
add_puns = ['”', '“', '—', '–', '…', '’', '‘']

# Rare/uncommon Chinese characters spliced in right after the '龟' token.
add_cn_chars = [char for char in '呡乾绗楦硌袢钕蕞癀皲貉唛笕椴―胗旯鳙鲇鳐鳜鲅鳊鲳鲽鲣枞炝醅馊捯抻绉馐饧莜嘬腘肫鳟镊犽洌蝰铱' \
                                '髌锃镲锗甑戗裥弎粝霂猄轱苎偲兿铷栢帏黢洇沄誊忸怩蚬籺氚犇锒鸩噘偾髫']

# Multi-digit numbers re-added right after the single-digit '9' token
# (the main pass otherwise keeps only single digits).
add_nums = ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27',
            '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45',
            '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63',
            '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81',
            '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99',
            '100', '120', '128', '180', '200', '256', '304', '360', '500', '512', '1000', '1080', '2000', '2014',
            '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']

# Chinese punctuation that q2b() leaves untouched when skip_cn_punc=True.
cn_punc = ',。;:?!()~|'
|
def q2b(uchar, skip_cn_punc=False):
    """Convert one full-width character to its half-width form.

    Args:
        uchar: a single character.
        skip_cn_punc: when True, Chinese punctuation listed in the
            module-level ``cn_punc`` is returned unchanged.

    Returns:
        The half-width equivalent, or ``uchar`` itself when no mapping
        applies.
    """
    if skip_cn_punc and uchar in cn_punc:
        return uchar
    code_point = ord(uchar)
    if code_point == 0x3000:
        # Ideographic (full-width) space maps to a plain ASCII space.
        return chr(0x20)
    if 0xFF01 <= code_point <= 0xFF5E:
        # Full-width ASCII block: shift down into the ASCII range.
        return chr(code_point - 0xFEE0)
    return uchar
|
|
|
def str_q2b(ustring, skip_cn_punc=False):
    """Convert every full-width character in *ustring* to half-width.

    Applies :func:`q2b` character by character; ``skip_cn_punc`` is passed
    through so Chinese punctuation can be preserved.
    """
    converted = (q2b(ch, skip_cn_punc) for ch in ustring)
    return ''.join(converted)
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main filtering pass: read Google's BERT vocab line by line, drop unwanted
# tokens (traditional Chinese, Korean, Japanese, Arabic, emoji, bopomofo,
# multi-digit numbers, hand-curated junk), write survivors to the new vocab,
# and splice in extra punctuation / numbers / rare Chinese characters at
# fixed anchor tokens.  Summary counts go to stdout.
# ---------------------------------------------------------------------------

# Character-class patterns, compiled once and matched from the start of the
# token (re.match semantics, same as the original inline patterns).  Raw
# strings are used so escapes such as \d are not treated as (invalid)
# string escape sequences — the original u'(##)?\d' raises a
# DeprecationWarning (SyntaxWarning on Python 3.12+).
cjk_pattern = re.compile(r'[\u4e00-\u9fa5]+')                    # CJK ideographs
korean_pattern = re.compile(r'[\uac00-\ud7ff]+')                 # Hangul syllables
japanese_pattern = re.compile(r'[\u30a0-\u30ff\u3040-\u309f]+')  # kana
# Arabic script blocks; the original listed \ufd50-\ufd8f twice, which is
# deduplicated here (identical character set).
arabic_pattern = re.compile(r'[\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1'
                            r'\ufbd3-\ufd3f\ufd50-\ufd8f\ufe70-\ufefc'
                            r'\ufdf0-\ufdfd]+')
latin_pattern = re.compile(r'[a-z]+')                            # lower-case latin
digit_pattern = re.compile(r'(##)?\d')                           # digit, maybe wordpiece

with open('vocab.google.txt', 'r', encoding='utf-8') as fin, \
        open('vocab.jd.txt.v2', 'w', encoding='utf-8') as fout:
    # Per-category counters, printed as a summary at the end.
    cout_zh = 0        # Chinese tokens seen
    cout_en = 0        # English tokens kept
    cout_em = 0        # emoji dropped
    cout_zh_res = 0    # Chinese tokens kept
    cout_zh_tra = 0    # traditional-Chinese tokens dropped
    cout_zh_wp = 0     # Chinese wordpieces (##x) dropped
    cout_en_del = 0    # never incremented; kept so the summary is unchanged
    cout_en_res = 0    # never incremented; kept so the summary is unchanged
    cout_num = 0       # numeric tokens seen
    cout_num_del = 0   # multi-digit numeric tokens dropped
    cout_num_res = 0   # numeric tokens kept (incl. add_nums splice)
    cout_hand_del = 0  # tokens dropped via human_list
    cout_total = 0     # input lines read
    cout_zhuyin = 0    # bopomofo tokens dropped
    cout_unused = 0    # [unusedNN] placeholders kept
    cout_special = 0   # special tokens kept
    cout_jp = 0        # Japanese tokens dropped (original initialized this twice)
    cout_ko = 0        # Korean tokens dropped
    cout_ar = 0        # Arabic tokens dropped

    for line in fin:
        cout_total += 1
        token = line.strip()
        if not token:
            continue

        if token in ['|']:
            print(token)  # debug: surface the bare pipe token on stdout

        # Token with all wordpiece markers removed; the classification
        # rules below mostly look at the bare characters.
        bare = token.replace('##', '')

        if token in human_list:
            # Hand-curated junk: delete.
            cout_hand_del += 1
            continue
        elif cjk_pattern.match(bare):
            cout_zh += 1
            # Keep only simplified Chinese; anything langconv rewrites is
            # a traditional form and gets dropped.
            token_simp = Converter('zh-hans').convert(token)
            if token_simp != token:
                cout_zh_tra += 1
                continue
            elif token.startswith('##'):
                # Chinese wordpieces are dropped (see module TODO).
                cout_zh_wp += 1
                continue
            else:
                cout_zh_res += 1
                print(token, file=fout)
        elif korean_pattern.match(bare):
            cout_ko += 1
            continue
        elif japanese_pattern.match(bare):
            cout_jp += 1
            continue
        elif arabic_pattern.match(bare):
            cout_ar += 1
            continue
        elif latin_pattern.match(bare):
            # English (lower-case latin) tokens are kept.
            cout_en += 1
            print(token, file=fout)
            continue
        elif str_q2b(token, skip_cn_punc=True) != token:
            # Token contains full-width characters: report and drop.
            print(token, '--', str_q2b(token, skip_cn_punc=True))
            continue
        elif emoji_regex.match(bare):
            cout_em += 1
            continue
        elif digit_pattern.match(token):
            cout_num += 1
            if len(bare) == 1:
                # Single digits (and their wordpiece forms) survive.
                cout_num_res += 1
                print(token, file=fout)
            else:
                cout_num_del += 1
            continue
        elif bare in zhuyin_char:
            cout_zhuyin += 1
            continue
        elif token.startswith('[unused'):
            print(token, file=fout)
            cout_unused += 1
        elif token in special_token:
            print(token, file=fout)
            cout_special += 1
        elif bare in japan_chars:
            cout_jp += 1
            continue
        elif bare in korean_chars:
            cout_ko += 1
            continue
        else:
            # Everything unclassified survives unchanged.
            print(token, file=fout)

        # Anchor tokens: immediately after these vocab entries, splice in
        # the extra entries.  Dedicated loop variables avoid rebinding
        # `token` (the original reused the name, relying on the anchors
        # being mutually exclusive).
        if token == '"':
            for pun in add_puns:
                print(pun, file=fout)
        if token == '9':
            for num in add_nums:
                cout_num_res += 1
                print(num, file=fout)
        if token == '龟':
            for cn_char in add_cn_chars:
                print(cn_char, file=fout)

# Summary report (format and order preserved).
print("cout_zh:{}".format(cout_zh))
print("cout_zh_tra:{}".format(cout_zh_tra))
print("cout_zh_wp:{}".format(cout_zh_wp))
print("cout_zh_res:{}".format(cout_zh_res))
print("cout_en:{}".format(cout_en))
print("cout_en_del:{}".format(cout_en_del))
print("cout_en_res:{}".format(cout_en_res))
print("cout_num:{}".format(cout_num))
print("cout_num_del:{}".format(cout_num_del))
print("cout_num_res:{}".format(cout_num_res))
print("cout_hand_del:{}".format(cout_hand_del))
print("cout_zhuyin:{}".format(cout_zhuyin))
print("cout_unused:{}".format(cout_unused))
print("cout_special:{}".format(cout_special))
print("cout_jp:{}".format(cout_jp))
print("cout_ko:{}".format(cout_ko))
print("cout_ar:{}".format(cout_ar))
print("cout_em:{}".format(cout_em))
|
|