tokenizer-arena / utils /text_util.py
xu-song's picture
update
751936e
raw
history blame
308 Bytes
def is_chinese(uchar):
    """Return True when *uchar* lies in the range U+4E00..U+9FA5 (inclusive).

    The check is an ordinary string comparison against the two boundary
    characters, so a string longer than one character is compared
    lexicographically rather than character by character.

    Reference pattern used by jieba:
    https://github.com/fxsjy/jieba/blob/master/jieba/__init__.py#L48
    (a character class spanning U+4E00 to U+9FD5).

    NOTE(review): the upper bound here is U+9FA5 while the referenced
    pattern ends at U+9FD5 -- confirm which bound is intended.
    """
    range_start = u'\u4e00'
    range_end = u'\u9fa5'

    # Anything sorting before the first code point of the range is rejected.
    if not range_start <= uchar:
        return False
    return uchar <= range_end
def has_chinese(text):
    """Return True if any character of *text* passes ``is_chinese``.

    Iteration stops at the first matching character; empty input yields
    False.
    """
    for char in text:
        if is_chinese(char):
            return True
    return False