Spaces:
Sleeping
Sleeping
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import re | |
import unicodedata | |
import six | |
class Character(object):
    """Single-character classification helpers.

    Every predicate expects a one-character string and returns a bool.
    The predicates that call ``ord`` raise ``TypeError`` for strings whose
    length is not exactly 1.
    """

    # Feature-name constants; each one matches the name of a predicate below.
    f_unknown = 'unknown'
    f_is_alnum = 'is_alnum'
    f_is_alpha = 'is_alpha'
    f_is_num = 'is_num'
    f_is_space = 'is_space'
    f_is_hyphens = 'is_hyphens'
    f_is_punctuation = 'is_punctuation'
    f_is_cjk_character = 'is_cjk_character'
    f_is_jap_character = 'is_jap_character'
    f_is_russian_character = 'is_russian_character'

    @classmethod
    def is_alnum(cls, ch: str):
        """Return True for a non-CJK alphanumeric character.

        ``str.isalnum()`` alone reports True for Chinese characters, so CJK
        ideographs are excluded first.
        """
        return not cls.is_cjk_character(ch) and ch.isalnum()

    @classmethod
    def is_alpha(cls, ch: str):
        """Return True for a non-CJK alphabetic character."""
        return not cls.is_cjk_character(ch) and ch.isalpha()

    @staticmethod
    def is_control(ch):
        """Return True for control/format characters (categories Cc, Cf).

        Tab, newline and carriage return are deliberately not treated as
        control characters.
        """
        if ch in ('\t', '\n', '\r'):
            return False
        return unicodedata.category(ch) in ("Cc", "Cf")

    @classmethod
    def is_num(cls, ch: str):
        """Return True for a non-CJK character accepted by ``str.isdigit``."""
        return not cls.is_cjk_character(ch) and ch.isdigit()

    @classmethod
    def is_space(cls, ch):
        """Return True for space, tab, newline, CR or any Unicode ``Zs``."""
        return ch in (" ", '\n', '\r', '\t') or unicodedata.category(ch) == 'Zs'

    @classmethod
    def is_hyphens(cls, ch):
        """Return True for ``+`` (code 43) or ``-`` (code 45)."""
        return ord(ch) in (43, 45)

    @classmethod
    def is_punctuation(cls, ch):
        """Return True for punctuation, half-width or full-width.

        Covers the ASCII symbol ranges 33-47, 58-64, 91-96 and 123-126 as
        well as every character whose Unicode category starts with ``P``.
        """
        code = ord(ch)
        return (
            33 <= code <= 47
            or 58 <= code <= 64
            or 91 <= code <= 96
            or 123 <= code <= 126
            or unicodedata.category(ch).startswith("P")
        )

    @classmethod
    def is_cjk_character(cls, ch):
        """Return True for CJK ideographs (Chinese characters included).

        Reference:
        https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        """
        code = ord(ch)
        return (
            0x4E00 <= code <= 0x9FFF
            or 0x3400 <= code <= 0x4DBF
            or 0x20000 <= code <= 0x2A6DF
            or 0x2A700 <= code <= 0x2B73F
            or 0x2B740 <= code <= 0x2B81F
            or 0x2B820 <= code <= 0x2CEAF
            or 0xF900 <= code <= 0xFAFF
            or 0x2F800 <= code <= 0x2FA1F
        )

    @classmethod
    def is_jap_character(cls, ch):
        """Return True for code points U+3040-U+30FF or U+31F0-U+31FF."""
        code = ord(ch)
        return (
            0x3040 <= code <= 0x309F
            or 0x30A0 <= code <= 0x30FF
            or 0x31F0 <= code <= 0x31FF
        )

    @classmethod
    def is_russian_character(cls, ch):
        """Return True for code points 1040-1104 (U+0410-U+0450)."""
        return 1040 <= ord(ch) <= 1104
def convert_to_unicode(text):
    """Convert `text` to a unicode ``str``, assuming UTF-8 for byte input.

    This file relies on Python 3 only syntax (f-strings, annotations), so
    the former Python 2 branch, which referenced the undefined name
    ``unicode``, could never run and has been removed.

    :param text: ``str`` (returned unchanged) or ``bytes`` (decoded as UTF-8;
        undecodable bytes are dropped).
    :return: the text as ``str``.
    :raises ValueError: if `text` is neither ``str`` nor ``bytes``.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        # "ignore" silently drops byte sequences that are not valid UTF-8.
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
class LowerCase(object):
    """Length-preserving lower-casing plus folding of look-alike letters."""

    # Maps accented / easily confused letters to a canonical base letter.
    confuse_map = {
        # Russian (Cyrillic)
        'Й': 'И',
        'й': 'и',
        'ѐ': 'е',
        'ё': 'е',
        'ѓ': 'г',
        'ї': 'і',
        # Spanish
        'á': 'a',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ü': 'u',
        'ñ': 'n',
    }

    @classmethod
    def lowercase(cls, string):
        """Lower-case `string` and fold confusable letters.

        The result always has the same number of characters as
        ``str(string)``.

        Two defects of the previous implementation are fixed here:

        * It shifted code points 1040..1072 by 32 *after* ``str.lower()``.
          At that point the only character left in the range is the
          lowercase Cyrillic 'а' (1072), which was turned into 'ѐ' and then
          folded to 'е'. ``str.lower()`` already handles Cyrillic, so the
          manual shift is gone.
        * Its length check compared the lowered string with itself and could
          never fire, while ``str.lower()`` can change the length
          (e.g. U+0130). Such characters are now left unchanged.

        :param string: any object; it is converted with ``str()`` first.
        :return: the lowered, folded string.
        """
        text = str(string)
        lowered = text.lower()
        if len(lowered) != len(text):
            # Whole-string lowering changed the length: lower one character
            # at a time and keep any character whose lowercase form is not
            # exactly one code point.
            pieces = []
            for ch in text:
                low = ch.lower()
                pieces.append(low if len(low) == 1 else ch)
            lowered = ''.join(pieces)

        confuse_map = cls.confuse_map
        return ''.join(confuse_map.get(ch, ch) for ch in lowered)
class Pattern(object):
    r"""
    Regular-expression source strings shared across the project.

    Syntax reminder:
        \d       any digit, equivalent to [0-9].
        \s       any whitespace character, equivalent to [\t\n\r\f]
                 plus the space character.
        re{n,m}  n to m repetitions of the preceding expression, greedy.
    """

    # --- character-class extraction -------------------------------------
    # Runs of ASCII letters, digits and Chinese characters.
    alp_num_ch = r'[A-Z0-9a-z\u4e00-\u9fa5]+'
    # Either one non-alphanumeric character or a run of alphanumerics;
    # used with ' '.join() to separate alphanumerics from everything else.
    alp_num_or_others = r'[^A-Z0-9a-z]|[A-Z0-9a-z]+'
    # One or more whitespace characters.
    any_blanks = r'\s+'

    # --- bracketed spans (non-greedy) -----------------------------------
    # Text enclosed in round brackets.
    brackets = r'\(.*?\)'
    # Text enclosed in square brackets.
    square_brackets = r'\[.*?\]'

    # --- domain-specific patterns ---------------------------------------
    # Alternation of three literal Chinese names.
    hw_ry_xy = r'华为|荣耀|小艺'
    # A letter plus 1-2 digits, whitespace, then 'p' plus 1-2 digits.
    p_pattern = r'[a-z]\d{1,2}\s+p\d{1,2}'
    # Same idea with capture groups: (letters + digits) and (p + digits).
    pro_pattern = r'([a-z]+\s*\d{1,2})\s+(p\d{1,2})'

    # Finds escaped class tokens such as `\d+` (e.g. to strip `\d+` from
    # `\d+左右`); used when building the regex index.
    regex_dsw_find = r'\\\\[dDsSwW][\+\*]?'
class ValidPeriod(object):
    """Validity-period extraction: turns phrases such as '不到十天' into days.

    A pattern is built from labelled sub-expressions of the form
    ``?<label>alternatives``. Before matching, the labels are collected in
    order (``get_pattern_label``) and stripped (``clean_pattern_label``), so
    the i-th capture group of the cleaned pattern corresponds to the i-th
    label.
    """

    # Label names attached to the sub-expressions below.
    l_compare_lt = 'l_compare_lt'
    l_compare_gt = 'l_compare_gt'
    l_time = 'l_time'
    l_time_unit = 'l_time_unit'

    # Every sub-expression (shaped like `?<label>pattern`) carries one label.
    l_compare_lt_prefix_regex = rf'?<{l_compare_lt}>不超过|没到|不到|少于'
    l_compare_gt_prefix_regex = rf'?<{l_compare_gt}>超|超过|超过了|大于|不止'
    l_compare_lt_suffix_regex = rf'?<{l_compare_lt}>没到|不到|内|以内|之内'
    l_compare_gt_suffix_regex = rf'?<{l_compare_gt}>以上|不止'
    l_time_regex = rf'?<{l_time}>[两|壹|零|一|二|三|四|五|六|七|八|九|十|百|千\d]+'
    l_time_unit_regex = rf'?<{l_time_unit}>(?:个)?年|个月|周|天|日|星期|个星期'

    # Recognises: optional comparison prefix, amount, unit, optional
    # comparison suffix (e.g. '不到十天', '一个星期', '超过了七天', '不止5天').
    pattern1 = rf'(?:({l_compare_lt_prefix_regex})|({l_compare_gt_prefix_regex}))?\s*({l_time_regex})\s*({l_time_unit_regex})\s*(?:({l_compare_lt_suffix_regex})|({l_compare_gt_suffix_regex}))?'
    # A date-style pattern (e.g. '上个月5号', '这个月14日') is not implemented.

    def __init__(self):
        pass

    @staticmethod
    def demo1():
        """Parse one sample phrase and print the result.

        Phrases the current pattern targets:
            一个星期, 超七天后, 七日内, 第七天, 超过了七天, 不止5天
        Phrases seen in labelled data that may need handling later:
            刚买2天, 昨天取的, 昨天到货, 签收后的第二天, 签收后七天内,
            前两天, 货还没发, 用了几天
        """
        string = "5天不止"
        ret = ValidPeriod.valid_period_parse(string)
        print(ret)
        return

    @staticmethod
    def time_convert(time_string: str):
        """Convert a Chinese/Arabic numeral string to an int.

        Digits (Arabic or Chinese) accumulate in a buffer; a unit character
        (十/百/千) multiplies the buffered value, or counts as 1x when the
        buffer is empty, and flushes it. Characters that are neither digits
        nor units are ignored.

        Fixes relative to the previous version:
        * inputs ending in a unit ('十', '七十') raised ValueError because
          the empty buffer was passed to ``int``;
        * a unit after a multi-digit buffer was silently dropped
          ('一千零五十' gave 1005);
        * a leftover debug ``print`` was removed.

        :param time_string: numeral text such as '十五', '一百二十三', '15'.
        :return: the integer value; 0 for an empty string.
        """
        base_num_dict = {
            '十': 10,
            '百': 100,
            '千': 1000,
        }
        d = {
            '壹': 1,
            '两': 2,
            '零': 0,
            '一': 1,
            '二': 2,
            '三': 3,
            '四': 4,
            '五': 5,
            '六': 6,
            '七': 7,
            '八': 8,
            '九': 9,
        }
        result = 0
        tmp = ''
        for c in time_string:
            if c.isdecimal():
                tmp += c
                continue
            base_num = base_num_dict.get(c)
            if base_num is None:
                # Chinese digit -> its decimal digit; anything else -> ''.
                tmp += str(d.get(c, ''))
                continue
            # Unit character: apply it to the buffered multiplier, if any.
            result += base_num * int(tmp) if tmp else base_num
            tmp = ''
        if tmp:
            # Trailing digits with no unit are the ones place.
            result += int(tmp)
        return result

    @staticmethod
    def time_unit_convert(time_unit_string: str):
        """Return the number of days in one `time_unit_string`.

        Unknown units fall back to 1. '个年' is included because the unit
        regex ``(?:个)?年`` can produce it.
        """
        d = {
            '天': 1,
            '日': 1,
            '周': 7,
            '星期': 7,
            '个星期': 7,
            '个月': 30,
            '年': 365,
            '个年': 365,
        }
        return d.get(time_unit_string, 1)

    @staticmethod
    def get_pattern_label(pattern: str):
        """Return the label names found in `pattern`, in order of appearance.

        :param pattern: a pattern containing ``?<label>`` markers.
        :return: list of label-name strings.
        """
        pattern_inner = re.compile(r'\?<(.*?)>')
        return pattern_inner.findall(pattern)

    @staticmethod
    def clean_pattern_label(pattern: str):
        """Return `pattern` with every ``?<label>`` marker removed."""
        pattern_inner = re.compile(r'\?<.*?>')
        return pattern_inner.sub('', pattern)

    @classmethod
    def valid_period_parse(cls, string: str) -> tuple:
        """Parse `string` with ``cls.pattern1``.

        :return: ``(days, detail)``; ``days`` is None when nothing matched.
            ``detail`` holds 'main_time', 'scale' and 'bias'.
        """
        label_name_list, label_string_list = cls._search_label_list(string, cls.pattern1)
        return cls._estimate_days(label_name_list, label_string_list)

    @classmethod
    def _estimate_days(cls, label_name_list, label_string_list) -> tuple:
        """Combine matched labels into a day count.

        ``days = main_time * scale + bias``. A "greater than" label sets
        bias to 1; a "less than" label currently leaves bias at 0. When
        either list is empty the day count is reported as None.
        """
        bias = 0
        scale = 1
        main_time = 0
        for label_name, label_string in zip(label_name_list, label_string_list):
            if label_name == cls.l_compare_gt:
                bias = 1
            elif label_name == cls.l_time:
                main_time += cls.time_convert(label_string)
            elif label_name == cls.l_time_unit:
                scale = cls.time_unit_convert(label_string)
            # l_compare_lt and unknown labels do not affect the estimate.

        detail = {
            'main_time': main_time,
            'scale': scale,
            'bias': bias
        }
        if not label_name_list or not label_string_list:
            return None, detail
        return main_time * scale + bias, detail

    @classmethod
    def _search_label_list(cls, string: str, pattern: str) -> tuple:
        """Search `string` and pair each matched group with its label.

        :return: ``(label_names, label_strings)`` restricted to the groups
            that took part in the match; two empty lists when there is no
            match.
        """
        label_name_list = cls.get_pattern_label(pattern)
        match = re.search(pattern=cls.clean_pattern_label(pattern), string=string)
        if match is None:
            return list(), list()

        new_label_name_list, new_label_string_list = list(), list()
        for label_name, label_string in zip(label_name_list, match.groups()):
            if label_string is None:
                continue
            new_label_name_list.append(label_name)
            new_label_string_list.append(label_string)
        return new_label_name_list, new_label_string_list