#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Text helpers: character classification, lowercase normalisation, shared
regex constants and a small parser for Chinese "valid period" expressions."""
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

try:
    # `six` is no longer used by this module (the file is Python-3-only: it
    # relies on f-strings and annotations). The import is kept, guarded, only
    # so the module-level name stays available to any existing importer.
    import six  # noqa: F401
except ImportError:  # pragma: no cover
    six = None


class Character(object):
    """Single-character classification helpers.

    Every ``is_*`` predicate expects a string of length 1; most of them call
    ``ord`` / ``unicodedata.category`` and therefore raise ``TypeError`` for
    any other length.
    """

    # Feature-name constants, one per predicate below.
    f_unknown = 'unknown'
    f_is_alnum = 'is_alnum'
    f_is_alpha = 'is_alpha'
    f_is_num = 'is_num'
    f_is_space = 'is_space'
    f_is_hyphens = 'is_hyphens'
    f_is_punctuation = 'is_punctuation'
    f_is_cjk_character = 'is_cjk_character'
    f_is_jap_character = 'is_jap_character'
    f_is_russian_character = 'is_russian_character'

    @classmethod
    def is_alnum(cls, ch: str) -> bool:
        """Return True for letters/digits that are not CJK ideographs.

        ``str.isalnum()`` alone reports True for Chinese characters, hence
        the explicit CJK exclusion.
        """
        return not cls.is_cjk_character(ch) and ch.isalnum()

    @classmethod
    def is_alpha(cls, ch: str) -> bool:
        """Return True for alphabetic characters that are not CJK ideographs."""
        return not cls.is_cjk_character(ch) and ch.isalpha()

    @staticmethod
    def is_control(ch) -> bool:
        """Return True for control/format characters (categories Cc, Cf).

        Tab, line feed and carriage return are deliberately not reported as
        control characters.
        """
        if ch in ('\t', '\n', '\r'):
            return False
        return unicodedata.category(ch) in ("Cc", "Cf")

    @classmethod
    def is_num(cls, ch: str) -> bool:
        """Return True when ``ch.isdigit()`` holds and ``ch`` is not CJK."""
        return not cls.is_cjk_character(ch) and ch.isdigit()

    @classmethod
    def is_space(cls, ch) -> bool:
        """Return True for space, tab, CR, LF or any Unicode ``Zs`` separator."""
        return ch in (" ", '\n', '\r', '\t') or unicodedata.category(ch) == 'Zs'

    @classmethod
    def is_hyphens(cls, ch) -> bool:
        """Return True for ``+`` (code point 43) or ``-`` (code point 45)."""
        return ord(ch) in (43, 45)

    @classmethod
    def is_punctuation(cls, ch) -> bool:
        """Return True for punctuation, half-width and full-width alike.

        All non-alphanumeric printable ASCII symbols count as punctuation,
        together with every character whose Unicode category starts with "P".
        """
        code = ord(ch)
        is_ascii_symbol = (
            33 <= code <= 47
            or 58 <= code <= 64
            or 91 <= code <= 96
            or 123 <= code <= 126
        )
        return is_ascii_symbol or unicodedata.category(ch).startswith("P")

    @classmethod
    def is_cjk_character(cls, ch) -> bool:
        """Return True for CJK ideographs (Chinese characters included).

        Reference:
        https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        """
        code = ord(ch)
        return (
            0x4E00 <= code <= 0x9FFF
            or 0x3400 <= code <= 0x4DBF
            or 0x20000 <= code <= 0x2A6DF
            or 0x2A700 <= code <= 0x2B73F
            or 0x2B740 <= code <= 0x2B81F
            or 0x2B820 <= code <= 0x2CEAF
            or 0xF900 <= code <= 0xFAFF
            or 0x2F800 <= code <= 0x2FA1F
        )

    @classmethod
    def is_jap_character(cls, ch) -> bool:
        """Return True for code points in 0x3040-0x30FF or 0x31F0-0x31FF."""
        code = ord(ch)
        return (
            0x3040 <= code <= 0x309F
            or 0x30A0 <= code <= 0x30FF
            or 0x31F0 <= code <= 0x31FF
        )

    @classmethod
    def is_russian_character(cls, ch) -> bool:
        """Return True for code points 1040..1104 (U+0410..U+0450)."""
        # NOTE(review): this range leaves out U+0401 and U+0451 (the letter
        # "yo") - confirm whether that is intended.
        return 1040 <= ord(ch) <= 1104

    @staticmethod
    def convert_to_unicode(text) -> str:
        """Convert `text` to ``str`` (if it's not already), assuming utf-8 input.

        :param text: ``str`` or ``bytes``.
        :return: ``text`` itself for ``str``; for ``bytes`` the utf-8 decoding
            with undecodable bytes dropped.
        :raises ValueError: for any other type.
        """
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))


class LowerCase(object):
    """Length-preserving lowercase conversion with look-alike folding."""

    # Easily confused characters and the plain character they fold to.
    confuse_map = {
        # Russian
        'Й': 'И',
        'й': 'и',
        'ѐ': 'е',
        'ё': 'е',
        'ѓ': 'г',
        'ї': 'і',
        # Spanish
        'á': 'a',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ü': 'u',
        'ñ': 'n',
    }

    @classmethod
    def lowercase(cls, string):
        """Lowercase `string` and fold `confuse_map` characters.

        The result always has exactly as many characters as ``str(string)``.

        :param string: any object; it is converted with ``str()`` first.
        :return: the lowercased, folded string.
        :raises AssertionError: if the character count would change (only
            possible if `confuse_map` gains a multi-character value).
        """
        original = str(string)
        lowered = original.lower()
        if len(lowered) != len(original):
            # A few characters lowercase to several code points (e.g. U+0130).
            # Redo the conversion per character and keep such characters
            # unchanged so that positions stay aligned with the input.
            lowered = ''.join(
                low if len(low) == 1 else ch
                for ch, low in ((ch, ch.lower()) for ch in original)
            )

        # str.lower() already handles Cyrillic, so no manual code-point shift
        # is needed here (the former `+32` shift corrupted U+0430).
        result = ''.join(cls.confuse_map.get(ch, ch) for ch in lowered)

        if len(original) != len(result):
            raise AssertionError('this method should not change the char num. '
                                 'string: {}, result: {}'.format(string, result))
        return result


class Pattern(object):
    r"""Shared regular-expression snippets.

    Regex reminders:
    \d      matches any digit, equivalent to [0-9].
    \s      matches any whitespace character ([\t\n\r\f] and the space).
    re{n,m} matches the preceding expression n to m times, greedily.
    """
    alp_num_ch = r'[A-Z0-9a-z\u4e00-\u9fa5]+'  # runs of Chinese characters, digits and letters
    alp_num_or_others = r'[^A-Z0-9a-z]|[A-Z0-9a-z]+'  # used with ' '.join() to separate alphanumeric runs from other characters
    brackets = r'\(.*?\)'  # parenthesised text
    hw_ry_xy = r'华为|荣耀|小艺'
    p_pattern = r'[a-z]\d{1,2}\s+p\d{1,2}'
    pro_pattern = r'([a-z]+\s*\d{1,2})\s+(p\d{1,2})'
    any_blanks = r'\s+'
    square_brackets = r'\[.*?\]'  # square-bracketed text

    # Finds shorthand classes such as the `\d+` in `\d+左右` so they can be
    # removed; used when building the regex index.
    # NOTE(review): as written this matches TWO literal backslashes before the
    # class letter - confirm the input is escaped that way.
    regex_dsw_find = r'\\\\[dDsSwW][\+\*]?'


class ValidPeriod(object):
    """Parser for "valid period" phrases such as "within seven days".

    Each sub-pattern below starts with a ``?<label>`` marker. The markers are
    not Python regex syntax: `get_pattern_label` reads them (in order) and
    `clean_pattern_label` removes them before matching, so that the n-th
    capturing group of the cleaned pattern belongs to the n-th label.
    """
    l_compare_lt = 'l_compare_lt'
    l_compare_gt = 'l_compare_gt'
    l_time = 'l_time'
    l_time_unit = 'l_time_unit'

    # Every sub-pattern carries exactly one label.
    l_compare_lt_prefix_regex = rf'?<{l_compare_lt}>不超过|没到|不到|少于'
    l_compare_gt_prefix_regex = rf'?<{l_compare_gt}>超|超过|超过了|大于|不止'
    l_compare_lt_suffix_regex = rf'?<{l_compare_lt}>没到|不到|内|以内|之内'
    l_compare_gt_suffix_regex = rf'?<{l_compare_gt}>以上|不止'
    # Inside a character class `|` is a literal, so it must not be used as a
    # separator here (it used to make "|" count as a numeral).
    l_time_regex = rf'?<{l_time}>[两壹零一二三四五六七八九十百千\d]+'
    l_time_unit_regex = rf'?<{l_time_unit}>(?:个)?年|个月|周|天|日|星期|个星期'

    # Recognises phrases like: 不到十天, 一个星期, 超七天后, 七日内, 第七天,
    # 超过了七天, 不止5天 and similar.
    pattern1 = (
        rf'(?:({l_compare_lt_prefix_regex})|({l_compare_gt_prefix_regex}))?'
        rf'\s*({l_time_regex})\s*({l_time_unit_regex})\s*'
        rf'(?:({l_compare_lt_suffix_regex})|({l_compare_gt_suffix_regex}))?'
    )

    # TODO: date-style pattern for phrases like 上个月5号, 这个月14日.

    # Multiplier characters and digit characters understood by time_convert.
    _multipliers = {
        '十': 10,
        '百': 100,
        '千': 1000,
    }
    _digits = {
        '壹': 1,
        '两': 2,
        '零': 0,
        '一': 1,
        '二': 2,
        '三': 3,
        '四': 4,
        '五': 5,
        '六': 6,
        '七': 7,
        '八': 8,
        '九': 9,
    }

    @staticmethod
    def demo1():
        """Print the parse result of one sample phrase.

        Example phrases: 一个星期, 超七天后, 七日内, 第七天, 超过了七天, 不止5天

        The following phrases were found in annotated data as valid periods
        and may need to be supported in the future:
        刚买2天, 昨天取的, 昨天到货, 签收后的第二天, 签收后七天内, 前两天,
        货还没发, 用了几天

        :return: None
        """
        string = "5天不止"
        ret = ValidPeriod.valid_period_parse(string)
        print(ret)
        return

    @staticmethod
    def time_convert(time_string: str) -> int:
        """Convert a numeral made of digits and/or Chinese numerals to an int.

        Digits (decimal or Chinese) accumulate into a pending number; a
        multiplier (十/百/千) multiplies the pending number (1 when there is
        none) and adds it to the total. Unknown characters are ignored.

        :param time_string: e.g. ``'5'``, ``'十'``, ``'二十五'``, ``'一百零五'``.
        :return: the numeric value; 0 for an empty string.
        """
        total = 0
        pending = ''
        for ch in time_string:
            if ch.isdecimal():
                pending += ch
                continue
            multiplier = ValidPeriod._multipliers.get(ch)
            if multiplier is None:
                pending += str(ValidPeriod._digits.get(ch, ''))
                continue
            total += multiplier * (int(pending) if pending else 1)
            pending = ''
        # A numeral ending in a multiplier (e.g. '十') leaves nothing pending.
        if pending:
            total += int(pending)
        return total

    @staticmethod
    def time_unit_convert(time_unit_string: str) -> int:
        """Return the number of days represented by a time-unit string.

        :param time_unit_string: a unit captured by `l_time_unit_regex`.
        :return: days per unit; 1 for an unknown unit.
        """
        days_per_unit = {
            '天': 1,
            '日': 1,
            '周': 7,
            '星期': 7,
            '个星期': 7,
            '个月': 30,
            '年': 365,
        }
        unit = time_unit_string
        if unit not in days_per_unit and unit.startswith('个'):
            # The regex also yields the measure-word form '个年'.
            unit = unit[1:]
        return days_per_unit.get(unit, 1)

    @staticmethod
    def get_pattern_label(pattern: str) -> List[str]:
        """Return the ``?<label>`` names found in `pattern`, in order.

        :param pattern: a labelled pattern such as `pattern1`.
        :return: list of label names.
        """
        return re.findall(r'\?<(.*?)>', pattern)

    @staticmethod
    def clean_pattern_label(pattern: str) -> str:
        """Return `pattern` with every ``?<label>`` marker removed."""
        return re.sub(r'\?<.*?>', '', pattern)

    @classmethod
    def valid_period_parse(cls, string: str) -> Tuple[Optional[int], Dict[str, int]]:
        """Parse `string` with `pattern1` and estimate the period in days.

        :param string: text that may contain a valid-period phrase.
        :return: ``(days, detail)``; ``days`` is None when nothing matched.
        """
        label_name_list, label_string_list = cls._search_label_list(string, cls.pattern1)
        return cls._estimate_days(label_name_list, label_string_list)

    @classmethod
    def _estimate_days(cls, label_name_list, label_string_list) -> Tuple[Optional[int], Dict[str, int]]:
        """Combine matched labels into a number of days.

        ``days = main_time * scale + bias`` where ``bias`` is 1 for a
        "greater than" marker and 0 otherwise.

        :param label_name_list: label names of the matched groups.
        :param label_string_list: matched text, parallel to the names.
        :return: ``(days, detail)``; ``days`` is None when either list is
            empty, i.e. no label was recognised.
        """
        bias = 0
        scale = 1
        main_time = 0
        for label_name, label_string in zip(label_name_list, label_string_list):
            if label_name == cls.l_compare_gt:
                bias = 1
            elif label_name == cls.l_time:
                main_time += cls.time_convert(label_string)
            elif label_name == cls.l_time_unit:
                scale = cls.time_unit_convert(label_string)
            # l_compare_lt currently applies no bias.

        detail = {
            'main_time': main_time,
            'scale': scale,
            'bias': bias
        }
        if not label_name_list or not label_string_list:
            return None, detail
        return main_time * scale + bias, detail

    @classmethod
    def _search_label_list(cls, string: str, pattern: str) -> Tuple[List[str], List[str]]:
        """Search `string` and pair each matched group with its label.

        :param string: text to search.
        :param pattern: labelled pattern (see the class docstring).
        :return: two parallel lists (label names, matched strings) holding
            only the groups that took part in the match; two empty lists
            when there is no match.
        """
        label_name_list = cls.get_pattern_label(pattern)
        match = re.search(cls.clean_pattern_label(pattern), string)
        if match is None:
            return list(), list()

        matched = [
            (label_name, label_string)
            for label_name, label_string in zip(label_name_list, match.groups())
            if label_string is not None
        ]
        return [name for name, _ in matched], [text for _, text in matched]

    def __init__(self):
        pass