"""adapted from https://github.com/keithito/tacotron""" import re _letters_and_numbers_re = re.compile( r"((?:[a-zA-Z]+[0-9]|[0-9]+[a-zA-Z])[a-zA-Z0-9']*)", re.IGNORECASE ) _hardware_re = re.compile( "([0-9]+(?:[.,][0-9]+)?)(?:\s?)(tb|gb|mb|kb|ghz|mhz|khz|hz|mm)", re.IGNORECASE ) _hardware_key = { "tb": "terabyte", "gb": "gigabyte", "mb": "megabyte", "kb": "kilobyte", "ghz": "gigahertz", "mhz": "megahertz", "khz": "kilohertz", "hz": "hertz", "mm": "millimeter", "cm": "centimeter", "km": "kilometer", } _dimension_re = re.compile( r"\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b|\b(\d+(?:[,.]\d+)?\s*[xX]\s*\d+(?:[,.]\d+)?(?:in|inch|m)?)\b" ) _dimension_key = {"m": "meter", "in": "inch", "inch": "inch"} def _expand_letters_and_numbers(m): text = re.split(r"(\d+)", m.group(0)) # remove trailing space if text[-1] == "": text = text[:-1] elif text[0] == "": text = text[1:] # if not like 1920s, or AK47's , 20th, 1st, 2nd, 3rd, etc... if text[-1] in ("'s", "s", "th", "nd", "st", "rd") and text[-2].isdigit(): text[-2] = text[-2] + text[-1] text = text[:-1] # for combining digits 2 by 2 new_text = [] for i in range(len(text)): string = text[i] if string.isdigit() and len(string) < 5: # heuristics if len(string) > 2 and string[-2] == "0": if string[-1] == "0": string = [string] else: string = [string[:-3], string[-2], string[-1]] elif len(string) % 2 == 0: string = [string[i : i + 2] for i in range(0, len(string), 2)] elif len(string) > 2: string = [string[0]] + [ string[i : i + 2] for i in range(1, len(string), 2) ] new_text.extend(string) else: new_text.append(string) text = new_text text = " ".join(text) return text def _expand_hardware(m): quantity, measure = m.groups(0) measure = _hardware_key[measure.lower()] if measure[-1] != "z" and float(quantity.replace(",", "")) > 1: return "{} {}s".format(quantity, measure) return "{} {}".format(quantity, measure) def _expand_dimension(m): text = "".join([x for x in m.groups(0) if x != 0]) text = text.replace(" x ", " by ") text = text.replace("x", " by ") if text.endswith(tuple(_dimension_key.keys())): if text[-2].isdigit(): text = "{} {}".format(text[:-1], _dimension_key[text[-1:]]) elif text[-3].isdigit(): text = "{} {}".format(text[:-2], _dimension_key[text[-2:]]) return text def normalize_letters_and_numbers(text): text = re.sub(_hardware_re, _expand_hardware, text) text = re.sub(_dimension_re, _expand_dimension, text) text = re.sub(_letters_and_numbers_re, _expand_letters_and_numbers, text) return text