Spaces:

united-link
/

taiwanese-hakka-tts

Running on Zero

File size: 5,780 Bytes

5e8e534

# Copyright 2024    Hung-Shin Lee ([email protected])
# Apache 2.0

import itertools
import re

# Chinese digit characters, indexed by their numeric value (0-9).
c_basic = "零一二三四五六七八九"
# Lookup from an ASCII digit character (or the decimal point) to its
# Chinese reading, used for digit-by-digit conversion.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"


def num4year(matched):
    """Regex substitution callback that reads captured digit runs digit by digit.

    Every capture group of ``matched`` is replaced, inside the full matched
    text, by the concatenation of the Chinese digit for each of its
    characters (looked up in the module-level ``c_basic``). Text outside the
    groups is kept as is.
    """
    converted = matched.group(0)
    for digits in matched.groups():
        spoken = "".join(c_basic[int(ch)] for ch in digits)
        converted = converted.replace(digits, spoken)
    return converted


def num2chines_simple(matched):
    """Read a numeric string character by character.

    ``matched`` is a plain string here (not a regex match object); each of
    its characters is looked up in the module-level ``d2c`` table, so a
    character missing from that table raises ``KeyError``.
    """
    return "".join(d2c[ch] for ch in matched)


def num4percent(matched):
    """Regex substitution callback for percentages.

    Group 1 of ``matched`` is the number followed by one trailing character
    (the percent sign in the caller's pattern); that last character is
    dropped and the rest is read by ``num2chinese`` after the prefix 百分之.
    """
    percent_text = matched.group(1)
    number_part = percent_text[:-1]
    return "百分之" + num2chinese(number_part)


def num4cellphone(matched):
    """Regex substitution callback that reads a phone number digit by digit.

    Group 1 of ``matched`` is the phone number text, which may contain a
    hyphen and whitespace between the digit runs. Only the digits are read;
    each one is converted through the module-level ``c_basic``.

    Returns:
        The Chinese digit string with separators removed.
    """
    captured = matched.group(1)
    # The pattern used in parse_num allows `\s` around the hyphen, which also
    # matches tabs, newlines and full-width spaces. Stripping only the ASCII
    # space would leave those characters behind and make int() raise, so
    # keep digits and drop everything else.
    return "".join(c_basic[int(ch)] for ch in captured if ch in "0123456789")


def num4er(matched):  # 2 to 二
    """Regex substitution callback: return group 1 with every "2" read as 二."""
    captured = matched.group(1)
    return captured.translate(str.maketrans("2", "二"))


def num4liang(matched):  # 2 to 兩
    """Regex substitution callback: return group 1 with every "2" read as 兩."""
    captured = matched.group(1)
    return captured.translate(str.maketrans("2", "兩"))


def num4general(matched):
    """Regex substitution callback for a generic number.

    Group 1 of ``matched`` is a digit run (optionally with a decimal part)
    that may be preceded by a single non-digit character. That leading
    character, when present, is kept unchanged in front of the reading.

    * Leading ASCII letter, "-" or "─": a numeric tail shorter than three
      characters is read as a quantity (MP3, F-16); a longer tail is read
      digit by digit (AM104).
    * Leading digit: the whole text is read as a quantity.
    * Any other leading character: the tail is read as a quantity.
    """
    num = matched.group(1)
    lead, tail = num[0], num[1:]

    if re.match("[A-Za-z-─]", lead):
        # MP3 or F-16 -> quantity reading; AM104 -> digit-by-digit reading
        reader = num2chinese if len(tail) < 3 else num2chines_simple
        return lead + reader(tail)

    if re.match("[0-9]", lead):
        return num2chinese(num)

    return lead + num2chinese(tail)


def parse_num(text: str) -> str:
    """Replace the Arabic numerals in ``text`` with Chinese readings.

    The substitutions run in a fixed order, from the most specific pattern
    to the most general, so that an earlier rule consumes its digits before
    the catch-all rule at the end sees them:

    1. years (``1999年``, ``1999到2005年``) - read digit by digit;
    2. percentages - ``百分之`` + quantity reading;
    3. phone numbers of the form ``dddd-dddddd`` - read digit by digit;
    4. a lone ``2`` in contexts that take 二;
    5. any other lone ``2`` - read as 兩;
    6. every remaining number - handled by ``num4general``.

    Args:
        text: Input sentence.

    Returns:
        The sentence with the numbers rewritten.
    """
    # year
    text = re.sub(r"([0-9]{4})[到至]([0-9]{4})年", num4year, text)
    text = re.sub(r"([0-9]{4})年", num4year, text)

    # percentage
    # The fractional part takes any number of digits. With a single optional
    # digit, "3.14%" matched only "14%" and the "3." was read separately.
    text = re.sub(r"([0-9]+\.?[0-9]*%)", num4percent, text)

    # cellphone
    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)

    # single 2 to 二
    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
    text = re.sub(r"([初]2[^\d])", num4er, text)

    # single 2 to 兩
    text = re.sub(r"([^\d]2[^\d])", num4liang, text)

    # general number
    # Same fix as for percentages: keep the whole fractional part in one
    # match so "3.145" is not split into "3.1" and "45".
    text = re.sub(r"([^0-9]?[0-9]+\.?[0-9]*)", num4general, text)

    return text


def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Convert a number to its Chinese numeral reading.

    Adapted from https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    Args:
        num: The number to convert. It is processed through ``str(num)``, so
            an int, a float, or a numeric string (optionally signed,
            optionally with a decimal point) is accepted.
        big: Use financial characters.
        simp: Use simplified characters instead of traditional characters.
        o: Use 〇 for zero.
        twoalt: Use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns:
        The Chinese reading. The integer part is read as a quantity with
        units; the fractional part, if any, is read digit by digit after
        the decimal-point character.

    Raises:
        ValueError: If the absolute value is 1e48 or more, if the text form
            contains "e" (scientific notation), or if the text cannot be
            parsed as a number.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    elif "e" in nd:
        raise ValueError("scientific notation is not supported")

    # Character tables: sign / decimal point, digits, in-group units
    # (tens, hundreds, thousands) and one unit per group of four digits.
    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
    zero = c_basic[0]

    def revuniq(parts):
        # Join the pieces in reverse order, dropping consecutive duplicates.
        return "".join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])
    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None

    if int(integer):
        # Groups of four digits, least significant group first.
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # Keep the zero placeholder that the general path below also
                # produces for leading zeros; without it 100020000 would be
                # read as 一億兩萬 instead of 一億零兩萬. A placeholder on the
                # most significant group is stripped at the end.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            # Walk the group from the ones digit up to the thousands digit.
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and unit.startswith("00"):
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        intstr = revuniq(intresult)
        # revuniq only merges identical list items, so a zero group next to
        # a group that itself starts with a zero placeholder still leaves
        # two zero characters in a row (100000001 -> 一億零零一). The integer
        # part never reads two zeros in a row, so collapse such runs.
        while zero * 2 in intstr:
            intstr = intstr.replace(zero * 2, zero)
        result.append(intstr.strip(zero))
    else:
        result.append(zero)

    if remainder:
        # The fractional part is read digit by digit; zeros are all kept.
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)


if __name__ == "__main__":
    # Small demo: normalize a sentence that contains a phone number and
    # print the input next to the converted output.
    text = "若手機仔幾多號？吾手機仔係0964-498042。"
    converted = parse_num(text)

    print(f"{text} -> {converted}")