File size: 5,780 Bytes
5e8e534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Copyright 2024    Hung-Shin Lee ([email protected])
# Apache 2.0

import itertools
import re

# Chinese numerals indexed by digit value: c_basic[d] is the character for d.
c_basic = "零一二三四五六七八九"
# Character-level lookup from an ASCII digit (or the decimal point) to its
# Chinese reading, used when a number is read out one character at a time.
d2c = dict(zip("0123456789", c_basic))
d2c["."] = "點"


def num4year(matched):
    """Spell out every captured year of a regex match digit by digit.

    `matched` is a match object whose capture groups are strings of ASCII
    digits. The whole matched text is returned with each captured group
    replaced by its digit-by-digit Chinese reading (e.g. "2024年" becomes
    "二零二四年"); everything outside the groups is kept as is.
    """
    converted = matched.group(0)
    for year in matched.groups():
        spelled = "".join(c_basic[int(digit)] for digit in year)
        converted = converted.replace(year, spelled)
    return converted


def num2chines_simple(matched):
    """Read a numeric string one character at a time.

    `matched` is a plain string of ASCII digits and/or "."; each character is
    looked up in `d2c` and the results are concatenated. A character that is
    not a key of `d2c` raises KeyError.
    """
    return "".join(map(d2c.__getitem__, matched))


def num4percent(matched):
    """Turn a captured percentage such as "50%" into "百分之五十".

    `matched` is a match object whose first group is the number followed by
    a single trailing "%" character.
    """
    percent_text = matched.group(1)
    number = percent_text[:-1]  # drop the trailing "%"
    return f"百分之{num2chinese(number)}"


def num4cellphone(matched):
    """Read a captured phone number digit by digit in Chinese.

    `matched` is a match object whose first group holds the phone number.
    Hyphens and any whitespace between the digits are discarded, and each
    remaining digit is mapped through `c_basic`.

    Raises ValueError if the group contains a character that is neither a
    digit, a hyphen nor whitespace.
    """
    raw = matched.group(1)
    # The pattern in parse_num allows any \s separator (tab, newline,
    # full-width space, ...), not only " ", so strip every whitespace
    # character instead of just the ASCII space; otherwise int() would fail.
    digits = [ch for ch in raw if ch != "-" and not ch.isspace()]
    return "".join(c_basic[int(ch)] for ch in digits)


def num4er(matched):  # 2 to 二
    """Return the first captured group with every "2" written as 二.

    `matched` is a match object; all other characters of the group are
    returned unchanged.
    """
    captured = matched.group(1)
    return captured.translate(str.maketrans("2", "二"))


def num4liang(matched):  # 2 to 兩
    """Return the first captured group with every "2" written as 兩.

    `matched` is a match object; all other characters of the group are
    returned unchanged.
    """
    pieces = matched.group(1).split("2")
    return "兩".join(pieces)


def num4general(matched):
    """Convert a captured number, with an optional one-character prefix.

    `matched` is a match object whose first group is either a bare number or
    a single non-digit character followed by a number. The prefix character,
    if any, is kept as is and only the numeric part is converted:

    * Latin letter or dash prefix with fewer than three characters after it
      (MP3, F-16): the number is read as a quantity via `num2chinese`.
    * Latin letter or dash prefix with a longer tail (AM104): the number is
      read character by character via `num2chines_simple`.
    * No prefix: the whole group is read as a quantity.
    * Any other prefix: the tail is read as a quantity.
    """
    token = matched.group(1)
    head, tail = token[0], token[1:]

    if re.match("[A-Za-z-─]", head):
        reader = num2chinese if len(tail) < 3 else num2chines_simple
        return "{}{}".format(head, reader(tail))

    if re.match("[0-9]", head):
        # The group starts with a digit, so there is no prefix to preserve.
        return num2chinese(token)

    return "{}{}".format(head, num2chinese(tail))


def parse_num(text: str) -> str:
    """Rewrite the Arabic numerals in `text` as Chinese numerals.

    The substitutions run in a fixed order, most specific first, so that a
    later rule only sees digits the earlier rules left behind:

    1. four-digit years before 年 (alone or as a 到/至 range), digit by digit;
    2. percentages, as 百分之 plus the number;
    3. phone numbers of the form dddd-dddddd, digit by digit;
    4. a lone 2 before 診/樓/月/號 or after 初, as 二;
    5. any other lone 2, as 兩;
    6. every remaining number, via `num4general`.

    Returns the converted text.
    """
    # year
    text = re.sub(r"([0-9]{4})[到至]([0-9]{4})年", num4year, text)
    text = re.sub(r"([0-9]{4})年", num4year, text)

    # percentage: allow any number of decimals so "12.34%" is taken as one
    # number instead of leaving "12." behind and converting only "34%"
    text = re.sub(r"([0-9]+\.?[0-9]*%)", num4percent, text)

    # cellphone
    text = re.sub(r"([0-9]{4}\s?-\s?[0-9]{6})", num4cellphone, text)

    # single 2 to 二
    text = re.sub(r"([^\d]2[診樓月號])", num4er, text)
    text = re.sub(r"([初]2[^\d])", num4er, text)

    # single 2 to 兩: skip a 2 that is part of a decimal ("3.2", "2.5"),
    # which must be left for the general rule to read as 二
    text = re.sub(r"([^\d.]2(?!\.[0-9])[^\d])", num4liang, text)

    # general number: the fractional part may have several digits, and a "."
    # is only consumed when digits follow, so a sentence-ending period stays
    text = re.sub(r"([^0-9]?[0-9]+(?:\.[0-9]+)?)", num4general, text)

    return text


def num2chinese(num, big=False, simp=False, o=False, twoalt=True) -> str:
    """
    Converts numbers to Chinese representations.
    https://gist.github.com/gumblex/0d65cad2ba607fd14de7

    `num`   : an int, float or numeric string; it is processed as `str(num)`.
              An optional leading "+" or "-" is read as 正 / 負, and a
              missing integer part (".5") is read as zero.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.
    Note that `o` and `twoalt` is ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese reading: the integer part with place-value units,
    and the fractional part (if any) digit by digit after 點.

    Raises ValueError if `num` is not numeric, if its magnitude is 1e48 or
    more, or if it is written in scientific notation.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    elif "e" in nd.lower():
        # float() accepts both "1e5" and "1E5"; reject either spelling here
        # rather than failing later with an unrelated int() error.
        raise ValueError("scientific notation is not supported")
    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    # One unit per group of four digits above the lowest group.
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"

    def revuniq(parts):
        # Parts are collected least-significant first: put them back in
        # reading order and collapse runs of equal neighbours (repeated
        # zero markers become a single one).
        return "".join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])
    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None
    # An empty integer part (".5") is valid for float(); treat it as zero
    # instead of letting int("") raise.
    if integer and int(integer):
        # Four-digit groups, least significant group first.
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        # Only mark a zero once a lower digit was emitted.
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and all(i == "0" for i in unit[: nc + 1]):
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    # (nc is 1 here, so the slice is the thousands and
                    # hundreds digits: the leading 一 is dropped only when
                    # both of them are zero)
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # Leading/trailing zero markers are padding artefacts, not digits.
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])
    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)


if __name__ == "__main__":
    # Demo when run as a script: convert a sample sentence that contains a
    # phone-style number and print the text before and after conversion.
    text = "若手機仔幾多號?吾手機仔係0964-498042。"

    print(f"{text} -> {parse_num(text)}")