Commit 7859f42
Parent(s): cdfa72c

add num. verbalizer

Files changed:
- text/LICENSE +19 -0
- text/__init__.py +74 -0
- text/ca.sor +485 -0
- text/cleaners.py +150 -0
- text/cmudict.py +65 -0
- text/numbers.py +71 -0
- text/numbers_ca.py +54 -0
- text/numbers_ca_test.py +106 -0
- text/soros.py +140 -0
- text/symbols.py +17 -0
- text/symbols_en.py +18 -0
text/LICENSE
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2017 Keith Ito
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
text/__init__.py
ADDED
@@ -0,0 +1,74 @@
+""" from https://github.com/keithito/tacotron """
+import re
+from text import cleaners
+from text.symbols import symbols
+
+
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+# Regular expression matching text enclosed in curly braces:
+_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
+
+
+def text_to_sequence(text, cleaner_names):
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+
+    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+
+    Args:
+        text: string to convert to a sequence
+        cleaner_names: names of the cleaner functions to run the text through
+
+    Returns:
+        List of integers corresponding to the symbols in the text
+    '''
+    sequence = []
+
+    # Check for curly braces and treat their contents as ARPAbet:
+    while len(text):
+        m = _curly_re.match(text)
+        if not m:
+            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
+            break
+        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
+        sequence += _arpabet_to_sequence(m.group(2))
+        text = m.group(3)
+
+    return sequence
+
+
+def sequence_to_text(sequence):
+    '''Converts a sequence of IDs back to a string'''
+    result = ''
+    for symbol_id in sequence:
+        if symbol_id in _id_to_symbol:
+            s = _id_to_symbol[symbol_id]
+            # Enclose ARPAbet back in curly braces:
+            if len(s) > 1 and s[0] == '@':
+                s = '{%s}' % s[1:]
+            result += s
+    return result.replace('}{', ' ')
+
+
+def _clean_text(text, cleaner_names):
+    for name in cleaner_names:
+        cleaner = getattr(cleaners, name)
+        if not cleaner:
+            raise Exception('Unknown cleaner: %s' % name)
+        text = cleaner(text)
+    return text
+
+
+def _symbols_to_sequence(symbols):
+    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
+
+
+def _arpabet_to_sequence(text):
+    return _symbols_to_sequence(['@' + s for s in text.split()])
+
+
+def _should_keep_symbol(s):
+    return s in _symbol_to_id and s != '_' and s != '~'
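Note: a minimal usage sketch of this module (not part of the commit), assuming the repository root is on the import path and using the catalan_cleaners function defined in text/cleaners.py below; the exact IDs depend on the symbol table in text/symbols.py.

    from text import text_to_sequence, sequence_to_text

    # Cleaner names are resolved with getattr() on text.cleaners, so any function
    # defined there ('catalan_cleaners', 'english_cleaners', ...) can be listed.
    ids = text_to_sequence("Tinc 3 preguntes.", ["catalan_cleaners"])
    print(ids)                    # list of integer symbol IDs
    print(sequence_to_text(ids))  # round-trips to the cleaned, number-expanded text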
text/ca.sor
ADDED
@@ -0,0 +1,485 @@
+^0 zero
+1$ u
+1 un
+2 dos
+3 tres
+4 quatre
+5 cinc
+6 sis
+7 set
+8 huit # [:ca-valencia:] [:ca-ES-valencia:]
+8 vuit
+9 nou
+#10-19
+10 deu
+11 onze
+12 dotze
+13 tretze
+14 catorze
+15 quinze
+16 setze
+17 dèsset # [:ca-valencia:] [:ca-ES-valencia:]
+17 desset # [:ca-balear:] [:ca-ES-balear:]
+17 disset
+18 devuit # [:ca-balear:] [:ca-ES-balear:]
+18 díhuit # [:ca-valencia:] [:ca-ES-valencia:]
+19 denou # [:ca-balear:] [:ca-ES-balear:]
+19 dènou # [:ca-valencia:] [:ca-ES-valencia:]
+1(\d) di$1
+# 20-29
+20 vint
+2(\d) vint-i-$1
+# 30, 40, 50, 60, 70, 80, 90
+30 trenta
+40 quaranta
+50 cinquanta
+60 seixanta
+70 setanta
+80 huitanta # [:ca-valencia:] [:ca-ES-valencia:]
+80 vuitanta
+90 noranta
+(\d)(\d) $(\10)-$2
+
+#100-199
+100 cent
+1(\d\d) cent $1
+#200-999
+(\d)00 $1-cents
+(\d)(\d\d) $1-cents $2
+
+#1000-1999
+1000 mil
+1(\d{3}) mil $1
+
+#2000-999999
+(\d{1,3})000 $1 mil
+(\d{1,3})(\d{3}) $1 mil $2
+
+# our limit: numbers < 10^606
+(\d{606,}) ""
+
+# x-lions
+# 1000000=10^6 -> un milió
+1((0{6})+) un $(pre:$(count:\1))lió
+1((\d{6})+) un $(pre:$(count:\1))lió $1
+# 2000000=2·10^6 -> dos milions
+(\d{1,6})((0{6})+) $1 $(pre:$(count:\2))lions
+(\d{1,6})((\d{6})+) $1 $(pre:$(count:\2))lions $2
+
+
+# count number of 10^6 groups, useful for the x-lions and x-liards prefixes.
+count:.{0,5}? 0
+count:.{6}.{0,5} 1
+count:(.{12}).{0,5} 2
+count:(.{18}).{0,5} 3
+count:(.{24}).{0,5} 4
+count:(.{30}).{0,5} 5
+count:(.{36}).{0,5} 6
+count:(.{42}).{0,5} 7
+count:(.{48}).{0,5} 8
+count:(.{54}).{0,5} 9
+count:(.{60})(.{0,59}) 1|$(count:\2)
+count:(.{120})(.{0,59}) 2|$(count:\2)
+count:(.{180})(.{0,59}) 3|$(count:\2)
+count:(.{240})(.{0,59}) 4|$(count:\2)
+count:(.{300})(.{0,59}) 5|$(count:\2)
+count:(.{360})(.{0,59}) 6|$(count:\2)
+count:(.{420})(.{0,59}) 7|$(count:\2)
+count:(.{480})(.{0,59}) 8|$(count:\2)
+count:(.{540})(.{0,59}) 9|$(count:\2)
+count:(.{600})(.{0,5}) 10|$(count:\2) # our limit is 10^606-1
+
+# prefixes needed for x-lions and x-liards, up to 10^606-1
+pre:1 mi
+pre:2 bi
+pre:3 tri
+pre:4 quadri
+pre:5 quinti
+pre:6 sexti
+pre:7 septi
+pre:8 octi
+pre:9 noni
+pre:10 deci
+pre:1(\d) $(pre2:\1)|deci
+pre:(\d)0 $(pre3:\1)
+pre:(\d)(\d) $(pre2:\2)|$(pre3:\1)
+pre:100 centi
+
+pre2:1 uno
+pre2:2 duo
+pre2:3 tre
+pre2:4 quattour
+pre2:5 quin
+pre2:6 sex
+pre2:7 septen
+pre2:8 octo
+pre2:9 novem
+
+pre3:1 deci
+pre3:2 viginti
+pre3:3 triginti
+pre3:4 quadraginti
+pre3:5 quinquaginti
+pre3:6 sexaginti
+pre3:7 septuaginti
+pre3:8 octoginti
+pre3:9 nonoginti
+pre3:10 centi
+
+# negative numbers
+[-−](\d+) menys |$1
+
+# decimals
+"([^,]*\d)[.]((\d{3})+)([,][^,.]*)?" $(\1\2\4)
+"([-−]?\d+)([,]0*)?" $1
+"([-−]?\d+)[,](\d*)" $(\1·\2)
+"([-−]?\d+·0*)([^0]00?)0*" $1| |$2
+"([-−]?\d+·0*)([^0])" $1| |$2
+"([-−]?\d+·0*)([^0]\d)" $1| |$2
+"([-−]?\d+·0*)([^0]\d\d)" $1| |$2
+"([-−]?\d+·0*)([^0]\d\d)0*" $1| |$2
+
+"([-−]?\d+·0*)(([^0]|[^0]\d*[^0]))0*" $1| $(read:\2)
+"([-−]?\d+)·(\d*)(\d)" $(\1·\2)| |$3
+"([-−]?\d+)·" $1| coma
+
+# used for the decimal part
+#read:(\d*[^0])0*$ $(read:\1)
+read:(\d*[1-9])(00+)([1-9]\d*) $(read:\1)| |$(read:\2) |$(read:\3)
+read:(\d$) $1
+read:0(\d+) $(read:0)| |$(read:\1)
+read:([1-9]\d) $1
+read:([1-9]\d\d) $1
+read:(\d\d\d) $1
+read:(\d\d)((\d\d)+) $(read:\1)| |$(read:\2)
+read:(\d\d)((\d\d)*)(\d\d\d) $(read:\1)| |$(read:\2)| |$(read:\4)
+
+
+# convert masculine forms to feminine forms
+# it can be run after the standard number conversion, and after the ordinal and partitive functions.
+## run with the feminine function.
+f:(.*iliard)(.*) \1$(f:\2) # convert only <1,000,000,000
+f:(.*ili)(.*) \1$(f:\2) # convert only <100,0000
+f:(.*d)o(s[^èé]*) $(f:\1ue\2) # 2 -> dos -> dues
+f:(.*cent)(s.*) $(f:\1e\2) # cents -> centes
+f:(((.*)[^a-zèé]|))u$ \1una # vint-i-u -> vint-i-una
+## run after the ord function.
+f:(.*[^0-9])n$ \1na # segon -> segona
+f:(.*[^0-9]r)$ \1a # tercer -> tercera
+f:(.*[^0-9]r)t$ \1ta # quart -> quarta
+f:(.*[^0-9])è$ \1ena # sisè -> sisena
+f:(.*[^0-9])é$ \1ena # sisé -> sisena
+## run after the ord2 function.
+f:(.*[0-9])[nrtè]$ \1a # 2n -> 2a
+## run after the part function.
+f:(.*ter)ç$ \1cera # terç -> tercera
+f:(.*è[sc]i)m$ \1ma # milionèsim -> milionèsima
+f:(.*[^0-9]i)g$ \1tja # mig -> mitja
+
+
+no-centes:(.*)centes(.*) \1cents\2
+no-centes:(.*) \1
+
+# convert ordinal numbers (1st, 2nd, 3rd,... nth) to partitive (1, 1/2, 1/3, .... 1/n)
+p:(.*)primer$ \1unitat
+p:(.*)segon$ \1mig
+p:(.*)tercer$ \1terç
+p:(.*quart)$ \1
+p:(.*)des[èé]$ \1dècim
+p:((.*)cent)[èé]$ \1èsim
+p:((.*)mil)[èé]$ \1·lèsim
+p:((.*)ilion)[èé]$ \1èsim
+p:((.*)iliard)[èé]$ \1èsim
+
+
+# fallback: ignore undefined 1-letter functions
+.:(.*) \1
+
+# run after the ordinal and partitive functions
+pl:(.*[^\d][nrtnec])$ \1s
+pl:(.*[^\d])ig$ \1igs # mig -> mitjos
+pl:(.*[^\d])ja$ \1ges
+pl:(.*[^\d])a$ \1es
+pl:(.*[^\d])[èé]$ \1ens
+# after ord2: 1r->1rs, 2n->2ns, 5è->5ns, ...
+pl:(\d+[rnrt])$ \1s # 1r -> 1rs, 2n -> 2ns, 4t -> 4ts
+pl:(\d+)[èé]$ \1ns # 5è -> 5ns
+pl:(\d+)a$ \1es # 2a -> 2es
+# after partitive
+pl:([^[0-9]*[sç])$ \1os # dos -> dosos, terç > terços
+pl:([^[0-9]*è[sc]im)$ \1s # dècim -> dècims
+# fallback
+pl:(.*) \1
+
+
+# unit/subunit singular/plural
+# million or greater part of the number name separated by "ili" pattern
+# before masculine to feminine conversion
+us(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \2
+up(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \3
+ud(.).:([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \4
+ss.(.):([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \5
+sp.(.):([^,]*),([^,]*),([^,]*),([^,]*),([^,]*),([^,]*) $(\1:\7)| \6
+
+# "mm" means masculine unit and masculine subunit
+# currencies usually used by Catalan speakers
+CHF:(.+),(.+) $(\2mm: franc suís, francs suïssos, de francs suïssos, cèntim, cèntims, \1)
+EUR:(.+),(.+) $(\2mm: euro, euros, d'euros, cèntim, cèntims, \1)
+GBP:(.+),(.+) $(\2fm: lliura esterlina, lliures esterlines, de lliures esterlines, penic, penics, \1)
+JPY:(.+),(.+) $(\2mm: ien, iens, de iens, sen, sen, \1)
+USD:(.+),(.+) $(\2mm: dòlar dels EUA, dòlars dels EUA, de dòlars dels EUA, centau, centaus, \1)
+# ACTIVE ISO 4217 CODES --A--
+AED:(.+),(.+) $(\2mm: dírham dels Emirats Àrabs Units, dírhams dels Emirats Àrabs Units, de dírhams dels Emirats Àrabs Units, fils, fulús, \1)
+AFN:(.+),(.+) $(\2mm: afgani, afganis, d'afganis, puli, puli, \1)
+ALL:(.+),(.+) $(\2mm: lek, lekë, de lekë, qindarka, qindarka, \1)
+AMD:(.+),(.+) $(\2mm: dram, drams, de drams, luma, luma, \1)
+ANG:(.+),(.+) $(\2mm: florí de les Antilles Neerlandeses, florins de les Antilles Neerlandeses, de florins de les Antilles Neerlandeses, cèntim, cèntims, \1)
+AOA:(.+),(.+) $(\2fm: kwanza, kwanzes, de kwanzes, cèntim, cèntims, \1)
+ARS:(.+),(.+) $(\2mm: peso argentí, pesos argentins, de pesos argentins, centau, centaus, \1)
+AUD:(.+),(.+) $(\2mm: dòlar australià, dòlars australians, de dòlars australians, centau, centaus, \1)
+AWG:(.+),(.+) $(\2mm: florí d'Aruba, florins d'Aruba, de florins d'Aruba, cèntim, cèntims, \1)
+AZN:(.+),(.+) $(\2mm: manat azerbaidjanès, manats azerbaidjanesos, de manats azerbaidjanesos, qəpik, qəpik, \1)
+# ACTIVE ISO 4217 CODES --X--
+#XAF Franc CFA emès pel BEAC (Banc dels Estats de l'Àfrica Central)
+XAG:(.+),(.+) $(\2fm: unça de plata, unces de plata, d'unces de plata, cèntim, cèntims, \1)
+XAU:(.+),(.+) $(\2fm: unça d'or, unces d'or, d'unces d'or, cèntim, cèntims, \1)
+#XBA Unitat compensatòria europea (EURCO) (unitat per al mercat d'obligacions)
+#XBB Unitat monetària europea (EMU-6) (unitat per al mercat d'obligacions)
+#XBC Unitat de compte europea 9 (EUA-9) (unitat per al mercat d'obligacions)
+#XBD Unitat de compte europea 17 (EUA-17) (unitat per al mercat d'obligacions)
+#XCD Dòlar del Carib Oriental
+#XDR Drets especials de gir (del Fons Monetari Internacional)
+#XFU Franc UIC (divisa especial)
+#XOF Franc CFA emès pel BCEAO (Banc Central dels Estats de l'Àfrica Occidental)
+XPD:(.+),(.+) $(\2fm: unça de pal·ladi, unces de pal·ladi, d'unces de pal·ladi, cèntim, cèntims, \1)
+#XPF Franc CFP (per als territoris francesos del Pacífic)
+XPT:(.+),(.+) $(\2fm: unça de platí, unces de platí, d'unces de platí, cèntim, cèntims, \1)
+#XTS Codi reservat per a proves
+#XXX Sense moneda, sense transacció monetària
+# OBSOLETE ISO 4217 CODES --Replaced by EUR--
+ADF:(.+),(.+) $(\2mm: franc andorrà, francs andorrans, de francs andorrans, cèntim, cèntims, \1)
+ADP:(.+),(.+) $(\2fm: pesseta andorrana, pessetes andorranes, de pessetes andorranes, cèntim, cèntims, \1)
+ATS:(.+),(.+) $(\2mm: xíling austríac, xílings austríacs, de xílings austríacs, groschen, groschen, \1)
+BEF:(.+),(.+) $(\2mm: franc belga, francs belgues, de francs belgues, cèntim, cèntims, \1)
+CYP:(.+),(.+) $(\2mm: lliura xipriota, lliures xipriotes, de lliures xipriotes, cèntim, cèntims, \1)
+DEM:(.+),(.+) $(\2mm: marc alemany, marcs alemanys, de marcs alemanys, penic, penics, \1)
+ESP:(.+),(.+) $(\2fm: pesseta, pessetes, de pessetes, cèntim, cèntims, \1)
+FIM:(.+),(.+) $(\2mm: marc finlandès, marcs finlandesos, de marcs finlandesos, penic, penics, \1)
+FRF:(.+),(.+) $(\2mm: franc francès, francs francesos, de francs francesos, cèntim, cèntims, \1)
+GRD:(.+),(.+) $(\2fm: dracma grega, dracmes gregues, de dracmes gregues, leptó, leptà, \1)
+IEP:(.+),(.+) $(\2fm: lliura irlandesa, lliures irlandeses, de lliures irlandeses, penic, penics, \1)
+ITL:(.+),(.+) $(\2fm: lira italiana, lires italianes, de lires italianes, cèntim, cèntims, \1)
+LUF:(.+),(.+) $(\2mm: franc luxemburguès, francs luxemburguesos, de francs luxemburguesos, cèntim, cèntims, \1)
+MCF:(.+),(.+) $(\2mm: franc monegasc, francs monegascs, de francs monegascs, cèntim, cèntims, \1)
+MTL:(.+),(.+) $(\2fm: lira maltesa, lires malteses, de lires malteses, cèntim, cèntims, \1)
+NLG:(.+),(.+) $(\2mm: florí neerlandès, florins neerlandesos, de florins neerlandesos, cèntim, cèntims, \1)
+PTE:(.+),(.+) $(\2mm: escut portuguès, escuts portuguesos, de escuts portuguesos, centau, centaus, \1)
+SIT:(.+),(.+) $(\2mm: tolar eslovè, tolars eslovens, de tolars eslovens, stotin, stotinov, \1)
+SKK:(.+),(.+) $(\2fm: corona eslovaca, corones eslovaques, de corones eslovaques, halier, halierov, \1)
+SML:(.+),(.+) $(\2fm: lira de San Marino, lires de San Marino, de lires de San Marino, cèntim, cèntims, \1)
+VAL:(.+),(.+) $(\2fm: lira vaticana, lires vaticanes, de lires vaticanes, cèntim, cèntims, \1)
+XEU:(.+),(.+) $(\2mm: ecu, ecus, d'ecus, cèntim, cèntims, \1)
+
+# crypto-currencies
+XMR:(.+),(.+) $(\2mm: monero, moneros, de moneros, piconero, piconeros, \1) #TODO: 1,000,000,000,000 piconeros = 1 monero
+XBT:(.+),(.+) $(\2mm: bitcoin, bitcoins, de bitcoins, satoshi, satoshis, \1) # TODO: 100,000,000 satoshis = 1,000 millibitcoins = 1 bitcoin
+
+# unknown currency
+[A-Z]{3}:.* ""
+
+
+"([A-Z]{3}) ([-−]?1)([.,]00?)?" $(\1:|$2,us)
+"([A-Z]{3}) ([-−]?\d+0{6,})([.,]00?)?" $(\1:|$2,ud)
+"([A-Z]{3}) ([-−]?\d+)([.,]00?)?" $(\1:|$2,up)
+"(([A-Z]{3}) [-−]?\d+)[.,](01)" $1 amb$(\2:un,ss)
+"(([A-Z]{3}) [-−]?\d+)[.,](\d)" $1 amb$(\2:|$(\30),sp)
+"(([A-Z]{3}) [-−]?\d+)[.,](\d\d)" $1 amb$(\2:|$3,sp)
+
+
+# detects a number followed by a currency code
+"([-−]?\d+)([.,]\d+)? ([A-Z]{3})" $(\3 \1\2)
+
+
+# currency symbols
+"€[ ]?([^ ]*)" $(EUR \1)
+"£[ ]?([^ ]*)" $(GBP \1)
+"\$[ ]?([^ ]*)" $(USD \1)
+"¥[ ]?([^ ]*)" $(JPY \1)
+"₩[ ]?([^ ]*)" $(KRW \1)
+"₽[ ]?([^ ]*)" $(RUB \1)
+"ɱ[ ]?([^ ]*)" $(XMR \1)
+"₿[ ]?([^ ]*)" $(XBT \1)
+
+"([^ ]+)[ ]?€$" $(EUR \1)
+"([^ ]+)[ ]?£$" $(GBP \1)
+"([^ ]+)[ ]?\$$" $(USD \1)
+"([^ ]+)[ ]?¥$" $(JPY \1)
+"([^ ]+)[ ]?₩$" $(KRW \1)
+"([^ ]+)[ ]?₽$" $(RUB \1)
+"([^ ]+)[ ]?ɱ$" $(XMR \1)
+"([^ ]+)[ ]?₿$" $(XBT \1)
+
+== feminine ==
+
+1 una
+(.*) $(f:|$1)
+
+== masculine ==
+
+1 un
+(.*) $1
+
+== ordinal(-masculine)? ==
+
+([-−]\d+) ""
+\d+[,.] ""
+0 zeroé # [:ca-valencia:] [:ca-ES-valencia:]
+0 zeroè
+1 primer
+2 segon
+3 tercer
+4 quart
+(\d+)$ $(ordinal $2)
+"un ([^ ]*(ilió|iliard))$" $(ordinal \2)
+(.*li)ó$ \2oné # [:ca-valencia:] [:ca-ES-valencia:]
+(.*li)ó$ \2onè
+(.*(cent|mil|ion|iliard))s?$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
+(.*(cent|mil|ion|iliard))s?$ \2è
+"(.* )u$" \2uné # [:ca-valencia:] [:ca-ES-valencia:]
+"(.* )u$" \2unè
+(.*-)u$ \2uné # [:ca-valencia:] [:ca-ES-valencia:]
+(.*-)u$ \2unè
+"u" primer
+"un" primer
+"dos" segon
+"tres" terç
+"quatre" quart
+(.*)cinc$ \2cinqué # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)cinc$ \2cinquè
+(.*)dènou$ \2denové # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)nou$ \2nové # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)nou$ \2novè
+(.*)deu$ \2desé # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)deu$ \2desè
+(.*)dèsset$ \2desseté # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)díhuit$ \2dihuité # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)[ae]$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
+(.*)[ae]$ \2è
+(.*\D)$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
+(.*\D)$ \2è
+
+== ordinal-feminine ==
+([-−]\d+) ""
+\d+[,.] ""
+(\d+)$ $(no-centes:$(f:$(ordinal \1)))
+
+== ordinal-masculine-plural ==
+
+([-−]?\d+) $(ordinal-masculine-plural $(ordinal \1))
+primer primers
+segon segons
+(.*)è \1ens
+(.*)er \1ers
+
+== ordinal-feminine-plural ==
+
+([-−]?\d+) $(ordinal-feminine-plural $(ordinal-feminine \1))
+(.*)a \1es
+
+== ordinal-number(-masculine)? ==
+
+#(\d+) $(o:\2)
+1$ 1r
+2$ 2n
+3$ 3r
+4$ 4t
+(\d+)$ \2é # [:ca-valencia:] [:ca-ES-valencia:]
+(\d+)$ \2è
+
+== ordinal-number-feminine ==
+(\d+)$ \1a
+
+== partitive(-masculine)? ==
+([-−]?\d+) $(p:$(ordinal \2))
+
+== partitive-feminine ==
+([-−]?\d+) $(no-centes:$(f:$(p:$(ordinal \1))))
+
+
+== partitive(-masculine)?-plural ==
+([-−]?\d+) $(pl:$(p:$(ordinal $2)))
+
+== partitive-feminine-plural ==
+([-−]?\d+) $(no-centes:$(pl:$(f:$(p:$(ordinal $1)))))
+
+== fraction(-masculine)? ==
+([-−]?1)(/1)? $2
+([-−]?1)/2 mig
+([-−]?1)/([3-9]\d*) $(masculine \2)| $(partitive \3)
+([-−]?\d+)(/1)? $2
+([-−]?\d+)/([1-9]\d*) $2| $(partitive-plural \3)
+
+== fraction-feminine ==
+([-−]?1)(/1)? $(f:$1)| unitat
+([-−]?1)/([1-9]\d*) $(f:$1)| $(partitive-feminine \2)| part
+([-−]?\d+)(/1)? $(f:$1)| unitats
+([-−]?\d+)/([1-9]\d*) $(f:$1)| $(partitive-feminine-plural \2)| parts
+
+== collective ==
+2 parell, parella o duo
+3 tern, terna, tercet, trio, tríada o treset
+4 qüern, tètrada, quartet, quarteta o quàdruple
+5 quintern, quintet, cinquet o quíntuple
+6 sextet, siset o sèxtuple
+7 septet, setet o sèptuple
+8 octet o òctuple
+9 nònuple
+10 dècada o dècuple
+12 dotzena
+100 centenar
+144 grossa
+1000 miler
+10000 miríada
+
+== years ==
+2 bienni
+3 trienni
+4 quadrienni
+5 quinquenni o lustre
+6 sesenni
+7 septenni
+10 dècada o decenni
+12 duodecenni
+15 quindecenni
+20 vintenni o vicenni
+30 trentenni o tricenni
+40 quarantenni
+50 cinquantenni
+60 seixantenni
+70 setantenni
+80 huitantenni # [:ca-valencia:] [:ca-ES-valencia:]
+80 vuitantenni
+90 norantenni
+100 segle o centenni
+1000 mil·lenni
+
+== multiplicative ==
+2 doble o duple
+3 triple
+4 quàdruple
+5 quíntuple
+6 sèxtuple
+7 sèptuple
+8 òctuple
+9 nònuple
+10 dècuple
+12 duodècuple
+100 cèntuple
+1/10 subdècuple
+1/2 súbduple
+
+== help ==
+
+"" $(1)|, $(2), $(3)\n$(help feminine)$(help masculine)$(help ordinal-number-masculine)$(help ordinal-number-feminine)$(help ordinal-feminine)$(help ordinal-masculine)
+(feminine|masculine|ordinal(-number)?(-feminine|-masculine)?) \1: $(\1 1), $(\1 2), $(\1 3)\n
+
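Note: the grammar above is exercised through the Soros interpreter added in text/soros.py; a sketch (not part of the commit) that loads ca.sor directly and calls a few of its == sections ==, assuming it is run from the repository root:

    import io
    from text import soros

    with io.open("text/ca.sor", encoding="utf-8") as f:
        num2text = soros.compile(f.read(), "ca")

    print(num2text.run("23"))                  # plain cardinal
    print(num2text.run("ordinal 4"))           # == ordinal(-masculine)? == section
    print(num2text.run("ordinal-feminine 2"))  # == ordinal-feminine == section
    print(num2text.run("fraction 1/3"))        # == fraction(-masculine)? == section

Per the unit tests in text/numbers_ca_test.py, these calls should come back as "vint-i-tres", "quart", "segona" and "un terç".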
text/cleaners.py
ADDED
@@ -0,0 +1,150 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+  1. "english_cleaners" for English text
+  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+     the symbols in symbols.py to match your data).
+'''
+
+import re
+from unidecode import unidecode
+from text.numbers import normalize_numbers
+from text.numbers_ca import normalize_numbers_ca
+from text.symbols import symbols
+
+# Regular expression matching whitespace:
+_whitespace_re = re.compile(r'\s+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+# List of (regular expression, replacement) pairs for Catalan abbreviations:
+_abbreviations_ca = [(re.compile('\\b%s\\b' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('tv3', 't v tres'),
+    ('8tv', 'vuit t v'),
+    ('pp', 'p p'),
+    ('psoe', 'p soe'),
+    ('sr.?', 'senyor'),
+    ('sra.?', 'senyora'),
+    ('srta.?', 'senyoreta')
+]]
+
+_replacements_ca = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    (';', ','),
+    (':', '.'),
+    ('\.\.\.,', ','),
+    ('\.\.\.', '…'),
+    ('ñ', 'ny')
+]]
+
+
+def expand_abbreviations(text, lang='ca'):
+    if lang == 'en':
+        _abbreviations = _abbreviations_en
+    elif lang == 'ca':
+        _abbreviations = _abbreviations_ca
+    else:
+        raise ValueError('no %s language for abbreviations' % lang)
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def convert_characters(text, lang='ca'):
+    if lang == 'ca':
+        _replacements = _replacements_ca
+    else:
+        raise ValueError('no %s language for punctuation conversion' % lang)
+    for regex, replacement in _replacements:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def expand_numbers(text, lang="ca"):
+    if lang == 'ca':
+        return normalize_numbers_ca(text)
+    else:
+        return normalize_numbers(text)
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, ' ', text)
+
+
+def convert_to_ascii(text, lang="ca"):
+    if lang == 'en':
+        return unidecode(text)
+    elif lang == 'ca':
+        char_replace = []
+        for t in set(list(text)):
+            if t not in symbols:
+                char_replace.append([t, unidecode(t)])
+        for target, replace in char_replace:
+            text = text.replace(target, replace)
+        return text
+    else:
+        raise ValueError('no %s language for ascii conversion' % lang)
+
+
+def basic_cleaners(text):
+    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def english_cleaners(text):
+    '''Pipeline for English text, including number and abbreviation expansion.'''
+    text = convert_to_ascii(text, lang='en')
+    text = lowercase(text)
+    text = expand_numbers(text, lang='en')
+    text = expand_abbreviations(text, lang='en')
+    text = collapse_whitespace(text)
+    return text
+
+
+def catalan_cleaners(text):
+    text = lowercase(text)
+    text = expand_numbers(text, lang="ca")
+    text = convert_characters(text, lang="ca")
+    text = convert_to_ascii(text, lang="ca")
+    text = expand_abbreviations(text, lang="ca")
+    text = collapse_whitespace(text)
+    return text
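Note: a sketch of the two top-level pipelines (not part of the commit); no specific output is asserted here, but the Catalan pipeline verbalizes numbers through ca.sor before the character-level steps run:

    from text.cleaners import catalan_cleaners, english_cleaners

    # Catalan: lowercase, expand numbers (via ca.sor), remap punctuation,
    # transliterate out-of-symbol characters, expand abbreviations, collapse spaces.
    print(catalan_cleaners("El Sr. Puig arriba a les 11:45"))

    # English: the upstream Tacotron recipe (ASCII, lowercase, numbers, abbreviations).
    print(english_cleaners("Dr. Smith lives at 16 King St."))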
text/cmudict.py
ADDED
@@ -0,0 +1,65 @@
+""" from https://github.com/keithito/tacotron """
+
+import re
+
+
+valid_symbols = [
+    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
+]
+
+_valid_symbol_set = set(valid_symbols)
+
+
+class CMUDict:
+    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
+    def __init__(self, file_or_path, keep_ambiguous=True):
+        if isinstance(file_or_path, str):
+            with open(file_or_path, encoding='latin-1') as f:
+                entries = _parse_cmudict(f)
+        else:
+            entries = _parse_cmudict(file_or_path)
+        if not keep_ambiguous:
+            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
+        self._entries = entries
+
+
+    def __len__(self):
+        return len(self._entries)
+
+
+    def lookup(self, word):
+        '''Returns list of ARPAbet pronunciations of the given word.'''
+        return self._entries.get(word.upper())
+
+
+
+_alt_re = re.compile(r'\([0-9]+\)')
+
+
+def _parse_cmudict(file):
+    cmudict = {}
+    for line in file:
+        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
+            parts = line.split('  ')
+            word = re.sub(_alt_re, '', parts[0])
+            pronunciation = _get_pronunciation(parts[1])
+            if pronunciation:
+                if word in cmudict:
+                    cmudict[word].append(pronunciation)
+                else:
+                    cmudict[word] = [pronunciation]
+    return cmudict
+
+
+def _get_pronunciation(s):
+    parts = s.strip().split(' ')
+    for part in parts:
+        if part not in _valid_symbol_set:
+            return None
+    return ' '.join(parts)
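Note: a usage sketch (not part of the commit); 'cmudict-0.7b' stands for a locally downloaded copy of the CMU pronouncing dictionary, which this commit does not ship.

    from text.cmudict import CMUDict

    cmu = CMUDict('cmudict-0.7b', keep_ambiguous=False)  # hypothetical local dictionary file
    print(len(cmu))              # number of words kept after dropping ambiguous entries
    print(cmu.lookup('street'))  # list of ARPAbet pronunciation strings, or None if absent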
text/numbers.py
ADDED
@@ -0,0 +1,71 @@
+""" from https://github.com/keithito/tacotron """
+
+import inflect
+import re
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+
+def _remove_commas(m):
+    return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
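Note: a sketch of the English normalizer (not part of the commit); the exact wording is produced by the inflect library:

    from text.numbers import normalize_numbers

    # Commas are stripped, currency and ordinals are expanded, then the remaining cardinals.
    print(normalize_numbers("I owe you $3.50 since March 3rd"))
    # -> roughly "I owe you three dollars, fifty cents since March third"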
text/numbers_ca.py
ADDED
@@ -0,0 +1,54 @@
+import re
+import io
+import pathlib
+from text.soros import compile
+
+filepath = pathlib.Path(__file__).parent.absolute()
+with io.open(f"{filepath}/ca.sor", 'r', encoding="utf-8") as prg:
+    num2text = compile(prg.read(), 'ca')
+
+_separador_milers_re = re.compile(r'([0-9][0-9\.]+[0-9]{3})')
+_decimal_re = re.compile(r'([0-9]+\,[0-9]+)')
+_ordinal_ms_re = re.compile(r'([0-9]+)(r|er|n|on|t|rt|è|e|ne|nè)+(\b)')
+_ordinal_mp_re = re.compile(r'([0-9]+)(rs|ns|ts|ns)+(\b)')
+_ordinal_fs_re = re.compile(r'([0-9]+)(a|ra|na|ta)+(\b)')
+_ordinal_fp_re = re.compile(r'([0-9]+)(es)+(\b)')
+_cardinal_re = re.compile(r'[0-9]+')
+_fraccions_re = re.compile(r'(\b)([0-9]+\/[0-9]+)(\b)')
+_hores_re = re.compile(r'(\b)([0-9]{1,2}):([0-9]{2})(\b)')
+
+def _esborra_separador_milers(m):
+    return m.group(1).replace('.', '')
+
+def _num2text(m):
+    return num2text.run(m.group(0))
+
+def _ordinal_ms(m):
+    return num2text.run(f"ordinal {m.group(1)}") + m.group(3)
+
+def _ordinal_mp(m):
+    return num2text.run(f"ordinal-masculine-plural {m.group(1)}") + m.group(3)
+
+def _ordinal_fs(m):
+    return num2text.run(f"ordinal-feminine {m.group(1)}") + m.group(3)
+
+def _ordinal_fp(m):
+    return num2text.run(f"ordinal-feminine-plural {m.group(1)}") + m.group(3)
+
+def _fraccions(m):
+    return m.group(1) + num2text.run(f"fraction {m.group(2)}") + m.group(3)
+
+def _hores(m):
+    return m.group(1) + num2text.run(m.group(2)) + " i " + num2text.run(m.group(3)) + m.group(4)
+
+def normalize_numbers_ca(text):
+    text = re.sub(_separador_milers_re, _esborra_separador_milers, text)
+    text = re.sub(_decimal_re, _num2text, text)
+    text = re.sub(_ordinal_ms_re, _ordinal_ms, text)
+    text = re.sub(_ordinal_mp_re, _ordinal_mp, text)
+    text = re.sub(_ordinal_fs_re, _ordinal_fs, text)
+    text = re.sub(_ordinal_fp_re, _ordinal_fp, text)
+    text = re.sub(_fraccions_re, _fraccions, text)
+    text = re.sub(_hores_re, _hores, text)
+    text = re.sub(_cardinal_re, _num2text, text)
+    return text
text/numbers_ca_test.py
ADDED
@@ -0,0 +1,106 @@
+import unittest
+
+from text.numbers_ca import normalize_numbers_ca
+
+
+class NumbersCa(unittest.TestCase):
+    def test_cardinals(self):
+        """
+        Converteix cardinals simples en una frase
+        """
+        self.assertEqual(normalize_numbers_ca("Va nèixer el 23 de desembre de 1988"), "Va nèixer el vint-i-tres de desembre de mil nou-cents vuitanta-vuit")
+        self.assertEqual(normalize_numbers_ca("tinc 3 preguntes"), "tinc tres preguntes")
+
+    def test_separador_milers(self):
+        """
+        Ignora separadors de milers
+        """
+        self.assertEqual(normalize_numbers_ca("1.000"), "mil")
+        self.assertEqual(normalize_numbers_ca("323.400"), "tres-cents vint-i-tres mil quatre-cents")
+        self.assertEqual(normalize_numbers_ca("900.323.400"), "nou-cents milions tres-cents vint-i-tres mil quatre-cents")
+
+    def test_decimals(self):
+        """
+        Converteix decimals
+        """
+        self.assertEqual(normalize_numbers_ca("1,33"), "u coma trenta-tres")
+        self.assertEqual(normalize_numbers_ca("75,5"), "setanta-cinc coma cinc")
+        self.assertEqual(normalize_numbers_ca("75,555"), "setanta-cinc coma cinc-cents cinquanta-cinc")
+        self.assertEqual(normalize_numbers_ca("999.999.999,99"), "nou-cents noranta-nou milions nou-cents noranta-nou mil nou-cents noranta-nou coma noranta-nou")
+        self.assertEqual(normalize_numbers_ca("1,12345678900"), "u coma dotze trenta-quatre cinquanta-sis set-cents vuitanta-nou")
+
+    def test_decimals_2(self):
+        """
+        Ignora comes que no pertànyen a un número decimal
+        """
+        self.assertEqual(normalize_numbers_ca("Va comprar pa, vi i llonganisses"), "Va comprar pa, vi i llonganisses")
+        self.assertEqual(normalize_numbers_ca("El número guanyador és 1, 23, 55, 34"), "El número guanyador és u, vint-i-tres, cinquanta-cinc, trenta-quatre")
+
+    def test_ordinals_ms(self):
+        """
+        Converteix ordinals masculins singulars
+        """
+        self.assertEqual(normalize_numbers_ca("Va arribar 4t de 5"), "Va arribar quart de cinc")
+        self.assertEqual(normalize_numbers_ca("el 1r va ser ell"), "el primer va ser ell")
+        self.assertEqual(normalize_numbers_ca("el 3er, no va aguantar"), "el tercer, no va aguantar")
+        self.assertEqual(normalize_numbers_ca("2n"), "segon")
+        self.assertEqual(normalize_numbers_ca("2on"), "segon")
+        self.assertEqual(normalize_numbers_ca("4t"), "quart")
+        self.assertEqual(normalize_numbers_ca("4rt"), "quart")
+        self.assertEqual(normalize_numbers_ca("5è: remogueu la barreja"), "cinquè: remogueu la barreja")
+        self.assertEqual(normalize_numbers_ca("6e"), "sisè")
+        self.assertEqual(normalize_numbers_ca("6e"), "sisè")
+        self.assertEqual(normalize_numbers_ca("21nè"), "vint-i-unè")
+        self.assertEqual(normalize_numbers_ca("un 81ne de Palamós"), "un vuitanta-unè de Palamós")
+
+    def test_ordinals_fs(self):
+        """
+        Converteix ordinals femenins singulars
+        """
+        self.assertEqual(normalize_numbers_ca("1a"), "primera")
+        self.assertEqual(normalize_numbers_ca("3ra"), "tercera")
+        self.assertEqual(normalize_numbers_ca("2a"), "segona")
+        self.assertEqual(normalize_numbers_ca("2na"), "segona")
+        self.assertEqual(normalize_numbers_ca("4a."), "quarta.")
+        self.assertEqual(normalize_numbers_ca("pugi a la 4ta, després giri a l'esquerra"), "pugi a la quarta, després giri a l'esquerra")
+        self.assertEqual(normalize_numbers_ca("va quedar 5a en la classificació"), "va quedar cinquena en la classificació")
+        self.assertEqual(normalize_numbers_ca("la 5na vegada"), "la cinquena vegada")
+
+    def test_ordinals_mp(self):
+        """
+        Converteix ordinals masculins plurals
+        """
+        self.assertEqual(normalize_numbers_ca("1rs"), "primers")
+        self.assertEqual(normalize_numbers_ca("van arribar 2ns"), "van arribar segons")
+
+    def test_ordinals_fp(self):
+        """
+        Converteix ordinals femenins plurals
+        """
+        self.assertEqual(normalize_numbers_ca("1es"), "primeres")
+
+    def test_fraccions_s(self):
+        """
+        Converteix fraccions singulars
+        """
+        self.assertEqual(normalize_numbers_ca("1/2 got de vi"), "mig got de vi")
+        self.assertEqual(normalize_numbers_ca("1/3 de farina"), "un terç de farina")
+        self.assertEqual(normalize_numbers_ca("1/8"), "un vuitè")
+
+    def test_fraccions_p(self):
+        """
+        Converteix fraccions plurals
+        """
+        self.assertEqual(normalize_numbers_ca("4/2 gots de vi"), "quatre migs gots de vi")
+        self.assertEqual(normalize_numbers_ca("2/3 de farina"), "dos terços de farina")
+        self.assertEqual(normalize_numbers_ca("3/8"), "tres vuitens")
+
+    def test_hores(self):
+        """
+        Converteix hores de manera simplificada
+        """
+        self.assertEqual(normalize_numbers_ca("a les 11:45"), "a les onze i quaranta-cinc")
+        self.assertEqual(normalize_numbers_ca("a partir de les 23:12"), "a partir de les vint-i-tres i dotze")
+
+if __name__ == '__main__':
+    unittest.main()
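Note: the suite uses the standard library runner, so it can be executed from the repository root (assuming the text package is importable from there) with:

    python -m unittest -v text.numbers_ca_test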
text/soros.py
ADDED
@@ -0,0 +1,140 @@
+"Soros interpreter (see http://numbertext.org)"
+from __future__ import unicode_literals
+from __future__ import print_function
+import re
+import sys
+
+
+def run(program, data, lang):
+    return compile(program, lang).run(data)
+
+
+def compile(program, lang):
+    return _Soros(program, lang)
+
+# conversion function
+
+
+def _tr(text, chars, chars2, delim):
+    for i in range(0, len(chars)):
+        text = text.replace(delim + chars[i], chars2[i])
+    return text
+
+
+# string literals for metacharacter encoding
+_m = "\\\";#$()|[]"
+# Unicode private area
+_c = u"\uE000\uE001\uE002\uE003\uE004\uE005\uE006\uE007\uE008\uE009"
+_pipe = u"\uE003"
+# separator prefix = \uE00A
+
+# pattern to recognize function calls in the replacement string
+_func = re.compile(_tr(r"""(?:\|?(?:\$\()+)?  # optional nested calls
+    (\|?\$\(([^\(\)]*)\)\|?)  # inner call (2 subgroups)
+    (?:\)+\|?)?""",  # optional nested calls
+    _m[4:8], _c[:4], "\\"), re.X)  # \$, \(, \), \| -> \uE000..\uE003
+
+
+class _Soros:
+    def __init__(self, prg, lang):
+        self.lines = []
+        if prg.find("__numbertext__") == -1:
+            prg = "__numbertext__;" + prg
+        # default left zero deletion
+        # and separator function (no separation, if subcall returns with empty string)
+        prg = prg.replace("__numbertext__", u"""0+(0|[1-9]\\d*) $1
+\"([a-z][-a-z]* )0+(0|[1-9]\\d*)\" $(\\1\\2)
+\"\uE00A(.*)\uE00A(.+)\uE00A(.*)\" \\1\\2\\3
+\"\uE00A.*\uE00A\uE00A.*\"
+""")
+        prg = _tr(prg, _m[:4], _c[:4],
+                  "\\")  # \\, \", \;, \# -> \uE000..\uE003
+        # switch off all country-dependent lines, and switch on the requested ones
+        prg = re.sub(
+            r"(^|[\n;])([^\n;#]*#[^\n]*[\[]:[^\n:\]]*:][^\n]*)", r"\1#\2", prg)
+        prg = re.sub(r"(^|[\n;])#([^\n;#]*#[^\n]*[\[]:" +
+                     lang.replace("_", "-") + r":][^\n]*)", r"\1\2", prg)
+        matchline = re.compile("^\s*(\"[^\"]*\"|[^\s]*)\s*(.*[^\s])?\s*$")
+        prefix = ""
+        for s in re.sub("(#[^\n]*)?(\n|$)", ";", prg).split(";"):
+            macro = re.match("== *(.*[^ ]?) ==", s)
+            if macro != None:
+                prefix = macro.group(1)
+                continue
+            m = matchline.match(s)
+            if prefix != "" and s != "" and m != None:
+                s = m.group(1).strip("\"")
+                space = " " if s != "" else ""
+                caret = ""
+                if s[0:1] == "^":
+                    s = s[1:]
+                    caret = "^"
+                s2 = m.group(2) if m.group(2) != None else ""
+                s = "\"" + caret + prefix + space + s + "\" " + s2
+                m = matchline.match(s)
+            if m != None:
+                s = _tr(m.group(1).strip("\""), _c[1:4], _m[1:4], "") \
+                    .replace(_c[_m.find("\\")], "\\\\")  # -> \\, ", ;, #
+                if m.group(2) != None:
+                    s2 = m.group(2).strip("\"")
+                else:
+                    s2 = ""
+                # \$, \(, \), \|, \[, \] -> \uE004..\uE009
+                s2 = _tr(s2, _m[4:], _c[4:], "\\")
+                # call inner separator: [ ... $1 ... ] -> $(\uE00A ... \uE00A$1\uE00A ... )
+                s2 = re.sub(r"[\[]\$(\d\d?|\([^\)]+\))",
+                            u"$(\uE00A\uE00A|$\\1\uE00A", s2)
+                s2 = re.sub(r"[\[]([^\$[\\]*)\$(\d\d?|\([^\)]+\))",
+                            u"$(\uE00A\\1\uE00A$\\2\uE00A", s2)
+                # add "|" in terminating position
+                s2 = re.sub(r"\uE00A]$", "|\uE00A)", s2)
+                s2 = re.sub(r"]", ")", s2)
+                s2 = re.sub(r"(\$\d|\))\|\$", r"\1||$",
+                            s2)  # $()|$() -> $()||$()
+                # \uE000..\uE003-> \, ", ;, #
+                s2 = _tr(s2, _c[:4], _m[:4], "")
+                # $, (, ), | -> \uE000..\uE003
+                s2 = _tr(s2, _m[4:8], _c[:4], "")
+                # \uE004..\uE009 -> $, (, ), |, [, ]
+                s2 = _tr(s2, _c[4:], _m[4:], "")
+                s2 = re.sub(r"\\(\d)", r"\\g<\1>",
+                            re.sub(r"\uE000(\d)", "\uE000\uE001\\\\g<\\1>\uE002", s2))
+                try:
+                    self.lines = self.lines + [[
+                        re.compile("^" + s.lstrip("^").rstrip("$") + "$"),
+                        s2, s[:1] == "^", s[-1:] == "$"]]
+                except:
+                    print("Error in following regex line: " + s, file=sys.stderr)
+                    raise
+
+    def run(self, data):
+        return self._run(data, True, True)
+
+    def _run(self, data, begin, end):
+        for i in self.lines:
+            if not ((begin == False and i[2]) or (end == False and i[3])):
+                m = i[0].match(data)
+                if m:
+                    try:
+                        s = m.expand(i[1])
+                    except:
+                        print("Error for the following input: " +
+                              data, file=sys.stderr)
+                        raise
+                    n = _func.search(s)
+                    while n:
+                        b = False
+                        e = False
+                        if n.group(1)[0:1] == _pipe or n.group()[0:1] == _pipe:
+                            b = True
+                        elif n.start() == 0:
+                            b = begin
+                        if n.group(1)[-1:] == _pipe or n.group()[-1:] == _pipe:
+                            e = True
+                        elif n.end() == len(s):
+                            e = end
+                        s = s[:n.start(1)] + self._run(n.group(2),
+                                                       b, e) + s[n.end(1):]
+                        n = _func.search(s)
+                    return s
+        return ""
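Note: a sketch of the interpreter on a toy rule set (not part of the commit and independent of ca.sor). compile() returns a rule table whose run() method tries the patterns top-down, the first full match wins, and both $(...) calls and bare $N references in the replacement recurse into the rule set:

    from text import soros

    toy = r"""
    1 one
    2 two
    (\d)(\d) $1-$2
    """
    prg = soros.compile(toy, "en")
    print(prg.run("2"))   # matches the literal rule -> "two"
    print(prg.run("21"))  # the two-digit rule recurses per digit -> "two-one"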
text/symbols.py
ADDED
@@ -0,0 +1,17 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Defines the set of symbols used in text input to the model.
+
+The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
+from text import cmudict
+
+_pad = '_'  # in principle not used in tacotron2
+_punctuation = '\'!,.?…· '
+_letters = 'AÀÁBCÇDEÉÈFGHIÍÏJKLMNOÓÒPQRSTUÜÚVWXYZaàábcçdeéèfghiíïjklmnoóòpqrstuüúvwxyz'
+
+# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
+_arpabet = ['@' + s for s in cmudict.valid_symbols]
+
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters) + _arpabet
text/symbols_en.py
ADDED
@@ -0,0 +1,18 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Defines the set of symbols used in text input to the model.
+
+The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
+from text import cmudict
+
+_pad = '_'
+_punctuation = '!\'(),.:;? '
+_special = '-'
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+
+# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
+_arpabet = ['@' + s for s in cmudict.valid_symbols]
+
+# Export all symbols:
+symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet