alysa committed on
Commit
7bf0108
·
1 Parent(s): b89ac4f

Upload 2 files

Browse files
Files changed (2) hide show
  1. text/__init__.py +447 -0
  2. text/symbols.py +71 -0
text/__init__.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text.symbols import symbols
2
+
3
+
4
# Lookup tables between symbols and their integer IDs. An ID is the symbol's
# position in ``symbols``; the forward table is derived by inverting the
# ID -> symbol table.
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {symbol: index for index, symbol in _id_to_symbol.items()}
7
+
8
+
9
def cleaned_text_to_sequence(cleaned_text):
    """Map the whitespace-separated symbols of ``cleaned_text`` to their IDs.

    Args:
      cleaned_text: string of symbols separated by whitespace; every token
        is looked up in the module-level ``_symbol_to_id`` table.
    Returns:
      List of integer IDs, one per token, in the order the tokens appear.
    Raises:
      KeyError: if a token is not a key of ``_symbol_to_id``.
    """
    tokens = cleaned_text.split()
    return [_symbol_to_id[token] for token in tokens]
18
+
19
+
20
def sequence_to_text(sequence):
    """Convert a sequence of symbol IDs back to a string.

    Args:
      sequence: iterable of integer IDs, each a key of ``_id_to_symbol``.
    Returns:
      The corresponding symbols concatenated in order. No separator is
      inserted between symbols, so the result is generally not a valid input
      for ``cleaned_text_to_sequence`` (which splits on whitespace).
    Raises:
      KeyError: if an ID is not a key of ``_id_to_symbol``.
    """
    # str.join builds the result in one pass instead of re-allocating the
    # accumulated string on every ``+=``.
    return "".join(_id_to_symbol[symbol_id] for symbol_id in sequence)
27
+
28
+
29
# Pinyin syllable -> (initial, final) decomposition table.
#
# * "^" is used as the initial for syllables written without a leading
#   consonant (e.g. "a", "wa", "ya").
# * Finals are stored in a normalised spelling: e.g. "ui" -> "uei",
#   "un" -> "uen", "iu" -> "iou", and "u" after j/q/x/y -> "v".
# * The "i" of zhi/chi/shi/ri is stored as "iii" and that of zi/ci/si as "ii".
#
# Entries are kept in their original order; rows are grouped by initial.
pinyin_dict = {
    # a-
    "a": ("^", "a"), "ai": ("^", "ai"), "an": ("^", "an"),
    "ang": ("^", "ang"), "ao": ("^", "ao"),
    # b-
    "ba": ("b", "a"), "bai": ("b", "ai"), "ban": ("b", "an"),
    "bang": ("b", "ang"), "bao": ("b", "ao"), "be": ("b", "e"),
    "bei": ("b", "ei"), "ben": ("b", "en"), "beng": ("b", "eng"),
    "bi": ("b", "i"), "bian": ("b", "ian"), "biao": ("b", "iao"),
    "bie": ("b", "ie"), "bin": ("b", "in"), "bing": ("b", "ing"),
    "bo": ("b", "o"), "bu": ("b", "u"),
    # c- (first part)
    "ca": ("c", "a"), "cai": ("c", "ai"), "can": ("c", "an"),
    "cang": ("c", "ang"), "cao": ("c", "ao"), "ce": ("c", "e"),
    "cen": ("c", "en"), "ceng": ("c", "eng"),
    # ch-
    "cha": ("ch", "a"), "chai": ("ch", "ai"), "chan": ("ch", "an"),
    "chang": ("ch", "ang"), "chao": ("ch", "ao"), "che": ("ch", "e"),
    "chen": ("ch", "en"), "cheng": ("ch", "eng"), "chi": ("ch", "iii"),
    "chong": ("ch", "ong"), "chou": ("ch", "ou"), "chu": ("ch", "u"),
    "chua": ("ch", "ua"), "chuai": ("ch", "uai"), "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"), "chui": ("ch", "uei"), "chun": ("ch", "uen"),
    "chuo": ("ch", "uo"),
    # c- (second part)
    "ci": ("c", "ii"), "cong": ("c", "ong"), "cou": ("c", "ou"),
    "cu": ("c", "u"), "cuan": ("c", "uan"), "cui": ("c", "uei"),
    "cun": ("c", "uen"), "cuo": ("c", "uo"),
    # d-
    "da": ("d", "a"), "dai": ("d", "ai"), "dan": ("d", "an"),
    "dang": ("d", "ang"), "dao": ("d", "ao"), "de": ("d", "e"),
    "dei": ("d", "ei"), "den": ("d", "en"), "deng": ("d", "eng"),
    "di": ("d", "i"), "dia": ("d", "ia"), "dian": ("d", "ian"),
    "diao": ("d", "iao"), "die": ("d", "ie"), "ding": ("d", "ing"),
    "diu": ("d", "iou"), "dong": ("d", "ong"), "dou": ("d", "ou"),
    "du": ("d", "u"), "duan": ("d", "uan"), "dui": ("d", "uei"),
    "dun": ("d", "uen"), "duo": ("d", "uo"),
    # e- (and the bare "ng" syllable, which is mapped to the "en" final)
    "e": ("^", "e"), "ei": ("^", "ei"), "en": ("^", "en"),
    "ng": ("^", "en"), "eng": ("^", "eng"), "er": ("^", "er"),
    # f-
    "fa": ("f", "a"), "fan": ("f", "an"), "fang": ("f", "ang"),
    "fei": ("f", "ei"), "fen": ("f", "en"), "feng": ("f", "eng"),
    "fo": ("f", "o"), "fou": ("f", "ou"), "fu": ("f", "u"),
    # g-
    "ga": ("g", "a"), "gai": ("g", "ai"), "gan": ("g", "an"),
    "gang": ("g", "ang"), "gao": ("g", "ao"), "ge": ("g", "e"),
    "gei": ("g", "ei"), "gen": ("g", "en"), "geng": ("g", "eng"),
    "gong": ("g", "ong"), "gou": ("g", "ou"), "gu": ("g", "u"),
    "gua": ("g", "ua"), "guai": ("g", "uai"), "guan": ("g", "uan"),
    "guang": ("g", "uang"), "gui": ("g", "uei"), "gun": ("g", "uen"),
    "guo": ("g", "uo"),
    # h-
    "ha": ("h", "a"), "hai": ("h", "ai"), "han": ("h", "an"),
    "hang": ("h", "ang"), "hao": ("h", "ao"), "he": ("h", "e"),
    "hei": ("h", "ei"), "hen": ("h", "en"), "heng": ("h", "eng"),
    "hong": ("h", "ong"), "hou": ("h", "ou"), "hu": ("h", "u"),
    "hua": ("h", "ua"), "huai": ("h", "uai"), "huan": ("h", "uan"),
    "huang": ("h", "uang"), "hui": ("h", "uei"), "hun": ("h", "uen"),
    "huo": ("h", "uo"),
    # j-
    "ji": ("j", "i"), "jia": ("j", "ia"), "jian": ("j", "ian"),
    "jiang": ("j", "iang"), "jiao": ("j", "iao"), "jie": ("j", "ie"),
    "jin": ("j", "in"), "jing": ("j", "ing"), "jiong": ("j", "iong"),
    "jiu": ("j", "iou"), "ju": ("j", "v"), "juan": ("j", "van"),
    "jue": ("j", "ve"), "jun": ("j", "vn"),
    # k-
    "ka": ("k", "a"), "kai": ("k", "ai"), "kan": ("k", "an"),
    "kang": ("k", "ang"), "kao": ("k", "ao"), "ke": ("k", "e"),
    "kei": ("k", "ei"), "ken": ("k", "en"), "keng": ("k", "eng"),
    "kong": ("k", "ong"), "kou": ("k", "ou"), "ku": ("k", "u"),
    "kua": ("k", "ua"), "kuai": ("k", "uai"), "kuan": ("k", "uan"),
    "kuang": ("k", "uang"), "kui": ("k", "uei"), "kun": ("k", "uen"),
    "kuo": ("k", "uo"),
    # l-
    "la": ("l", "a"), "lai": ("l", "ai"), "lan": ("l", "an"),
    "lang": ("l", "ang"), "lao": ("l", "ao"), "le": ("l", "e"),
    "lei": ("l", "ei"), "leng": ("l", "eng"), "li": ("l", "i"),
    "lia": ("l", "ia"), "lian": ("l", "ian"), "liang": ("l", "iang"),
    "liao": ("l", "iao"), "lie": ("l", "ie"), "lin": ("l", "in"),
    "ling": ("l", "ing"), "liu": ("l", "iou"), "lo": ("l", "o"),
    "long": ("l", "ong"), "lou": ("l", "ou"), "lu": ("l", "u"),
    "lv": ("l", "v"), "luan": ("l", "uan"), "lve": ("l", "ve"),
    "lue": ("l", "ve"), "lun": ("l", "uen"), "luo": ("l", "uo"),
    # m-
    "ma": ("m", "a"), "mai": ("m", "ai"), "man": ("m", "an"),
    "mang": ("m", "ang"), "mao": ("m", "ao"), "me": ("m", "e"),
    "mei": ("m", "ei"), "men": ("m", "en"), "meng": ("m", "eng"),
    "mi": ("m", "i"), "mian": ("m", "ian"), "miao": ("m", "iao"),
    "mie": ("m", "ie"), "min": ("m", "in"), "ming": ("m", "ing"),
    "miu": ("m", "iou"), "mo": ("m", "o"), "mou": ("m", "ou"),
    "mu": ("m", "u"),
    # n-
    "na": ("n", "a"), "nai": ("n", "ai"), "nan": ("n", "an"),
    "nang": ("n", "ang"), "nao": ("n", "ao"), "ne": ("n", "e"),
    "nei": ("n", "ei"), "nen": ("n", "en"), "neng": ("n", "eng"),
    "ni": ("n", "i"), "nia": ("n", "ia"), "nian": ("n", "ian"),
    "niang": ("n", "iang"), "niao": ("n", "iao"), "nie": ("n", "ie"),
    "nin": ("n", "in"), "ning": ("n", "ing"), "niu": ("n", "iou"),
    "nong": ("n", "ong"), "nou": ("n", "ou"), "nu": ("n", "u"),
    "nv": ("n", "v"), "nuan": ("n", "uan"), "nve": ("n", "ve"),
    "nue": ("n", "ve"), "nuo": ("n", "uo"),
    # o-
    "o": ("^", "o"), "ou": ("^", "ou"),
    # p-
    "pa": ("p", "a"), "pai": ("p", "ai"), "pan": ("p", "an"),
    "pang": ("p", "ang"), "pao": ("p", "ao"), "pe": ("p", "e"),
    "pei": ("p", "ei"), "pen": ("p", "en"), "peng": ("p", "eng"),
    "pi": ("p", "i"), "pian": ("p", "ian"), "piao": ("p", "iao"),
    "pie": ("p", "ie"), "pin": ("p", "in"), "ping": ("p", "ing"),
    "po": ("p", "o"), "pou": ("p", "ou"), "pu": ("p", "u"),
    # q-
    "qi": ("q", "i"), "qia": ("q", "ia"), "qian": ("q", "ian"),
    "qiang": ("q", "iang"), "qiao": ("q", "iao"), "qie": ("q", "ie"),
    "qin": ("q", "in"), "qing": ("q", "ing"), "qiong": ("q", "iong"),
    "qiu": ("q", "iou"), "qu": ("q", "v"), "quan": ("q", "van"),
    "que": ("q", "ve"), "qun": ("q", "vn"),
    # r-
    "ran": ("r", "an"), "rang": ("r", "ang"), "rao": ("r", "ao"),
    "re": ("r", "e"), "ren": ("r", "en"), "reng": ("r", "eng"),
    "ri": ("r", "iii"), "rong": ("r", "ong"), "rou": ("r", "ou"),
    "ru": ("r", "u"), "rua": ("r", "ua"), "ruan": ("r", "uan"),
    "rui": ("r", "uei"), "run": ("r", "uen"), "ruo": ("r", "uo"),
    # s- (first part)
    "sa": ("s", "a"), "sai": ("s", "ai"), "san": ("s", "an"),
    "sang": ("s", "ang"), "sao": ("s", "ao"), "se": ("s", "e"),
    "sen": ("s", "en"), "seng": ("s", "eng"),
    # sh-
    "sha": ("sh", "a"), "shai": ("sh", "ai"), "shan": ("sh", "an"),
    "shang": ("sh", "ang"), "shao": ("sh", "ao"), "she": ("sh", "e"),
    "shei": ("sh", "ei"), "shen": ("sh", "en"), "sheng": ("sh", "eng"),
    "shi": ("sh", "iii"), "shou": ("sh", "ou"), "shu": ("sh", "u"),
    "shua": ("sh", "ua"), "shuai": ("sh", "uai"), "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"), "shui": ("sh", "uei"), "shun": ("sh", "uen"),
    "shuo": ("sh", "uo"),
    # s- (second part)
    "si": ("s", "ii"), "song": ("s", "ong"), "sou": ("s", "ou"),
    "su": ("s", "u"), "suan": ("s", "uan"), "sui": ("s", "uei"),
    "sun": ("s", "uen"), "suo": ("s", "uo"),
    # t-
    "ta": ("t", "a"), "tai": ("t", "ai"), "tan": ("t", "an"),
    "tang": ("t", "ang"), "tao": ("t", "ao"), "te": ("t", "e"),
    "tei": ("t", "ei"), "teng": ("t", "eng"), "ti": ("t", "i"),
    "tian": ("t", "ian"), "tiao": ("t", "iao"), "tie": ("t", "ie"),
    "ting": ("t", "ing"), "tong": ("t", "ong"), "tou": ("t", "ou"),
    "tu": ("t", "u"), "tuan": ("t", "uan"), "tui": ("t", "uei"),
    "tun": ("t", "uen"), "tuo": ("t", "uo"),
    # w-
    "wa": ("^", "ua"), "wai": ("^", "uai"), "wan": ("^", "uan"),
    "wang": ("^", "uang"), "wei": ("^", "uei"), "wen": ("^", "uen"),
    "weng": ("^", "ueng"), "wo": ("^", "uo"), "wu": ("^", "u"),
    # x-
    "xi": ("x", "i"), "xia": ("x", "ia"), "xian": ("x", "ian"),
    "xiang": ("x", "iang"), "xiao": ("x", "iao"), "xie": ("x", "ie"),
    "xin": ("x", "in"), "xing": ("x", "ing"), "xiong": ("x", "iong"),
    "xiu": ("x", "iou"), "xu": ("x", "v"), "xuan": ("x", "van"),
    "xue": ("x", "ve"), "xun": ("x", "vn"),
    # y-
    "ya": ("^", "ia"), "yan": ("^", "ian"), "yang": ("^", "iang"),
    "yao": ("^", "iao"), "ye": ("^", "ie"), "yi": ("^", "i"),
    "yin": ("^", "in"), "ying": ("^", "ing"), "yo": ("^", "iou"),
    "yong": ("^", "iong"), "you": ("^", "iou"), "yu": ("^", "v"),
    "yuan": ("^", "van"), "yue": ("^", "ve"), "yun": ("^", "vn"),
    # z- (first part)
    "za": ("z", "a"), "zai": ("z", "ai"), "zan": ("z", "an"),
    "zang": ("z", "ang"), "zao": ("z", "ao"), "ze": ("z", "e"),
    "zei": ("z", "ei"), "zen": ("z", "en"), "zeng": ("z", "eng"),
    # zh-
    "zha": ("zh", "a"), "zhai": ("zh", "ai"), "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"), "zhao": ("zh", "ao"), "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"), "zhen": ("zh", "en"), "zheng": ("zh", "eng"),
    "zhi": ("zh", "iii"), "zhong": ("zh", "ong"), "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"), "zhua": ("zh", "ua"), "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"), "zhuang": ("zh", "uang"), "zhui": ("zh", "uei"),
    "zhun": ("zh", "uen"), "zhuo": ("zh", "uo"),
    # z- (second part)
    "zi": ("z", "ii"), "zong": ("z", "ong"), "zou": ("z", "ou"),
    "zu": ("z", "u"), "zuan": ("z", "uan"), "zui": ("z", "uei"),
    "zun": ("z", "uen"), "zuo": ("z", "uo"),
}
text/symbols.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Non-phonetic markers placed first in the symbol table.
# NOTE(review): presumably silence / end-of-sequence / short-pause tokens and
# "#0".."#3" boundary tags -- confirm against the text front-end.
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

# Syllable initials; "^" is the entry used by syllables that have no
# consonant initial of their own.
_initials = [
    "^", "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l",
    "m", "n", "p", "q", "r", "s", "sh", "t", "x", "z", "zh",
]

# Tone suffixes appended to every final when the symbol table is built.
_tones = ["1", "2", "3", "4", "5"]

# Syllable finals, without tone suffix.
_finals = [
    "a", "ai", "an", "ang", "ao",
    "e", "ei", "en", "eng", "er",
    "i", "ia", "ian", "iang", "iao", "ie", "ii", "iii", "in", "ing",
    "iong", "iou",
    "o", "ong", "ou",
    "u", "ua", "uai", "uan", "uang", "uei", "uen", "ueng", "uo",
    "v", "van", "ve", "vn",
]

# Full symbol inventory: pause markers, then initials, then every final
# combined with every tone (finals vary slowest, tones fastest).
symbols = [
    *_pause,
    *_initials,
    *(final + tone for final in _finals for tone in _tones),
]