utkarsh2299 committed on
Commit
c9d8925
·
verified ·
1 Parent(s): 76690c5

Upload text_preprocess_for_inference.py

Browse files
Files changed (1) hide show
  1. text_preprocess_for_inference.py +979 -0
text_preprocess_for_inference.py ADDED
@@ -0,0 +1,979 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ TTS Preprocessing
3
+ Developed by Arun Kumar A(CS20S013) - November 2022
4
+ Code Changes by Utkarsh - 2023
5
+ '''
6
+ import os
7
+ import re
8
+ import json
9
+ import pandas as pd
10
+ import string
11
+ from collections import defaultdict
12
+ import time
13
+ import subprocess
14
+ import shutil
15
+ from multiprocessing import Process
16
+ import traceback
17
+
18
+ #imports of dependencies from environment.yml
19
+ from num_to_words import num_to_word
20
+ from g2p_en import G2p
21
+
22
def add_to_dictionary(dict_to_add, dict_file):
    """Append ``key value`` lines for ``dict_to_add`` to the dictionary file.

    If ``dict_file`` does not exist it is created. Otherwise the file is
    copied to a hidden ``.<name>.temp`` sibling, the new lines are appended to
    the copy, both files are parsed with pandas, and the copy replaces the
    original only when it parses to more rows than the original.

    Args:
        dict_to_add: Mapping whose keys and values are written via ``str()``,
            one space-separated pair per line.
        dict_file: Path of the dictionary file to create or extend.

    Returns:
        None. Progress and verification errors are reported via ``print``.
    """
    append_string = "".join(
        f"{key} {value}\n" for key, value in dict_to_add.items()
    )

    if not os.path.isfile(dict_file):
        # create a new dictionary
        with open(dict_file, "a", encoding="utf-8") as f:
            f.write(append_string)
        print(f"New Dictionary: {dict_file} created with {len(dict_to_add)} words")
        return

    # Work on a hidden copy so a failed write never corrupts the original.
    temp_dict_file = os.path.join(
        os.path.dirname(dict_file),
        "." + os.path.basename(dict_file) + ".temp",
    )
    try:
        shutil.copy(dict_file, temp_dict_file)
        with open(temp_dict_file, "a", encoding="utf-8") as f:
            f.write(append_string)
        # Verification is best-effort: a parse failure is reported, not raised.
        try:
            df_orig = pd.read_csv(dict_file, delimiter=" ", header=None, dtype=str)
            df_temp = pd.read_csv(temp_dict_file, delimiter=" ", header=None, dtype=str)
            if len(df_temp) > len(df_orig):
                # os.replace overwrites the destination on every platform.
                os.replace(temp_dict_file, dict_file)
                print(f"{len(dict_to_add)} new words appended to Dictionary: {dict_file}")
        except Exception:
            print(traceback.format_exc())
    finally:
        # The copy only survives to this point when the swap did not happen.
        if os.path.exists(temp_dict_file):
            os.remove(temp_dict_file)
51
+
52
+
53
class TextCleaner:
    """Apply a fixed, ordered list of regex substitutions to raw text.

    The rules collapse spaces, drop existing ``#`` characters, turn
    sentence punctuation and line breaks into the phrase marker ``"# "``,
    delete the remaining punctuation and replace hyphens/dashes by a space.
    """

    def __init__(self):
        # this is a static set of cleaning rules to be applied, in this order
        self.cleaning_rules = {
            " +": " ",
            "^ +": "",
            " +$": "",
            "#": "",
            "[.,;।!](\r\n)*": "# ",
            "[.,;।!](\n)*": "# ",
            "(\r\n)+": "# ",
            "(\n)+": "# ",
            "(\r)+": "# ",
            # Raw string: "\." in a normal literal is an invalid escape
            # sequence; the resulting pattern text is unchanged.
            r"""[?;:)(!|&’‘,।\."]""": "",
            "[/']": "",
            "[-–]": " ",
        }

    def clean(self, text):
        """Return ``text`` after applying every cleaning rule in order."""
        for key, replacement in self.cleaning_rules.items():
            text = re.sub(key, replacement, text)
        return text

    def clean_list(self, text):
        """Strip and clean each string of ``text`` (a list of strings)."""
        return [self.clean(line.strip()) for line in text]
85
+
86
+
87
class Phonifier:
    """Replace words by phone strings looked up in per-language dictionaries.

    Words missing from the dictionary are converted on the fly (an external
    Tamil parser script, the English G2P model, or the Indic unified parser
    for any other language), merged into the in-memory ``phone_dictionary``
    and appended to the dictionary file by a child process.
    """

    # Mapping between the cmu phones and the iitm cls
    _CMU_2_CLS_MAP = {
        "AA": "aa", "AA0": "aa", "AA1": "aa", "AA2": "aa",
        "AE": "axx", "AE0": "axx", "AE1": "axx", "AE2": "axx",
        "AH": "a", "AH0": "a", "AH1": "a", "AH2": "a",
        "AO": "ax", "AO0": "ax", "AO1": "ax", "AO2": "ax",
        "AW": "ou", "AW0": "ou", "AW1": "ou", "AW2": "ou",
        "AX": "a",
        "AY": "ei", "AY0": "ei", "AY1": "ei", "AY2": "ei",
        "B": "b", "CH": "c", "D": "dx", "DH": "d",
        "EH": "ee", "EH0": "ee", "EH1": "ee", "EH2": "ee",
        "ER": "a r", "ER0": "a r", "ER1": "a r", "ER2": "a r",
        "EY": "ee", "EY0": "ee", "EY1": "ee", "EY2": "ee",
        "F": "f", "G": "g", "HH": "h",
        "IH": "i", "IH0": "i", "IH1": "i", "IH2": "i",
        "IY": "ii", "IY0": "ii", "IY1": "ii", "IY2": "ii",
        "JH": "j", "K": "k", "L": "l", "M": "m", "N": "n", "NG": "ng",
        "OW": "o", "OW0": "o", "OW1": "o", "OW2": "o",
        "OY": "ei", "OY0": "ei", "OY1": "ei", "OY2": "ei",
        "P": "p", "R": "r", "S": "s", "SH": "sh", "T": "tx", "TH": "t",
        "UH": "u", "UH0": "u", "UH1": "u", "UH2": "u",
        "UW": "uu", "UW0": "uu", "UW1": "uu", "UW2": "uu",
        "V": "w", "W": "w", "Y": "y", "Z": "z", "ZH": "sh",
    }

    # Mapping between the iitm cls and iitm char
    _CLS_2_CHR_MAP = {
        "aa": "A", "ii": "I", "uu": "U", "ee": "E", "oo": "O", "nn": "N",
        "ae": "ऍ", "ag": "ऽ", "au": "औ", "axx": "अ", "ax": "ऑ",
        "bh": "B", "ch": "C", "dh": "ध", "dx": "ड", "dxh": "ढ",
        "dxhq": "T", "dxq": "D", "ei": "ऐ", "ai": "ऐ", "eu": "உ",
        "gh": "घ", "gq": "G", "hq": "H", "jh": "J",
        "kh": "ख", "khq": "K", "kq": "क",
        "ln": "ൾ", "lw": "ൽ", "lx": "ള", "mq": "M",
        "nd": "न", "ng": "ङ", "nj": "ञ", "nk": "Y", "nw": "ൺ", "nx": "ण",
        "ou": "औ", "ph": "P",
        "rq": "R", "rqw": "ॠ", "rw": "ർ", "rx": "र",
        "sh": "श", "sx": "ष", "th": "थ", "tx": "ट", "txh": "ठ",
        "wv": "W", "zh": "Z",
    }

    def __init__(self, dict_location=None):
        """Load the G2P model and the OOV character map.

        Args:
            dict_location: Directory holding one dictionary file per
                language (file name == language name). Defaults to
                ``"phone_dict"``.
        """
        if dict_location is None:
            dict_location = "phone_dict"
        self.dict_location = dict_location

        self.g2p = G2p()
        print('Loading G2P model... Done!')

        # Per-instance copies so callers may still customise the maps.
        self.cmu_2_cls_map = dict(self._CMU_2_CLS_MAP)
        self.cls_2_chr_map = dict(self._CLS_2_CHR_MAP)

        # Multilingual support for OOV characters
        oov_map_json_file = 'multilingualcharmap.json'
        with open(oov_map_json_file, 'r') as oov_file:
            self.oov_map = json.load(oov_file)

    def load_lang_dict(self, language, phone_dictionary):
        """Load the dictionaries for ``language`` and English.

        Each file is read independently, so a missing or malformed file for
        one language no longer prevents the other from being loaded. Read
        errors are printed, not raised.

        Returns:
            The same ``phone_dictionary`` mapping, updated in place.
        """
        print("language", language)
        print("dict_file_path", os.path.join(self.dict_location, language))
        # dict.fromkeys de-duplicates when language == 'english'.
        for dict_file in dict.fromkeys((language, 'english')):
            dict_file_path = os.path.join(self.dict_location, dict_file)
            try:
                df = pd.read_csv(dict_file_path, delimiter=" ", header=None, dtype=str)
                phone_dictionary[dict_file] = df.set_index(0).to_dict('dict')[1]
            except Exception:
                print(traceback.format_exc())
        return phone_dictionary

    def __is_float(self, word):
        """Return True when ``word`` looks like ``<digits>.<digits>``."""
        parts = word.split('.')
        if len(parts) != 2:
            return False
        return parts[0].isdecimal() and parts[1].isdecimal()

    def en_g2p(self, word):
        """Run the G2P model on ``word`` and map its output to IITM chars.

        Each G2P token found in ``cmu_2_cls_map`` is replaced by its label,
        and that label by its ``cls_2_chr_map`` character when one exists;
        other tokens are kept as they are. All spaces are then removed.
        """
        phones = []
        for phn in self.g2p(word):
            label = self.cmu_2_cls_map.get(phn)
            if label is None:
                phones.append(phn)
            else:
                phones.append(self.cls_2_chr_map.get(label, label))
        return "".join(phones).strip().replace(" ", "")

    def __post_phonify(self, text, language, gender):
        """Substitute characters using the ``<language>_<gender>`` OOV map."""
        char_map = self.oov_map.get(language + '_' + gender)
        if char_map is None:
            return text
        return "".join(char_map.get(char, char) for char in text)

    def __is_english_word(self, word):
        """Return True for a non-empty word made only of ASCII characters."""
        # The empty string is rejected explicitly; max("") used to raise.
        return bool(word) and word.isascii()

    def __find_non_dict_words(self, words, language, phone_dictionary):
        """Return the sorted, non-empty words that still need conversion."""
        candidates = sorted(word for word in words if word)
        if language not in phone_dictionary:
            return candidates
        known = phone_dictionary[language]
        return [
            word for word in candidates
            if word not in known
            and (language == "english" or not self.__is_english_word(word))
        ]

    def __add_missing_words(self, words, language, phone_dictionary):
        """Convert out-of-dictionary words and merge them into the dictionary.

        The word list and the converted entries are written to timestamped
        files under ``tmp/``; the entries are read back with pandas, merged
        into ``phone_dictionary[language]`` and handed to a child process
        that appends them to the dictionary file.
        """
        non_dict_words = self.__find_non_dict_words(words, language, phone_dictionary)
        print(f"word not in dict: {non_dict_words}")
        if not non_dict_words:
            return

        os.makedirs("tmp", exist_ok=True)
        timestamp = str(time.time())
        non_dict_words_file = os.path.abspath("tmp/non_dict_words_" + timestamp)
        out_dict_file = os.path.abspath("tmp/out_dict_" + timestamp)
        # NOTE(review): written as UTF-8; confirm the Tamil parser reads UTF-8.
        with open(non_dict_words_file, "w", encoding="utf-8") as f:
            f.write("\n".join(non_dict_words))

        if language == 'tamil':
            # The external script is given the output path as an argument.
            current_directory = os.getcwd()
            tamil_parser_cmd = f"{current_directory}/ssn_parser_new/tamil_parser.py"
            subprocess.run(["python", tamil_parser_cmd, non_dict_words_file, out_dict_file, timestamp, f"{current_directory}/ssn_parser_new"])
        else:
            if language == 'english':
                rows = [(word, self.en_g2p(word)) for word in non_dict_words]
            else:
                # Imported lazily: only needed for the unified-parser path.
                from get_phone_mapped_python import TextReplacer
                from indic_unified_parser.uparser import wordparse

                text_replacer = TextReplacer()
                rows = [
                    (word, text_replacer.apply_replacements(wordparse(word, 0, 0, 1)))
                    for word in non_dict_words
                ]
            with open(out_dict_file, 'w', encoding='utf-8') as file:
                for original_word, formatted_word in rows:
                    line = f"{original_word}\t{formatted_word}\n"
                    file.write(line)
                    print(line, end='')

        try:
            df = pd.read_csv(out_dict_file, delimiter="\t", header=None, dtype=str)
            # dropna removes words for which no phone string was produced.
            new_dict = df.dropna().set_index(0).to_dict('dict')[1]
            if language not in phone_dictionary:
                phone_dictionary[language] = new_dict
            else:
                phone_dictionary[language].update(new_dict)
            # run a non-blocking child process to update the dictionary file
            p = Process(target=add_to_dictionary, args=(new_dict, os.path.join(self.dict_location, language)))
            p.start()
        except Exception:
            print(f"Error: While loading {out_dict_file}")
            traceback.print_exc()

    def __phonify_phrase(self, phrase, language, phone_dictionary):
        """Phonify one space-separated phrase; unknown words are skipped."""
        english_dict = phone_dictionary.get("english", {})
        language_dict = phone_dictionary.get(language, {})
        phrase_phonified = []
        for word in phrase.split(" "):
            if self.__is_english_word(word):
                if word in english_dict:
                    phrase_phonified.append(str(english_dict[word]))
                else:
                    phrase_phonified.append(str(self.en_g2p(word)))
            elif word in language_dict:
                # if a word could not be parsed, skip it
                phrase_phonified.append(str(language_dict[word]))
        return " ".join(phrase_phonified)

    def __phonify(self, text, language, gender, phone_dictionary):
        """Phonify a list of phrases; returns a list of strings."""
        words = set(" ".join(text).split(" "))
        self.__add_missing_words(words, language, phone_dictionary)
        return [
            self.__phonify_phrase(phrase, language, phone_dictionary)
            for phrase in text
        ]

    def __merge_lists(self, lists):
        """Join every word of every inner list with single spaces."""
        return " ".join(word for inner in lists for word in inner)

    def __phonify_list(self, text, language, gender, phone_dictionary):
        """Phonify a list of lists of phrases, keeping the nesting."""
        words = set(self.__merge_lists(text).split(" "))
        self.__add_missing_words(words, language, phone_dictionary)
        return [
            [self.__phonify_phrase(phrase, language, phone_dictionary) for phrase in line]
            for line in text
        ]

    def phonify(self, text, language, gender, phone_dictionary):
        """Phonify a single string or a list of strings.

        Returns a string for string input and a list of strings for list
        input. ``phone_dictionary`` may be updated in place.
        """
        if not isinstance(text, list):
            # The dictionary was previously dropped here, raising TypeError.
            return self.__phonify([text], language, gender, phone_dictionary)[0]
        return self.__phonify(text, language, gender, phone_dictionary)

    def phonify_list(self, text, language, gender, phone_dictionary):
        """Phonify a list of lists of strings.

        Prints an error and returns ``None`` when ``text`` is not a list.
        """
        if isinstance(text, list):
            return self.__phonify_list(text, language, gender, phone_dictionary)
        print("Error!! Expected to have a list as input.")
        return None
534
+
535
+
536
class TextNormalizer:
    """Number verbalisation, spacing and whitespace clean-up helpers."""

    # Shared default Phonifier, created on first use instead of at import.
    _default_phonifier = None

    def __init__(self, char_map_location=None, phonifier=None):
        """Set up cleaning rules, the language-code table and the G2P model.

        Args:
            char_map_location: Accepted for compatibility; not used by any
                method of this class.
            phonifier: Phonifier instance to keep on ``self.phonifier``.
                When omitted, one shared ``Phonifier()`` is created on the
                first construction (it used to be built when the class was
                defined, i.e. at import time).
        """
        if phonifier is None:
            if TextNormalizer._default_phonifier is None:
                TextNormalizer._default_phonifier = Phonifier()
            phonifier = TextNormalizer._default_phonifier
        self.phonifier = phonifier

        # this is a static set of cleaning rules to be applied
        self.cleaning_rules = {
            " +": " ",
            "^ +": "",
            " +$": "",
            "#$": "",
            "# +$": "",
        }

        # this is the list of languages supported by num_to_words
        self.keydict = {
            "english": "en",
            "hindi": "hi",
            "gujarati": "gu",
            "marathi": "mr",
            "bengali": "bn",
            "telugu": "te",
            "tamil": "ta",
            "kannada": "kn",
            "odia": "or",
            "punjabi": "pa",
        }

        self.g2p = G2p()
        print('Loading G2P model... Done!')

    def __post_cleaning(self, text):
        """Apply every cleaning rule, in order, to one string."""
        for key, replacement in self.cleaning_rules.items():
            text = re.sub(key, replacement, text)
        return text

    def __post_cleaning_list(self, text):
        """Apply the cleaning rules to each string of a list."""
        return [self.__post_cleaning(line) for line in text]

    def __check_char_type(self, str_c):
        """Classify one character as number, punctuation, whitespace,
        ascii (ASCII letter) or non-ascii (everything else)."""
        if str_c.isnumeric():
            return "number"
        if str_c in string.punctuation:
            return "punctuation"
        if str_c in string.whitespace:
            return "whitespace"
        if str_c.isalpha() and str_c.isascii():
            return "ascii"
        return "non-ascii"

    def insert_space(self, text):
        '''
        Insert a space wherever the character type changes (e.g. between a
        number and an English word), except in front of punctuation or
        whitespace. Runs of spaces are collapsed afterwards.
        '''
        pieces = []
        prev_char_type = None
        for c in text:
            char_type = self.__check_char_type(c)
            # NOTE(review): the original also tested
            # `next != "punctuation" or next != "whitespace"`, which is always
            # true; it is dropped here so behaviour is unchanged.
            if (prev_char_type is not None
                    and char_type != prev_char_type
                    and char_type not in ("punctuation", "whitespace")):
                pieces.append(" ")
            pieces.append(c)
            prev_char_type = char_type
        return re.sub(r' +', ' ', "".join(pieces))

    def insert_space_list(self, text):
        '''
        Apply insert_space to every string of a list of strings.
        '''
        return [self.insert_space(line) for line in text]

    def __spell_integers(self, text, language):
        """Replace each run of digits by its space-padded spoken form."""
        code = self.keydict[language]
        # One pass over the text: a number is never rewritten inside
        # another number or inside an already inserted replacement.
        return re.sub(
            r'\d+',
            lambda match: ' ' + num_to_word(int(match.group(0)), code) + ' ',
            text,
        )

    def num2text(self, text, language):
        """Spell out the integers of ``text`` for a supported language.

        For an unsupported language a message is printed and only the
        whitespace clean-up is applied.
        """
        if language in self.keydict:
            text = self.__spell_integers(text, language)
        else:
            print(f"No num-to-char for the given language {language}.")
        return self.__post_cleaning(text)

    def num2text_list(self, text, language):
        """List version of num2text; input is a list of strings."""
        if language in self.keydict:
            text = [self.__spell_integers(line, language) for line in text]
        else:
            print(f"No num-to-char for the given language {language}.")
        return self.__post_cleaning_list(text)

    def numberToTextConverter(self, text, language):
        """Spell out integers and decimal numbers in ``text``.

        Supported languages: integers go through ``num_to_word`` (commas
        removed); decimals become ``<integer part> <separator> <digit by
        digit fraction>``. Other languages: only the decimal point and the
        fraction digits are spelled out in English.
        """
        if language in self.keydict:
            code = self.keydict[language]
            separator = {"hindi": "दशमलव", "tamil": "புள்ளி"}.get(language, "point")

            def spell(match):
                token = match.group(0)
                if "." not in token:
                    spoken = num_to_word(int(token), code).replace(",", "")
                else:
                    integer_part, decimal_part = token.split(".")
                    data1 = num_to_word(int(integer_part), code).replace(",", "")
                    data2 = "".join(' ' + num_to_word(i, code) for i in decimal_part)
                    spoken = f'{data1} {separator} {data2}'
                return ' ' + spoken + ' '

            # Decimals are listed first so "3.14" is consumed as one token
            # and its "." is matched literally.
            text = re.sub(r'\d+\.\d+|\d+', spell, text)
            return self.__post_cleaning(text)

        words = {
            '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
            '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
        }
        # Replace decimal points between digits, then spell the fraction.
        text = re.sub(r'(?<=\d)\.(?=\d)', ' point ', text)
        text = re.sub(
            r'point (\d+)',
            # Digits outside 0-9 (\d also matches other scripts) stay as-is.
            lambda match: 'point ' + ' '.join(words.get(d, d) for d in match.group(1)),
            text,
        )
        return self.__post_cleaning(text)

    def normalize(self, text, language):
        """Apply the whitespace / trailing-# clean-up to one string."""
        return self.__post_cleaning(text)

    def normalize_list(self, text, language):
        """Apply the clean-up to each string of a list."""
        return self.__post_cleaning_list(text)
764
+
765
+
766
class TextPhrasifier:
    """Split a cleaned string into phrases at the ``#`` marker."""

    @classmethod
    def phrasify(cls, text):
        """Return the non-empty, whitespace-stripped pieces of ``text``
        obtained by splitting on ``#``."""
        trimmed = (chunk.strip() for chunk in text.split("#"))
        return [chunk for chunk in trimmed if chunk != ""]
775
+
776
class TextPhrasifier_List:
    """Split every string of a list into phrases at the ``#`` marker."""

    @classmethod
    def phrasify(cls, text):
        """Map a list of strings to a list of phrase lists.

        Each input string yields one inner list holding its non-empty,
        stripped ``#``-separated pieces (possibly an empty list).
        """
        def split_line(line):
            trimmed = (chunk.strip() for chunk in line.split("#"))
            return [chunk for chunk in trimmed if chunk != ""]

        return [split_line(line) for line in text]
790
+
791
class DurAlignTextProcessor:
    """Final formatting of phonified phrases.

    The default rules remove runs of spaces, prepend ``$`` and append
    ``.``; the English rules only remove spaces and append ``.``.
    """

    def __init__(self):
        # static rule sets, applied in insertion order
        self.cleaning_rules = {
            " +": "",
            "^": "$",
            "$": ".",
        }
        self.cleaning_rules_English = {
            " +": "",
            "$": ".",
        }

    def textProcesor(self, text):
        """Rewrite every string of ``text`` in place with the default rules
        and return the same list object."""
        for pattern, substitute in self.cleaning_rules.items():
            for position, phrase in enumerate(text):
                text[position] = re.sub(pattern, substitute, phrase)
        return text

    def textProcesorForEnglish(self, text):
        """Rewrite every string of ``text`` in place with the English rules
        and return the same list object."""
        for pattern, substitute in self.cleaning_rules_English.items():
            for position, phrase in enumerate(text):
                text[position] = re.sub(pattern, substitute, phrase)
        return text

    def textProcesor_list(self, text):
        """Apply the default rules to a list of lists of strings.

        Inner lists are modified in place; they are returned inside a new
        outer list.
        """
        processed = []
        for phrases in text:
            for pattern, substitute in self.cleaning_rules.items():
                for position, phrase in enumerate(phrases):
                    phrases[position] = re.sub(pattern, substitute, phrase)
            processed.append(phrases)
        return processed
827
+
828
+
829
class TTSDurAlignPreprocessor:
    """Pipeline for a single string: numbers to words, cleaning,
    normalising, phrase splitting, phonification and final formatting."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the pipeline stages.

        Any stage left as ``None`` is instantiated here with its default
        class. The defaults used to be built when the class was defined,
        which loaded models and opened files at import time.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary):
        """Run the pipeline on ``text``.

        Args:
            text: Input string.
            language: Language name, used for number conversion and to
                select the phone dictionary.
            gender: Passed through to the phonifier.
            phone_dictionary: Mapping of language name to word/phone dict;
                the entry for ``language`` is loaded through the phonifier
                when missing.

        Returns:
            ``(phonified_text, phrasified_text)``.
        """
        print(text)
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        print("cleaned text", text)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)

        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        print(phone_dictionary.keys())

        phonified_text = self.phonifier.phonify(phrasified_text, language, gender, phone_dictionary)
        print("phonetext",phonified_text)
        phonified_text = self.post_processor.textProcesor(phonified_text)
        print(phonified_text)
        return phonified_text, phrasified_text
864
+
865
class TTSDurAlignPreprocessor_VTT:
    """List-of-strings variant of the duration-alignment pipeline."""

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 post_processor=None):
        """Store the pipeline stages.

        Any stage left as ``None`` is instantiated here with its default
        class rather than when the class is defined (import time).
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary=None):
        """Run the pipeline on a list of strings.

        Args:
            text: List of input strings.
            language: Language name.
            gender: Passed through to the phonifier.
            phone_dictionary: Optional mapping of language name to
                word/phone dict. A new one is used when omitted, and the
                entry for ``language`` is loaded through the phonifier when
                missing. ``phonify_list`` requires this argument; the call
                previously omitted it and raised ``TypeError``.

        Returns:
            ``(phonified_text, phrasified_text)``, both lists of lists.
        """
        if phone_dictionary is None:
            phone_dictionary = {}
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)

        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender, phone_dictionary)
        phonified_text = self.post_processor.textProcesor_list(phonified_text)
        return phonified_text, phrasified_text
886
+
887
+
888
class CharTextPreprocessor:
    """Text front end for character-based TTS models (no phonification step).

    ``preprocess`` chains: ``str.strip`` ->
    ``text_normalizer.numberToTextConverter`` -> ``text_cleaner.clean`` ->
    ``text_normalizer.normalize`` -> ``TextPhrasifier.phrasify``.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the pipeline stages, creating a default for each one left as None.

        Defaults are built per instance instead of in the signature, where
        they would be evaluated once at definition time and shared by every
        instance. Pass a collaborator explicitly to share it.

        Args:
            text_cleaner: object providing ``clean(text)``.
            text_normalizer: object providing ``numberToTextConverter`` and
                ``normalize``.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer

    def preprocess(self, text, language, gender=None, phone_dictionary=None):
        """Clean, normalise and phrasify ``text``.

        Args:
            text: input string; leading and trailing whitespace is stripped first.
            language: language identifier forwarded to the normalizer.
            gender: unused; accepted so the signature matches the phoneme-based
                preprocessors in this file.
            phone_dictionary: unused; accepted for the same reason.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``. Both elements are the
            same phrasifier output, because character models skip phonification.
        """
        text = text.strip()
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.normalize(text, language)
        phrasified_text = TextPhrasifier.phrasify(text)
        # No phonification for character TTS models: the phrasified text
        # doubles as the "phonified" result.
        return phrasified_text, phrasified_text
905
+
906
class CharTextPreprocessor_VTT:
    """Character-model front end that uses the ``*_list`` collaborator methods.

    ``preprocess`` chains: ``text_cleaner.clean_list`` ->
    ``text_normalizer.num2text_list`` -> ``text_normalizer.normalize_list`` ->
    ``TextPhrasifier_List.phrasify``. There is no phonification step.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None):
        """Store the pipeline stages, creating a default for each one left as None.

        Defaults are built per instance instead of in the signature, where
        they would be evaluated once at definition time and shared by every
        instance. Pass a collaborator explicitly to share it.

        Args:
            text_cleaner: object providing ``clean_list``.
            text_normalizer: object providing ``num2text_list`` and
                ``normalize_list``.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer

    def preprocess(self, text, language, gender=None):
        """Clean, normalise and phrasify ``text`` using the list-based methods.

        Args:
            text: input handed to ``clean_list`` (presumably a list of
                strings - verify against caller).
            language: language identifier forwarded to the normalizer.
            gender: unused; accepted so the signature matches the phoneme-based
                ``*_VTT`` preprocessors in this file.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``. Both elements are the
            same phrasifier output, because character models skip phonification.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        phrasified_text = TextPhrasifier_List.phrasify(text)
        # No phonification for character TTS models: the phrasified text
        # doubles as the "phonified" result.
        return phrasified_text, phrasified_text
923
+
924
+
925
class TTSPreprocessor:
    """Text front end that produces phonified text for a single input string.

    ``preprocess`` chains: ``str.strip`` ->
    ``text_normalizer.numberToTextConverter`` -> ``text_cleaner.clean`` ->
    ``text_normalizer.normalize`` -> ``TextPhrasifier.phrasify`` ->
    ``phonifier.phonify`` -> ``post_processor.textProcesorForEnglish``.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 text_phrasefier=None,
                 post_processor=None):
        """Store the pipeline stages, creating a default for each one left as None.

        Defaults are built per instance instead of in the signature, where
        they would be evaluated once at definition time and shared by every
        instance. Pass a collaborator explicitly to share it.

        Args:
            text_cleaner: object providing ``clean(text)``.
            text_normalizer: object providing ``numberToTextConverter`` and
                ``normalize``.
            phonifier: object providing ``load_lang_dict`` and ``phonify``.
            text_phrasefier: stored on the instance; see the review note in
                ``preprocess``.
            post_processor: object providing ``textProcesorForEnglish``.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.text_phrasefier = TextPhrasifier() if text_phrasefier is None else text_phrasefier
        self.post_processor = DurAlignTextProcessor() if post_processor is None else post_processor

    def preprocess(self, text, language, gender, phone_dictionary):
        """Run the full pipeline on ``text``.

        Args:
            text: input string; leading and trailing whitespace is stripped first.
            language: language identifier forwarded to the normalizer and phonifier.
            gender: forwarded unchanged to ``phonifier.phonify``.
            phone_dictionary: mapping keyed by language; when ``language`` is
                missing, ``phonifier.load_lang_dict`` is asked to supply it.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``: the post-processed
            phonifier output and the phrasifier output it was derived from.
        """
        import logging

        log = logging.getLogger(__name__)

        text = text.strip()
        text = self.text_normalizer.numberToTextConverter(text, language)
        text = self.text_cleaner.clean(text)
        text = self.text_normalizer.normalize(text, language)
        # NOTE(review): phrasing goes through the TextPhrasifier class, not the
        # injected self.text_phrasefier, so a custom phrasifier is ignored -
        # confirm whether that is intended before switching.
        phrasified_text = TextPhrasifier.phrasify(text)

        # Load the language's entries only when the caller has not already done so.
        if language not in phone_dictionary:
            phone_dictionary = self.phonifier.load_lang_dict(language, phone_dictionary)

        phonified_text = self.phonifier.phonify(phrasified_text, language, gender, phone_dictionary)
        log.debug("phonified text: %s", phonified_text)
        phonified_text = self.post_processor.textProcesorForEnglish(phonified_text)
        log.debug("post-processed text: %s", phonified_text)
        return phonified_text, phrasified_text
953
+
954
class TTSPreprocessor_VTT:
    """Phoneme front end that uses the ``*_list`` collaborator methods.

    ``preprocess`` chains: ``text_cleaner.clean_list`` ->
    ``text_normalizer.num2text_list`` -> ``text_normalizer.normalize_list`` ->
    ``TextPhrasifier_List.phrasify`` -> ``phonifier.phonify_list``. No
    post-processing step is applied to the phonifier output.
    """

    def __init__(self,
                 text_cleaner=None,
                 text_normalizer=None,
                 phonifier=None,
                 text_phrasefier=None):
        """Store the pipeline stages, creating a default for each one left as None.

        Defaults are built per instance instead of in the signature, where
        they would be evaluated once at definition time and shared by every
        instance. Pass a collaborator explicitly to share it.

        Args:
            text_cleaner: object providing ``clean_list``.
            text_normalizer: object providing ``num2text_list`` and
                ``normalize_list``.
            phonifier: object providing ``phonify_list``.
            text_phrasefier: stored on the instance; see the review note in
                ``preprocess``.
        """
        self.text_cleaner = TextCleaner() if text_cleaner is None else text_cleaner
        self.text_normalizer = TextNormalizer() if text_normalizer is None else text_normalizer
        self.phonifier = Phonifier() if phonifier is None else phonifier
        self.text_phrasefier = TextPhrasifier_List() if text_phrasefier is None else text_phrasefier

    def preprocess(self, text, language, gender):
        """Run the list-based pipeline on ``text``.

        Args:
            text: input handed to ``clean_list`` (presumably a list of
                strings - verify against caller).
            language: language identifier forwarded to the normalizer and phonifier.
            gender: forwarded unchanged to ``phonifier.phonify_list``.

        Returns:
            Tuple ``(phonified_text, phrasified_text)``: the phonifier output
            and the phrasifier output it was derived from.
        """
        text = self.text_cleaner.clean_list(text)
        text = self.text_normalizer.num2text_list(text, language)
        text = self.text_normalizer.normalize_list(text, language)
        # NOTE(review): phrasing goes through the TextPhrasifier_List class, not
        # the injected self.text_phrasefier, so a custom phrasifier is ignored -
        # confirm whether that is intended before switching.
        phrasified_text = TextPhrasifier_List.phrasify(text)
        phonified_text = self.phonifier.phonify_list(phrasified_text, language, gender)
        return phonified_text, phrasified_text