Pendrokar committed on
Commit
d665bf1
·
1 Parent(s): d1178af

lojban ipa support

Browse files
Files changed (3) hide show
  1. app.py +3 -1
  2. lojban.py +354 -0
  3. styletts2importable.py +12 -4
app.py CHANGED
@@ -139,7 +139,7 @@ with gr.Blocks() as vctk:
139
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
140
  lang =gr.Dropdown(
141
  [
142
- ['English', 'en-us'],
143
  ['Czech (Non-native)', 'cs'],
144
  ['Danish (Non-native)', 'da'],
145
  ['Dutch (Non-native)', 'nl'],
@@ -157,6 +157,8 @@ with gr.Blocks() as vctk:
157
  ['Spanish (Non-native)', 'es'],
158
  ['Swedish (Non-native)', 'sv'],
159
  ['Turkish (Non-native)', 'tr'],
 
 
160
  ],
161
  label="Language",
162
  )
 
139
  voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
140
  lang =gr.Dropdown(
141
  [
142
+ ['English (US)', 'en-us'],
143
  ['Czech (Non-native)', 'cs'],
144
  ['Danish (Non-native)', 'da'],
145
  ['Dutch (Non-native)', 'nl'],
 
157
  ['Spanish (Non-native)', 'es'],
158
  ['Swedish (Non-native)', 'sv'],
159
  ['Turkish (Non-native)', 'tr'],
160
+ # artificial
161
+ ['Lojban', 'jb'],
162
  ],
163
  label="Language",
164
  )
lojban.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # credits: gleki
2
+ from __future__ import annotations
3
+ import sys
4
+ import os
5
+
6
+ from re import sub, compile
7
+ from itertools import islice
8
+
9
def krulermorna(text: str) -> str:
    """Rewrite Lojban text into the compact "krulermorna" spelling.

    The transformation, applied in this order:

    * every ``.`` is dropped;
    * ``u`` directly followed by a vowel letter becomes the glide ``w``;
    * ``i`` directly followed by a vowel letter becomes the glide ``ɩ``;
    * the digraphs ``au``, ``ai``, ``ei``, ``oi`` become the single letters
      ``ḁ``, ``ą``, ``ę``, ``ǫ``.

    Args:
        text: Lojban text; may contain several space-separated words.

    Returns:
        The re-spelled text. An empty string is returned unchanged.
    """
    # No later step introduces a dot, so stripping them once is enough.
    text = text.replace(".", "")
    # Glides must be rewritten before the digraphs: in "uai" the "u" turns
    # into "w" first, which leaves "ai" free to collapse afterwards.
    text = sub(r"u([aeiouy])", r"w\1", text)
    text = sub(r"i([aeiouy])", r"ɩ\1", text)
    # Plain literal replacements; no regular expression needed.
    for digraph, letter in (("au", "ḁ"), ("ai", "ą"), ("ei", "ę"), ("oi", "ǫ")):
        text = text.replace(digraph, letter)
    return text
20
+
21
def krulermornaize(words: list[str]) -> list[str]:
    """Return a new list with :func:`krulermorna` applied to each word, in order."""
    return list(map(krulermorna, words))
23
+
24
# Krulermorna-letter -> IPA table used by lojban2ipa_vits.
# Keys are regular-expression fragments: the converter anchors each one with
# "^" and tries the longest keys first, so multi-letter entries (e.g. "ɩa",
# "nu") and the lookahead rule for "r" win over their single-letter fallbacks.
# Lines that are commented out are alternative mappings that are disabled.
ipa_vits = {
    "a$": 'aː',
    "a": 'aː',
    # "e(?=v)": 'ɛːʔ',
    # "e$": 'ɛːʔ',
    "e": 'ɛː',
    "i": 'iː',
    "o": 'oː',
    "u": 'ʊu',
    # "u": 'ʊː',
    "y": 'əː',
    # single-letter diphthongs produced by krulermorna()
    "ą": 'aɪ',
    "ę": 'ɛɪ',
    # "ę(?=\b)(?!')": 'ɛɪʔ',
    "ǫ": 'ɔɪ',
    "ḁ": 'aʊ',
    # glide + vowel combinations
    "ɩa": 'jaː',
    "ɩe": 'jɛː',
    "ɩi": 'jiː',
    "ɩo": 'jɔː',
    "ɩu": 'juː',
    "ɩy": 'jəː',
    "ɩ": 'j',
    "wa": 'waː',
    "we": 'wɛː',
    "wi": 'wiː',
    "wo": 'wɔː',
    "wu": 'wuː',
    "wy": 'wəː',
    "w": 'w',
    # fricatives
    "c": 'ʃ',
    # "bj": 'bʒ',
    "j": 'ʒ',
    "s": 's',
    "z": 'z',
    "f": 'f',
    "v": 'v',
    "x": 'hhh',
    "'": 'h',
    # "dj":'dʒ',
    # "tc":'tʃ',
    # "dz":'ʣ',
    # "ts":'ʦ',
    'r': 'ɹ',
    # "r" that is NOT followed by a stress mark or a vowel letter
    'r(?![ˈaeiouyḁąęǫ])': 'ɹɹ',
    # 'r(?=[ˈaeiouyḁąęǫ])': 'ɹ',
    "nˈu": 'nˈʊuː',
    "nu": 'nʊuː',
    "ng": 'n.g',
    "n": 'n',
    "m": 'm',
    "l": 'l',
    "b": 'b',
    "d": 'd',
    "g": 'ɡ',
    "k": 'k',
    "p": 'p',
    "t": 't',
    "h": 'h'
}
84
+
85
# Krulermorna-letter -> IPA table used by lojban2ipa_nix.
# Same layout as ipa_vits: keys are regular-expression fragments that the
# converter anchors with "^" and tries longest-first. Compared with ipa_vits
# this table adds "gj"/"bj" entries and uses different values for "x", "ng"
# and for "r" when it is not followed by a stress mark or vowel letter.
# Lines that are commented out are alternative mappings that are disabled.
ipa_nix = {
    "a$": 'aː',
    "a": 'aː',
    # "e(?=v)": 'ɛːʔ',
    # "e$": 'ɛːʔ',
    "e": 'ɛː',
    "i": 'iː',
    "o": 'oː',
    "u": 'ʊu',
    # "u": 'ʊː',
    "y": 'əː',
    # single-letter diphthongs produced by krulermorna()
    "ą": 'aɪ',
    "ę": 'ɛɪ',
    # "ę(?=\b)(?!')": 'ɛɪʔ',
    "ǫ": 'ɔɪ',
    "ḁ": 'aʊ',
    # glide + vowel combinations
    "ɩa": 'jaː',
    "ɩe": 'jɛː',
    "ɩi": 'jiː',
    "ɩo": 'jɔː',
    "ɩu": 'juː',
    "ɩy": 'jəː',
    "ɩ": 'j',
    "wa": 'waː',
    "we": 'wɛː',
    "wi": 'wiː',
    "wo": 'wɔː',
    "wu": 'wuː',
    "wy": 'wəː',
    "w": 'w',
    # fricatives and clusters with "j"
    "c": 'ʃ',
    "gj": 'gɪʒ',
    "bj": 'bɪʒ',
    "j": 'ʒ',
    "s": 's',
    "z": 'z',
    "f": 'f',
    "v": 'v',
    "x": 'hh',
    "'": 'h',
    # "dj":'dʒ',
    # "tc":'tʃ',
    # "dz":'ʣ',
    # "ts":'ʦ',
    'r': 'ɹ',
    # "r" that is NOT followed by a stress mark or a vowel letter
    'r(?![ˈaeiouyḁąęǫ])': 'ɹɹɹɪ',
    # 'r(?=[ˈaeiouyḁąęǫ])': 'ɹ',
    "nˈu": 'nˈʊuː',
    "nu": 'nʊuː',
    "ng": 'ng',
    "n": 'n',
    "m": 'm',
    "l": 'l',
    "b": 'b',
    "d": 'd',
    "g": 'ɡ',
    "k": 'k',
    "p": 'p',
    "t": 't',
    "h": 'h'
}
146
+
147
# One vowel letter, including the single-letter diphthongs from krulermorna().
vowel_pattern = compile("[aeiouyąęǫḁ]")
# Zero-width position right before a vowel letter; the converters split words
# on it so that every piece after the first starts with a vowel.
vowel_coming_pattern = compile("(?=[aeiouyąęǫḁ])")
# Zero-width position right before a single-letter diphthong.
diphthong_coming_pattern = compile("(?=[ąęǫḁ])")

# Word lists are stored in krulermorna spelling because the converters compare
# them against text that has already gone through krulermorna().
question_words = krulermornaize(["ma", "mo", "xu"])
starter_words = krulermornaize(["le", "lo", "lei", "loi"])
terminator_words = krulermornaize(["kei", "ku'o", "vau", "li'u"])
154
+
155
def lojban2ipa(text: str, mode: str = 'vits') -> str:
    """Convert Lojban text to an IPA string using the selected rule set.

    Args:
        text: Lojban text to convert.
        mode: ``'nix'`` selects :func:`lojban2ipa_nix`; ``'vits'`` (the
            default) and any unrecognised value select
            :func:`lojban2ipa_vits`.

    Returns:
        The IPA transcription produced by the selected converter.
    """
    if mode == 'nix':
        return lojban2ipa_nix(text)
    # 'vits' is both an explicit choice and the fallback for unknown modes.
    return lojban2ipa_vits(text)
161
+
162
def _compile_rules(table):
    """Return ``(compiled pattern, replacement)`` pairs, longest key first.

    Each key of *table* is a regular-expression fragment; it is anchored with
    ``^`` so it can only match at the start of the remaining word. The sort is
    stable, so keys of equal length keep their order in *table*.
    """
    ordered = sorted(table.items(), key=lambda item: len(str(item[0])), reverse=True)
    return [(compile("^" + key), value) for key, value in ordered]


def _add_stress(word):
    """Insert the stress mark into *word* where the rules call for one.

    The word is cut before every vowel letter and only the last two pieces are
    inspected.

    Returns:
        ``(word_with_or_without_mark, stressed)`` where ``stressed`` tells the
        caller whether a mark was inserted.
    """
    parts = vowel_coming_pattern.split(word)
    tail = parts[-2:]
    if len(tail) != 2 or not tail[0]:
        return word, False
    head = "".join(parts[:-2])
    # e.g. {klama}, {ni'o}: both trailing pieces start with a vowel letter.
    if vowel_pattern.match(tail[0][0]) and vowel_pattern.match(tail[1][0]):
        return head + "ˈ" + "".join(tail), True
    # e.g. {lau}, {coi}: the last piece starts with a diphthong letter.
    if diphthong_coming_pattern.match(tail[1][0]):
        return head + tail[0] + "ˈ" + tail[1], True
    return word, False


def _transliterate(word, rules):
    """Rewrite *word* left to right using the first rule matching at each position.

    Characters that no rule matches (such as the stress mark) are copied
    through unchanged.
    """
    pieces = []
    pos = 0
    while pos < len(word):
        tail = word[pos:]
        for pattern, replacement in rules:
            found = pattern.match(tail)
            if found:
                pieces.append(replacement)
                # Always advance at least one character, even if a rule were
                # to match the empty string.
                pos += max(len(found.group(0)), 1)
                break
        else:
            pieces.append(word[pos])
            pos += 1
    return "".join(pieces)


def _lojban2ipa(text, table, mark_questions):
    """Shared implementation behind the public ``lojban2ipa_*`` converters.

    Args:
        text: Lojban text to convert.
        table: Letter -> IPA rule table (``ipa_vits`` or ``ipa_nix``).
        mark_questions: When true, a ``?`` is appended to question words.

    Returns:
        The IPA transcription, or ``""`` when *text* contains nothing but
        whitespace and dots.
    """
    text = krulermorna(text.strip())
    if not text:
        # Nothing to pronounce; previously this raised IndexError below.
        return ""

    # Sort and compile the rules once per call instead of once per character.
    rules = _compile_rules(table)
    words = text.split(' ')
    rebuilt_words = []
    for index, word in enumerate(words):
        prefix, postfix = "", ""

        if word in question_words:
            if mark_questions:
                postfix = "?"
            prefix = " " + prefix

        if word in starter_words:
            prefix = " " + prefix

        if word in terminator_words:
            postfix = ", "

        if index == 0 or word in ("ni'o", "i"):
            prefix = ", " + prefix

        modified_word, stressed = _add_stress(word)
        if stressed:
            # add a pause after stressed words
            postfix = postfix + " "

        # Separate the word from the previous one unless that one is a
        # starter word, in which case the two are joined without a gap.
        if not (index >= 1 and words[index - 1] in starter_words):
            prefix = " " + prefix

        rebuilt_words.append(prefix + _transliterate(modified_word, rules) + postfix)

    output = "".join(rebuilt_words).strip()
    output = sub(r" {2,}", " ", output)
    # Collapse a comma (and optional space) that is directly followed by
    # another comma.
    output = sub(r", ?(?=,)", "", output)

    # Close the utterance with a full stop when the text ends in a vowel letter.
    if vowel_pattern.match(text[-1]):
        output += "."

    return output


def lojban2ipa_vits(text: str) -> str:
    """Convert Lojban *text* to IPA using the ``ipa_vits`` table.

    Question words get a trailing ``?``. Returns ``""`` for input that is
    empty once whitespace and dots are removed.
    """
    return _lojban2ipa(text, ipa_vits, True)


def lojban2ipa_nix(text: str) -> str:
    """Convert Lojban *text* to IPA using the ``ipa_nix`` table.

    Unlike :func:`lojban2ipa_vits`, question words are not marked with ``?``.
    Returns ``""`` for input that is empty once whitespace and dots are
    removed.
    """
    return _lojban2ipa(text, ipa_nix, False)
353
+
354
+ # print(lojban2ipa("ni'o le pa tirxu be me'e zo .teris. pu ki kansa le za'u pendo be le nei le ka xabju le foldi be loi spati", "vits"))
styletts2importable.py CHANGED
@@ -136,6 +136,7 @@ sampler = DiffusionSampler(
136
  )
137
 
138
  LANG_NAMES = {
 
139
  'en-us': 'english',
140
  'cs': 'czech',
141
  'da': 'danish',
@@ -169,10 +170,17 @@ def inference(text, ref_s, lang='en-us', alpha = 0.3, beta = 0.7, diffusion_step
169
  if (ipa_sections is not None):
170
  text = re.sub(regex, '[]', text, 0, re.MULTILINE)
171
 
172
- local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
173
- ps = local_phonemizer.phonemize([text])
174
- ps = word_tokenize(ps[0], language=LANG_NAMES[lang])
175
- ps = ' '.join(ps)
 
 
 
 
 
 
 
176
 
177
  # add the IPA back
178
  if (ipa_sections is not None):
 
136
  )
137
 
138
  LANG_NAMES = {
139
+ # natural; supported by nltk
140
  'en-us': 'english',
141
  'cs': 'czech',
142
  'da': 'danish',
 
170
  if (ipa_sections is not None):
171
  text = re.sub(regex, '[]', text, 0, re.MULTILINE)
172
 
173
+ if lang in LANG_NAMES:
174
+ local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
175
+ ps = local_phonemizer.phonemize([text])
176
+ ps = word_tokenize(ps[0], language=LANG_NAMES[lang])
177
+ ps = ' '.join(ps)
178
+ elif lang == 'jb':
179
+ # Lojban language
180
+ import lojban
181
+ ps = lojban.lojban2ipa(text, 'vits')
182
+ else:
183
+ ps = text
184
 
185
  # add the IPA back
186
  if (ipa_sections is not None):