Pendrokar committed
Commit d1178af · 1 Parent(s): e2d5fd4

non-native langs

Files changed (2):
  1. app.py +34 -8
  2. styletts2importable.py +26 -4
app.py CHANGED
@@ -47,21 +47,24 @@ for v in voicelist:
 if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
 
 @spaces.GPU(duration=10)
-def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
+def synthesize(text, voice, lang, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
         raise gr.Error("Text must be <50k characters")
-    print("*** saying ***")
-    print(text)
-    print("*** end ***")
+    # print("*** saying ***")
+    # print(text)
+    # print("*** end ***")
     texts = txtsplit(text)
     v = voice.lower()
     audios = []
+    ipa_results = ''
     for t in progress.tqdm(texts):
         print(t)
-        audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
-    return (24000, np.concatenate(audios))
+        audio, ipa = styletts2importable.inference(t, voices[v], lang=lang, alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1)
+        audios.append(audio)
+        ipa_results += ipa
+    return (24000, np.concatenate(audios)), ipa_results
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
 #         if text.strip() == "":
@@ -99,7 +102,6 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
     print("*** end ***")
     texts = txtsplit(text)
     audios = []
-    # vs = styletts2importable.compute_style(voice)
     vs = styletts2importable.compute_style(voice)
     # print(vs)
     for t in progress.tqdm(texts):
@@ -135,12 +137,36 @@ with gr.Blocks() as vctk:
         with gr.Column(scale=1):
             inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
+            lang = gr.Dropdown(
+                [
+                    ['English', 'en-us'],
+                    ['Czech (Non-native)', 'cs'],
+                    ['Danish (Non-native)', 'da'],
+                    ['Dutch (Non-native)', 'nl'],
+                    ['Estonian (Non-native)', 'et'],
+                    ['Finnish (Non-native)', 'fi'],
+                    ['French (Non-native)', 'fr'],
+                    ['German (Non-native)', 'de'],
+                    ['Greek (Non-native)', 'el'],
+                    ['Italian (Non-native)', 'it'],
+                    ['Norwegian (Non-native)', 'no'],
+                    ['Polish (Non-native)', 'pl'],
+                    ['Portuguese (Non-native)', 'pt'],
+                    ['Russian (Non-native)', 'ru'],
+                    ['Slovene (Non-native)', 'sl'],
+                    ['Spanish (Non-native)', 'es'],
+                    ['Swedish (Non-native)', 'sv'],
+                    ['Turkish (Non-native)', 'tr'],
+                ],
+                label="Language",
+            )
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
             # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
-           btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
+           ipa_result = gr.Textbox(label="IPA", interactive=False)
+           btn.click(synthesize, inputs=[inp, voice, lang, multispeakersteps], outputs=[audio, ipa_result], concurrency_limit=4)
 with gr.Blocks() as clone:
     with gr.Row():
         with gr.Column(scale=1):
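
Note for reviewers: a Gradio Dropdown built from [label, value] pairs passes the value (here the espeak language code, e.g. 'cs'), not the display label, to the click handler, which is why synthesize() can forward lang straight to styletts2importable.inference(). A minimal standalone sketch of that behaviour, assuming a recent Gradio version (the demo scaffolding below is hypothetical, not part of this commit):

import gradio as gr

# Hypothetical demo (not from the commit): a Dropdown whose choices are
# [label, value] pairs hands the handler the value, not the label.
def echo_lang(lang):
    return f"espeak language code: {lang}"

with gr.Blocks() as demo:
    lang = gr.Dropdown(
        [['English', 'en-us'], ['Czech (Non-native)', 'cs']],
        label="Language",
        value='en-us',
    )
    btn = gr.Button("Show")
    out = gr.Textbox(label="Result", interactive=False)
    btn.click(echo_lang, inputs=[lang], outputs=[out])

if __name__ == "__main__":
    demo.launch()  # selecting "Czech (Non-native)" yields 'cs'

Since the commit's dropdown sets no value=, it may start unselected depending on the Gradio version; a lang of None would then reach inference(), so a default such as value='en-us' could be worth adding.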
styletts2importable.py CHANGED
@@ -135,7 +135,28 @@ sampler = DiffusionSampler(
     clamp=False
 )
 
-def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
+LANG_NAMES = {
+    'en-us': 'english',
+    'cs': 'czech',
+    'da': 'danish',
+    'nl': 'dutch',
+    'et': 'estonian',
+    'fi': 'finnish',
+    'fr': 'french',
+    'de': 'german',
+    'el': 'greek',
+    'it': 'italian',
+    'no': 'norwegian',
+    'pl': 'polish',
+    'pt': 'portuguese',
+    'ru': 'russian',
+    'sl': 'slovene',
+    'es': 'spanish',
+    'sv': 'swedish',
+    'tr': 'turkish',
+}
+
+def inference(text, ref_s, lang='en-us', alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     text = text.strip()
 
     # search for IPA within []
@@ -148,8 +169,9 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     if (ipa_sections is not None):
         text = re.sub(regex, '[]', text, 0, re.MULTILINE)
 
-    ps = global_phonemizer.phonemize([text])
-    ps = word_tokenize(ps[0])
+    local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
+    ps = local_phonemizer.phonemize([text])
+    ps = word_tokenize(ps[0], language=LANG_NAMES[lang])
     ps = ' '.join(ps)
 
     # add the IPA back
@@ -219,7 +241,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
                          F0_pred, N_pred, ref.squeeze().unsqueeze(0))
 
 
-    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
+    return out.squeeze().cpu().numpy()[..., :-50], ps # weird pulse at the end of the model, need to be fixed later
 
 def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     text = text.strip()
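
One side effect of the phonemizer change: inference() now constructs a fresh EspeakBackend on every call, re-initialising espeak-ng each time. A minimal sketch of the same phonemize-then-tokenize step with a cached per-language backend (get_phonemizer and phonemize_text are hypothetical helpers, not in the commit; LANG_NAMES is abridged, and NLTK's punkt data must be downloaded):

from functools import lru_cache

from nltk.tokenize import word_tokenize  # needs nltk.download('punkt')
from phonemizer.backend import EspeakBackend

# Abridged from the commit: NLTK's punkt tokenizer expects these
# lowercase English language names.
LANG_NAMES = {'en-us': 'english', 'cs': 'czech', 'de': 'german'}

@lru_cache(maxsize=None)
def get_phonemizer(lang):
    # Hypothetical cache: initialise espeak-ng once per language
    # rather than once per inference() call.
    return EspeakBackend(
        language=lang, preserve_punctuation=True, with_stress=True
    )

def phonemize_text(text, lang='en-us'):
    ps = get_phonemizer(lang).phonemize([text])  # text -> stressed IPA string
    tokens = word_tokenize(ps[0], language=LANG_NAMES[lang])
    return ' '.join(tokens)  # the space-joined `ps` that inference() returns

The per-call construction in the commit keeps the diff small; if the Space sees sustained traffic, caching along these lines would avoid the repeated backend start-up cost.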