Pendrokar committed
Commit d1178af · 1 Parent(s): e2d5fd4

non-native langs

Files changed (2):
  1. app.py +34 -8
  2. styletts2importable.py +26 -4
app.py CHANGED
@@ -47,21 +47,24 @@ for v in voicelist:
 if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
 
 @spaces.GPU(duration=10)
-def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
+def synthesize(text, voice, lang, lngsteps, password, progress=gr.Progress()):
     if text.strip() == "":
         raise gr.Error("You must enter some text")
     if len(text) > 50000:
         raise gr.Error("Text must be <50k characters")
-    print("*** saying ***")
-    print(text)
-    print("*** end ***")
+    # print("*** saying ***")
+    # print(text)
+    # print("*** end ***")
     texts = txtsplit(text)
     v = voice.lower()
     audios = []
+    ipa_results = ''
     for t in progress.tqdm(texts):
         print(t)
-        audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
-    return (24000, np.concatenate(audios))
+        audio, ipa = styletts2importable.inference(t, voices[v], lang=lang, alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1)
+        audios.append(audio)
+        ipa_results += ipa
+    return (24000, np.concatenate(audios)), ipa_results
 # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
 #     if password == os.environ['ACCESS_CODE']:
 #         if text.strip() == "":
@@ -99,7 +102,6 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
     print("*** end ***")
     texts = txtsplit(text)
     audios = []
-    # vs = styletts2importable.compute_style(voice)
     vs = styletts2importable.compute_style(voice)
     # print(vs)
     for t in progress.tqdm(texts):
@@ -135,12 +137,36 @@ with gr.Blocks() as vctk:
         with gr.Column(scale=1):
             inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
+            lang = gr.Dropdown(
+                [
+                    ['English', 'en-us'],
+                    ['Czech (Non-native)', 'cs'],
+                    ['Danish (Non-native)', 'da'],
+                    ['Dutch (Non-native)', 'nl'],
+                    ['Estonian (Non-native)', 'et'],
+                    ['Finnish (Non-native)', 'fi'],
+                    ['French (Non-native)', 'fr'],
+                    ['German (Non-native)', 'de'],
+                    ['Greek (Non-native)', 'el'],
+                    ['Italian (Non-native)', 'it'],
+                    ['Norwegian (Non-native)', 'no'],
+                    ['Polish (Non-native)', 'pl'],
+                    ['Portuguese (Non-native)', 'pt'],
+                    ['Russian (Non-native)', 'ru'],
+                    ['Slovene (Non-native)', 'sl'],
+                    ['Spanish (Non-native)', 'es'],
+                    ['Swedish (Non-native)', 'sv'],
+                    ['Turkish (Non-native)', 'tr'],
+                ],
+                label="Language",
+            )
             multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
             # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
         with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
-           btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
+           ipa_result = gr.Textbox(label="IPA", interactive=False)
+           btn.click(synthesize, inputs=[inp, voice, lang, multispeakersteps], outputs=[audio, ipa_result], concurrency_limit=4)
 with gr.Blocks() as clone:
     with gr.Row():
         with gr.Column(scale=1):
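
Note for reviewers: a Gradio Dropdown built from [label, value] pairs passes the value (here the espeak language code, e.g. 'cs'), not the display label, to the click handler, which is why synthesize() can forward lang straight to styletts2importable.inference(). A minimal standalone sketch of that behaviour, assuming a recent Gradio version (the demo scaffolding below is hypothetical, not part of this commit):

import gradio as gr

# Hypothetical demo (not from the commit): a Dropdown whose choices are
# [label, value] pairs hands the handler the value, not the label.
def echo_lang(lang):
    return f"espeak language code: {lang}"

with gr.Blocks() as demo:
    lang = gr.Dropdown(
        [['English', 'en-us'], ['Czech (Non-native)', 'cs']],
        label="Language",
        value='en-us',
    )
    btn = gr.Button("Show")
    out = gr.Textbox(label="Result", interactive=False)
    btn.click(echo_lang, inputs=[lang], outputs=[out])

if __name__ == "__main__":
    demo.launch()  # selecting "Czech (Non-native)" yields 'cs'

Since the commit's dropdown sets no value=, it may start unselected depending on the Gradio version; a lang of None would then reach inference(), so a default such as value='en-us' could be worth adding.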
styletts2importable.py CHANGED
@@ -135,7 +135,28 @@ sampler = DiffusionSampler(
     clamp=False
 )
 
-def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
+LANG_NAMES = {
+    'en-us': 'english',
+    'cs': 'czech',
+    'da': 'danish',
+    'nl': 'dutch',
+    'et': 'estonian',
+    'fi': 'finnish',
+    'fr': 'french',
+    'de': 'german',
+    'el': 'greek',
+    'it': 'italian',
+    'no': 'norwegian',
+    'pl': 'polish',
+    'pt': 'portuguese',
+    'ru': 'russian',
+    'sl': 'slovene',
+    'es': 'spanish',
+    'sv': 'swedish',
+    'tr': 'turkish',
+}
+
+def inference(text, ref_s, lang='en-us', alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     text = text.strip()
 
     # search for IPA within []
@@ -148,8 +169,9 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     if (ipa_sections is not None):
         text = re.sub(regex, '[]', text, 0, re.MULTILINE)
 
-    ps = global_phonemizer.phonemize([text])
-    ps = word_tokenize(ps[0])
+    local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
+    ps = local_phonemizer.phonemize([text])
+    ps = word_tokenize(ps[0], language=LANG_NAMES[lang])
     ps = ' '.join(ps)
 
     # add the IPA back
@@ -219,7 +241,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
                          F0_pred, N_pred, ref.squeeze().unsqueeze(0))
 
 
-    return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
+    return out.squeeze().cpu().numpy()[..., :-50], ps # weird pulse at the end of the model, need to be fixed later
 
 def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
     text = text.strip()
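
One side effect of the phonemizer change: inference() now constructs a fresh EspeakBackend on every call, re-initialising espeak-ng each time. A minimal sketch of the same phonemize-then-tokenize step with a cached per-language backend (get_phonemizer and phonemize_text are hypothetical helpers, not in the commit; LANG_NAMES is abridged, and NLTK's punkt data must be downloaded):

from functools import lru_cache

from nltk.tokenize import word_tokenize  # needs nltk.download('punkt')
from phonemizer.backend import EspeakBackend

# Abridged from the commit: NLTK's punkt tokenizer expects these
# lowercase English language names.
LANG_NAMES = {'en-us': 'english', 'cs': 'czech', 'de': 'german'}

@lru_cache(maxsize=None)
def get_phonemizer(lang):
    # Hypothetical cache: initialise espeak-ng once per language
    # rather than once per inference() call.
    return EspeakBackend(
        language=lang, preserve_punctuation=True, with_stress=True
    )

def phonemize_text(text, lang='en-us'):
    ps = get_phonemizer(lang).phonemize([text])  # text -> stressed IPA string
    tokens = word_tokenize(ps[0], language=LANG_NAMES[lang])
    return ' '.join(tokens)  # the space-joined `ps` that inference() returns

The per-call construction in the commit keeps the diff small; if the Space sees sustained traffic, caching along these lines would avoid the repeated backend start-up cost.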