Spaces:

Flux9665
/

MassivelyMultilingualTTS

Running on T4

App Files Files

Flux9665 committed on Jun 7, 2024

Commit

185fc75

1 Parent(s): 52d3547

implement a ZeroGPU-compatible solution (hopefully)

Browse files

Files changed (2) hide show

app.py +78 -3
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -1,11 +1,86 @@
 import os
-import torch
-os.system("git clone --branch v3.1 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
 from run_model_downloader import download_models
-from run_GUI_demo import TTSWebUI
 download_models()
 TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu")

 import os
+import spaces
+os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
 from run_model_downloader import download_models
 download_models()
+import gradio as gr
+import torch.cuda
+from InferenceInterfaces.ControllableInterface import ControllableInterface
+from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
+from Utility.utils import float2pcm
+class TTSWebUI:
+    def __init__(self, gpu_id="cpu", title="Controllable Text-to-Speech for over 7000 Languages", article="", available_artificial_voices=1000, path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json"):
+        iso_to_name = load_json_from_path(path_to_iso_list)
+        text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
+        # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]
+        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
+                                                     available_artificial_voices=available_artificial_voices)
+        self.iface = gr.Interface(fn=self.read,
+                                  inputs=[gr.Textbox(lines=2,
+                                                     placeholder="write what you want the synthesis to read here...",
+                                                     value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
+                                                     label="Text input"),
+                                          gr.Dropdown(text_selection,
+                                                      type="value",
+                                                      value='English Text (eng)',
+                                                      label="Select the Language of the Text (type on your keyboard to find it quickly)"),
+                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
+                                                    value=279,
+                                                    label="Random Seed for the artificial Voice"),
+                                          gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
+                                          gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
+                                          gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
+                                          gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
+                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
+                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
+                                          ],
+                                  outputs=[gr.Audio(type="numpy", label="Speech"),
+                                           gr.Image(label="Visualization")],
+                                  title=title,
+                                  theme="default",
+                                  allow_flagging="never",
+                                  article=article)
+        self.iface.launch()
+    @spaces.GPU
+    def read(self,
+             prompt,
+             language,
+             voice_seed,
+             prosody_creativity,
+             duration_scaling_factor,
+             pitch_variance_scale,
+             energy_variance_scale,
+             emb1,
+             emb2
+             ):
+        sr, wav, fig = self.controllable_ui.read(prompt,
+                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
+                                                 language.split(" ")[-1].split("(")[1].split(")")[0],
+                                                 voice_seed,
+                                                 prosody_creativity,
+                                                 duration_scaling_factor,
+                                                 1.,
+                                                 pitch_variance_scale,
+                                                 energy_variance_scale,
+                                                 emb1,
+                                                 emb2,
+                                                 0.,
+                                                 0.,
+                                                 0.,
+                                                 0.,
+                                                 -24.)
+        return (sr, float2pcm(wav)), fig
 TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu")

requirements.txt CHANGED Viewed

@@ -2,7 +2,6 @@ torch_complex~=0.4.3
 tqdm~=4.64.1
 scipy~=1.9.3
 librosa~=0.9.2
-scikit-learn~=1.1.3
 praat-parselmouth~=0.4.2
 torch~=2.3.0
 numpy~=1.23.4

 tqdm~=4.64.1
 scipy~=1.9.3
 librosa~=0.9.2
 praat-parselmouth~=0.4.2
 torch~=2.3.0
 numpy~=1.23.4