adapt the version number to the release
Files changed:
- app.py +1 -6
- app_future.py +193 -0
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
 
 import spaces
 
-os.system("git clone --branch
+os.system("git clone --branch v3.0 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 os.system("mv toucan_codebase/* .")
 
 from run_model_downloader import download_models
@@ -39,7 +39,6 @@ class ControllableInterface(torch.nn.Module):
              language,
              accent,
              voice_seed,
-             prosody_creativity,
              duration_scaling_factor,
              pause_duration_scaling_factor,
              pitch_variance_scale,
@@ -114,7 +113,6 @@ class ControllableInterface(torch.nn.Module):
                                   energy_variance_scale=energy_variance_scale,
                                   pause_duration_scaling_factor=pause_duration_scaling_factor,
                                   return_plot_as_filepath=True,
-                                  prosody_creativity=prosody_creativity,
                                   loudness_in_db=loudness_in_db)
         return sr, wav, fig
 
@@ -132,7 +130,6 @@ controllable_ui = ControllableInterface(available_artificial_voices=available_ar
 def read(prompt,
          language,
          voice_seed,
-         prosody_creativity,
          duration_scaling_factor,
          pitch_variance_scale,
          energy_variance_scale,
@@ -147,7 +144,6 @@ def read(prompt,
                                             language.split(" ")[-1].split("(")[1].split(")")[0],
                                             language.split(" ")[-1].split("(")[1].split(")")[0],
                                             voice_seed,
-                                            prosody_creativity,
                                             duration_scaling_factor,
                                             1.,
                                             pitch_variance_scale,
@@ -177,7 +173,6 @@ iface = gr.Interface(fn=read,
                              gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                        value=279,
                                        label="Random Seed for the artificial Voice"),
-                             gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
                              gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
                              gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                              gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
app_future.py
ADDED
@@ -0,0 +1,193 @@
import os

import spaces

os.system("git clone --branch v3.1 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
os.system("mv toucan_codebase/* .")

from run_model_downloader import download_models

download_models()

import gradio as gr
import torch.cuda
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
from Utility.utils import float2pcm

import os

import torch

from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Utility.storage_config import MODELS_DIR


class ControllableInterface(torch.nn.Module):

    def __init__(self, available_artificial_voices=1000):
        super().__init__()
        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta")
        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        if self.current_language != language:
            self.model.set_phonemizer_language(language)
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(accent)
            self.current_accent = accent

        self.wgan.set_latent(voice_seed)
        controllability_vector = torch.tensor([emb_slider_1,
                                               emb_slider_2,
                                               emb_slider_3,
                                               emb_slider_4,
                                               emb_slider_5,
                                               emb_slider_6], dtype=torch.float32)
        embedding = self.wgan.modify_embed(controllability_vector)
        self.model.set_utterance_embedding(embedding=embedding)

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > 1800:
            if language == "deu":
                prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf."
            elif language == "ell":
                prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη."
            elif language == "spa":
                prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes."
            elif language == "fin":
                prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan."
            elif language == "rus":
                prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
            elif language == "hun":
                prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
            elif language == "nld":
                prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
            elif language == "fra":
                prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
            elif language == 'pol':
                prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
            elif language == 'por':
                prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
            elif language == 'ita':
                prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
            elif language == 'cmn':
                prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
            elif language == 'vie':
                prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
            else:
                prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
            if self.current_language != "eng":
                self.model.set_phonemizer_language("eng")
                self.current_language = "eng"
            if self.current_accent != "eng":
                self.model.set_accent_language("eng")
                self.current_accent = "eng"

        print(prompt)
        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
                                  return_plot_as_filepath=True,
                                  prosody_creativity=prosody_creativity,
                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig


title = "Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
available_artificial_voices = 1000
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]
controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)


@spaces.GPU
def read(prompt,
         language,
         voice_seed,
         prosody_creativity,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    if torch.cuda.is_available():
        controllable_ui.to("cuda")
        controllable_ui.device = "cuda"
    try:
        sr, wav, fig = controllable_ui.read(prompt,
                                            language.split(" ")[-1].split("(")[1].split(")")[0],
                                            language.split(" ")[-1].split("(")[1].split(")")[0],
                                            voice_seed,
                                            prosody_creativity,
                                            duration_scaling_factor,
                                            1.,
                                            pitch_variance_scale,
                                            energy_variance_scale,
                                            emb1,
                                            emb2,
                                            0.,
                                            0.,
                                            0.,
                                            0.,
                                            -24.)
    finally:
        controllable_ui.to("cpu")
        controllable_ui.device = "cpu"
    return (sr, float2pcm(wav)), fig


iface = gr.Interface(fn=read,
                     inputs=[gr.Textbox(lines=2,
                                        placeholder="write what you want the synthesis to read here...",
                                        value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
                                        label="Text input"),
                             gr.Dropdown(text_selection,
                                         type="value",
                                         value='English Text (eng)',
                                         label="Select the Language of the Text (type on your keyboard to find it quickly)"),
                             gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                       value=279,
                                       label="Random Seed for the artificial Voice"),
                             gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.7, label="Prosody Creativity"),
                             gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
                             ],
                     outputs=[gr.Audio(type="numpy", label="Speech"),
                              gr.Image(label="Visualization")],
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)
iface.launch()
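For reference: both read() wrappers recover the ISO 639-3 code from the selected dropdown string, whose entries are built as f"{iso_to_name[iso_code]} Text ({iso_code})" (e.g. "English Text (eng)"). A standalone sketch of that parsing step, with a hypothetical helper name that is not part of the app:

# Sketch of the ISO-code extraction chain used in read() above.
# extract_iso_code is a hypothetical name for illustration only.
def extract_iso_code(selection: str) -> str:
    # "English Text (eng)" -> "(eng)" -> "eng)" -> "eng"
    return selection.split(" ")[-1].split("(")[1].split(")")[0]

assert extract_iso_code("English Text (eng)") == "eng"
assert extract_iso_code("Some Language Text (xyz)") == "xyz"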