Spaces: Running on Zero
add voice cloning interface
Browse files
- InferenceInterfaces/ControllableInterface.py +13 -9
- app.py +10 -11
InferenceInterfaces/ControllableInterface.py
CHANGED
@@ -23,6 +23,7 @@ class ControllableInterface:
|
|
23 |
|
24 |
def read(self,
|
25 |
prompt,
|
|
|
26 |
voice_seed,
|
27 |
prosody_creativity,
|
28 |
duration_scaling_factor,
|
@@ -37,15 +38,18 @@ class ControllableInterface:
|
|
37 |
emb_slider_6,
|
38 |
loudness_in_db
|
39 |
):
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
49 |
|
50 |
phones = self.model.text2phone.get_phone_string(prompt)
|
51 |
if len(phones) > 1800:
|
|
|
23 |
|
24 |
def read(self,
|
25 |
prompt,
|
26 |
+
audio,
|
27 |
voice_seed,
|
28 |
prosody_creativity,
|
29 |
duration_scaling_factor,
|
|
|
38 |
emb_slider_6,
|
39 |
loudness_in_db
|
40 |
):
|
41 |
+
if audio is None:
|
42 |
+
self.wgan.set_latent(voice_seed)
|
43 |
+
controllability_vector = torch.tensor([emb_slider_1,
|
44 |
+
emb_slider_2,
|
45 |
+
emb_slider_3,
|
46 |
+
emb_slider_4,
|
47 |
+
emb_slider_5,
|
48 |
+
emb_slider_6], dtype=torch.float32)
|
49 |
+
embedding = self.wgan.modify_embed(controllability_vector)
|
50 |
+
self.model.set_utterance_embedding(embedding=embedding)
|
51 |
+
else:
|
52 |
+
self.model.set_utterance_embedding(path_to_reference_audio=audio)
|
53 |
|
54 |
phones = self.model.text2phone.get_phone_string(prompt)
|
55 |
if len(phones) > 1800:
|
app.py
CHANGED
@@ -15,15 +15,14 @@ class TTSWebUI:
|
|
15 |
placeholder="write what you want the synthesis to read here...",
|
16 |
value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
|
17 |
label="Text input"),
|
|
|
|
|
|
|
18 |
gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
|
19 |
value=279,
|
20 |
label="Random Seed for the artificial Voice"),
|
21 |
-
gr.Slider(minimum
|
22 |
-
gr.Slider(minimum
|
23 |
-
gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
|
24 |
-
gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
|
25 |
-
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
|
26 |
-
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
|
27 |
],
|
28 |
outputs=[gr.Audio(type="numpy", label="Speech"),
|
29 |
gr.Image(label="Visualization")],
|
@@ -35,21 +34,21 @@ class TTSWebUI:
|
|
35 |
|
36 |
def read(self,
|
37 |
prompt,
|
38 |
-
|
39 |
prosody_creativity,
|
40 |
duration_scaling_factor,
|
41 |
-
|
42 |
-
energy_variance_scale,
|
43 |
emb1,
|
44 |
emb2
|
45 |
):
|
46 |
sr, wav, fig = self.controllable_ui.read(prompt,
|
|
|
47 |
voice_seed,
|
48 |
prosody_creativity,
|
49 |
duration_scaling_factor,
|
50 |
1.,
|
51 |
-
|
52 |
-
|
53 |
emb1,
|
54 |
emb2,
|
55 |
0.,
|
|
|
15 |
placeholder="write what you want the synthesis to read here...",
|
16 |
value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
|
17 |
label="Text input"),
|
18 |
+
gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
|
19 |
+
gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
|
20 |
+
gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
|
21 |
gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
|
22 |
value=279,
|
23 |
label="Random Seed for the artificial Voice"),
|
24 |
+
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity of artificial Voice"),
|
25 |
+
gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth of artificial Voice")
|
|
|
|
|
|
|
|
|
26 |
],
|
27 |
outputs=[gr.Audio(type="numpy", label="Speech"),
|
28 |
gr.Image(label="Visualization")],
|
|
|
34 |
|
35 |
def read(self,
|
36 |
prompt,
|
37 |
+
audio,
|
38 |
prosody_creativity,
|
39 |
duration_scaling_factor,
|
40 |
+
voice_seed,
|
|
|
41 |
emb1,
|
42 |
emb2
|
43 |
):
|
44 |
sr, wav, fig = self.controllable_ui.read(prompt,
|
45 |
+
audio,
|
46 |
voice_seed,
|
47 |
prosody_creativity,
|
48 |
duration_scaling_factor,
|
49 |
1.,
|
50 |
+
1.,
|
51 |
+
1.,
|
52 |
emb1,
|
53 |
emb2,
|
54 |
0.,
|