Flux9665 committed on
Commit
23208c6
1 Parent(s): 0ebcf15

add voice cloning interface

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -23,6 +23,7 @@ class ControllableInterface:
23
 
24
  def read(self,
25
  prompt,
 
26
  voice_seed,
27
  prosody_creativity,
28
  duration_scaling_factor,
@@ -37,15 +38,18 @@ class ControllableInterface:
37
  emb_slider_6,
38
  loudness_in_db
39
  ):
40
- self.wgan.set_latent(voice_seed)
41
- controllability_vector = torch.tensor([emb_slider_1,
42
- emb_slider_2,
43
- emb_slider_3,
44
- emb_slider_4,
45
- emb_slider_5,
46
- emb_slider_6], dtype=torch.float32)
47
- embedding = self.wgan.modify_embed(controllability_vector)
48
- self.model.set_utterance_embedding(embedding=embedding)
 
 
 
49
 
50
  phones = self.model.text2phone.get_phone_string(prompt)
51
  if len(phones) > 1800:
 
23
 
24
  def read(self,
25
  prompt,
26
+ audio,
27
  voice_seed,
28
  prosody_creativity,
29
  duration_scaling_factor,
 
38
  emb_slider_6,
39
  loudness_in_db
40
  ):
41
+ if audio is None:
42
+ self.wgan.set_latent(voice_seed)
43
+ controllability_vector = torch.tensor([emb_slider_1,
44
+ emb_slider_2,
45
+ emb_slider_3,
46
+ emb_slider_4,
47
+ emb_slider_5,
48
+ emb_slider_6], dtype=torch.float32)
49
+ embedding = self.wgan.modify_embed(controllability_vector)
50
+ self.model.set_utterance_embedding(embedding=embedding)
51
+ else:
52
+ self.model.set_utterance_embedding(path_to_reference_audio=audio)
53
 
54
  phones = self.model.text2phone.get_phone_string(prompt)
55
  if len(phones) > 1800:
app.py CHANGED
@@ -15,15 +15,14 @@ class TTSWebUI:
15
  placeholder="write what you want the synthesis to read here...",
16
  value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
17
  label="Text input"),
 
 
 
18
  gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
19
  value=279,
20
  label="Random Seed for the artificial Voice"),
21
- gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
22
- gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
23
- gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
24
- gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
25
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
26
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
27
  ],
28
  outputs=[gr.Audio(type="numpy", label="Speech"),
29
  gr.Image(label="Visualization")],
@@ -35,21 +34,21 @@ class TTSWebUI:
35
 
36
  def read(self,
37
  prompt,
38
- voice_seed,
39
  prosody_creativity,
40
  duration_scaling_factor,
41
- pitch_variance_scale,
42
- energy_variance_scale,
43
  emb1,
44
  emb2
45
  ):
46
  sr, wav, fig = self.controllable_ui.read(prompt,
 
47
  voice_seed,
48
  prosody_creativity,
49
  duration_scaling_factor,
50
  1.,
51
- pitch_variance_scale,
52
- energy_variance_scale,
53
  emb1,
54
  emb2,
55
  0.,
 
15
  placeholder="write what you want the synthesis to read here...",
16
  value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
17
  label="Text input"),
18
+ gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
19
+ gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
20
+ gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
21
  gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
22
  value=279,
23
  label="Random Seed for the artificial Voice"),
24
+ gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity of artificial Voice"),
25
+ gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth of artificial Voice")
 
 
 
 
26
  ],
27
  outputs=[gr.Audio(type="numpy", label="Speech"),
28
  gr.Image(label="Visualization")],
 
34
 
35
  def read(self,
36
  prompt,
37
+ audio,
38
  prosody_creativity,
39
  duration_scaling_factor,
40
+ voice_seed,
 
41
  emb1,
42
  emb2
43
  ):
44
  sr, wav, fig = self.controllable_ui.read(prompt,
45
+ audio,
46
  voice_seed,
47
  prosody_creativity,
48
  duration_scaling_factor,
49
  1.,
50
+ 1.,
51
+ 1.,
52
  emb1,
53
  emb2,
54
  0.,