Flux9665 committed on
Commit
23208c6
1 Parent(s): 0ebcf15

add voice cloning interface

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -23,6 +23,7 @@ class ControllableInterface:
23
 
24
  def read(self,
25
  prompt,
 
26
  voice_seed,
27
  prosody_creativity,
28
  duration_scaling_factor,
@@ -37,15 +38,18 @@ class ControllableInterface:
37
  emb_slider_6,
38
  loudness_in_db
39
  ):
40
- self.wgan.set_latent(voice_seed)
41
- controllability_vector = torch.tensor([emb_slider_1,
42
- emb_slider_2,
43
- emb_slider_3,
44
- emb_slider_4,
45
- emb_slider_5,
46
- emb_slider_6], dtype=torch.float32)
47
- embedding = self.wgan.modify_embed(controllability_vector)
48
- self.model.set_utterance_embedding(embedding=embedding)
 
 
 
49
 
50
  phones = self.model.text2phone.get_phone_string(prompt)
51
  if len(phones) > 1800:
 
23
 
24
  def read(self,
25
  prompt,
26
+ audio,
27
  voice_seed,
28
  prosody_creativity,
29
  duration_scaling_factor,
 
38
  emb_slider_6,
39
  loudness_in_db
40
  ):
41
+ if audio is None:
42
+ self.wgan.set_latent(voice_seed)
43
+ controllability_vector = torch.tensor([emb_slider_1,
44
+ emb_slider_2,
45
+ emb_slider_3,
46
+ emb_slider_4,
47
+ emb_slider_5,
48
+ emb_slider_6], dtype=torch.float32)
49
+ embedding = self.wgan.modify_embed(controllability_vector)
50
+ self.model.set_utterance_embedding(embedding=embedding)
51
+ else:
52
+ self.model.set_utterance_embedding(path_to_reference_audio=audio)
53
 
54
  phones = self.model.text2phone.get_phone_string(prompt)
55
  if len(phones) > 1800:
app.py CHANGED
@@ -15,15 +15,14 @@ class TTSWebUI:
15
  placeholder="write what you want the synthesis to read here...",
16
  value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
17
  label="Text input"),
 
 
 
18
  gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
19
  value=279,
20
  label="Random Seed for the artificial Voice"),
21
- gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
22
- gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
23
- gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
24
- gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
25
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
26
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
27
  ],
28
  outputs=[gr.Audio(type="numpy", label="Speech"),
29
  gr.Image(label="Visualization")],
@@ -35,21 +34,21 @@ class TTSWebUI:
35
 
36
  def read(self,
37
  prompt,
38
- voice_seed,
39
  prosody_creativity,
40
  duration_scaling_factor,
41
- pitch_variance_scale,
42
- energy_variance_scale,
43
  emb1,
44
  emb2
45
  ):
46
  sr, wav, fig = self.controllable_ui.read(prompt,
 
47
  voice_seed,
48
  prosody_creativity,
49
  duration_scaling_factor,
50
  1.,
51
- pitch_variance_scale,
52
- energy_variance_scale,
53
  emb1,
54
  emb2,
55
  0.,
 
15
  placeholder="write what you want the synthesis to read here...",
16
  value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
17
  label="Text input"),
18
+ gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
19
+ gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
20
+ gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
21
  gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
22
  value=279,
23
  label="Random Seed for the artificial Voice"),
24
+ gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity of artificial Voice"),
25
+ gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth of artificial Voice")
 
 
 
 
26
  ],
27
  outputs=[gr.Audio(type="numpy", label="Speech"),
28
  gr.Image(label="Visualization")],
 
34
 
35
  def read(self,
36
  prompt,
37
+ audio,
38
  prosody_creativity,
39
  duration_scaling_factor,
40
+ voice_seed,
 
41
  emb1,
42
  emb2
43
  ):
44
  sr, wav, fig = self.controllable_ui.read(prompt,
45
+ audio,
46
  voice_seed,
47
  prosody_creativity,
48
  duration_scaling_factor,
49
  1.,
50
+ 1.,
51
+ 1.,
52
  emb1,
53
  emb2,
54
  0.,