Flux9665 committed on
Commit
28ea968
·
1 Parent(s): 790d17c

simplify and update to current model

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -16,55 +16,24 @@ class ControllableInterface:
16
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
17
  os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
18
  self.device = "cuda" if gpu_id != "cpu" else "cpu"
19
- self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta")
20
- self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
21
- self.generated_speaker_embeds = list()
22
- self.available_artificial_voices = available_artificial_voices
23
- self.current_language = ""
24
- self.current_accent = ""
25
 
26
  def read(self,
27
  prompt,
28
- reference_audio,
29
- voice_seed,
30
- prosody_creativity,
31
- duration_scaling_factor,
32
- pause_duration_scaling_factor,
33
- pitch_variance_scale,
34
- energy_variance_scale,
35
- emb_slider_1,
36
- emb_slider_2,
37
- emb_slider_3,
38
- emb_slider_4,
39
- emb_slider_5,
40
- emb_slider_6,
41
  loudness_in_db
42
  ):
43
- if reference_audio is None:
44
- self.wgan.set_latent(voice_seed)
45
- controllability_vector = torch.tensor([emb_slider_1,
46
- emb_slider_2,
47
- emb_slider_3,
48
- emb_slider_4,
49
- emb_slider_5,
50
- emb_slider_6], dtype=torch.float32)
51
- embedding = self.wgan.modify_embed(controllability_vector)
52
- self.model.set_utterance_embedding(embedding=embedding)
53
- else:
54
- self.model.set_utterance_embedding(reference_audio)
55
-
56
  phones = self.model.text2phone.get_phone_string(prompt)
57
  if len(phones) > 1800:
58
  prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
59
 
60
  print(prompt + "\n\n")
61
- wav, sr, fig = self.model(prompt,
62
  input_is_phones=False,
63
- duration_scaling_factor=duration_scaling_factor,
64
- pitch_variance_scale=pitch_variance_scale,
65
- energy_variance_scale=energy_variance_scale,
66
- pause_duration_scaling_factor=pause_duration_scaling_factor,
67
- return_plot_as_filepath=True,
68
- prosody_creativity=prosody_creativity,
69
  loudness_in_db=loudness_in_db)
70
- return sr, wav, fig
 
16
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
17
  os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
18
  self.device = "cuda" if gpu_id != "cpu" else "cpu"
19
+ self.model = ToucanTTSInterface(device=self.device, tts_model_path=None)
 
 
 
 
 
20
 
21
  def read(self,
22
  prompt,
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  loudness_in_db
24
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  phones = self.model.text2phone.get_phone_string(prompt)
26
  if len(phones) > 1800:
27
  prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
28
 
29
  print(prompt + "\n\n")
30
+ wav, sr = self.model(prompt,
31
  input_is_phones=False,
32
+ duration_scaling_factor=1.0,
33
+ pitch_variance_scale=1.0,
34
+ energy_variance_scale=1.0,
35
+ pause_duration_scaling_factor=1.0,
36
+ return_plot_as_filepath=False,
37
+ prosody_creativity=0.5,
38
  loudness_in_db=loudness_in_db)
39
+ return sr, wav
InferenceInterfaces/ToucanTTSInterface.py CHANGED
@@ -7,6 +7,7 @@ import pyloudnorm
7
  import sounddevice
8
  import soundfile
9
  import torch
 
10
  from speechbrain.pretrained import EncoderClassifier
11
  from torchaudio.transforms import Resample
12
 
@@ -24,16 +25,14 @@ class ToucanTTSInterface(torch.nn.Module):
24
 
25
  def __init__(self,
26
  device="cpu", # device that everything computes on. If a cuda device is available, this can speed things up by an order of magnitude.
27
- tts_model_path=os.path.join(MODELS_DIR, f"ToucanTTS_Meta", "best.pt"), # path to the ToucanTTS checkpoint or just a shorthand if run standalone
28
- vocoder_model_path=os.path.join(MODELS_DIR, f"Vocoder", "best.pt"), # path to the Vocoder checkpoint
29
  language="eng", # initial language of the model, can be changed later with the setter methods
30
  ):
31
  super().__init__()
32
  self.device = device
33
- if not tts_model_path.endswith(".pt"):
34
- # default to shorthand system
35
- tts_model_path = os.path.join(MODELS_DIR, f"ToucanTTS_{tts_model_path}", "best.pt")
36
-
37
  ################################
38
  # build text to phone #
39
  ################################
 
7
  import sounddevice
8
  import soundfile
9
  import torch
10
+ from huggingface_hub import hf_hub_download
11
  from speechbrain.pretrained import EncoderClassifier
12
  from torchaudio.transforms import Resample
13
 
 
25
 
26
  def __init__(self,
27
  device="cpu", # device that everything computes on. If a cuda device is available, this can speed things up by an order of magnitude.
28
+ tts_model_path=None, # path to the ToucanTTS checkpoint or just a shorthand if run standalone
29
+ vocoder_model_path=None, # path to the Vocoder checkpoint
30
  language="eng", # initial language of the model, can be changed later with the setter methods
31
  ):
32
  super().__init__()
33
  self.device = device
34
+ tts_model_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="EnglishToucanTTS.pt")
35
+ vocoder_model_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="Vocoder.pt")
 
 
36
  ################################
37
  # build text to phone #
38
  ################################
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: StochasticToucanTTS
3
  emoji: 🦜
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 4.37.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: SimpleToucanTTS
3
  emoji: 🦜
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.3
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py CHANGED
@@ -3,68 +3,27 @@ import torch.cuda
3
 
4
  from InferenceInterfaces.ControllableInterface import ControllableInterface
5
  from Utility.utils import float2pcm
6
- from Utility.utils import load_json_from_path
7
 
8
 
9
  class TTSWebUI:
10
 
11
- def __init__(self, gpu_id="cpu", title="Stochastic Speech Synthesis with ToucanTTS", article="For a multilingual version, have a look at https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS", available_artificial_voices=1000, path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json"):
12
- # iso_to_name = load_json_from_path(path_to_iso_list)
13
- # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]
14
-
15
- self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
16
- available_artificial_voices=available_artificial_voices)
17
  self.iface = gr.Interface(fn=self.read,
18
  inputs=[gr.Textbox(lines=2,
19
  placeholder="write what you want the synthesis to read here...",
20
  value="What I cannot create, I do not understand.",
21
- label="Text input"),
22
- gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
23
- gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
24
- value=279,
25
- label="Random Seed for the artificial Voice"),
26
- gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.1, label="Prosody Creativity"),
27
- gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
28
- # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
29
- # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
30
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
31
- gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
32
- ],
33
- outputs=[gr.Audio(type="numpy", label="Speech"),
34
- gr.Image(label="Visualization")],
35
  title=title,
36
  theme="default",
37
  allow_flagging="never",
38
  article=article)
39
  self.iface.launch()
40
 
41
- def read(self,
42
- prompt,
43
- reference_audio,
44
- voice_seed,
45
- prosody_creativity,
46
- duration_scaling_factor,
47
- # pitch_variance_scale,
48
- # energy_variance_scale,
49
- emb1,
50
- emb2
51
- ):
52
- sr, wav, fig = self.controllable_ui.read(prompt,
53
- reference_audio,
54
- voice_seed,
55
- prosody_creativity,
56
- duration_scaling_factor,
57
- 1.,
58
- 1.0,
59
- 1.0,
60
- emb1,
61
- emb2,
62
- 0.,
63
- 0.,
64
- 0.,
65
- 0.,
66
- -20.)
67
- return (sr, float2pcm(wav)), fig
68
 
69
 
70
  if __name__ == '__main__':
 
3
 
4
  from InferenceInterfaces.ControllableInterface import ControllableInterface
5
  from Utility.utils import float2pcm
 
6
 
7
 
8
  class TTSWebUI:
9
 
10
+ def __init__(self, gpu_id="cpu", title="Simplistic Stochastic Speech Synthesis with ToucanTTS", article="For a multilingual version, have a look at https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS"):
11
+ self.controllable_ui = ControllableInterface(gpu_id=gpu_id)
 
 
 
 
12
  self.iface = gr.Interface(fn=self.read,
13
  inputs=[gr.Textbox(lines=2,
14
  placeholder="write what you want the synthesis to read here...",
15
  value="What I cannot create, I do not understand.",
16
+ label="Text input")],
17
+ outputs=[gr.Audio(type="numpy", label="Speech")],
 
 
 
 
 
 
 
 
 
 
 
 
18
  title=title,
19
  theme="default",
20
  allow_flagging="never",
21
  article=article)
22
  self.iface.launch()
23
 
24
+ def read(self, prompt):
25
+ sr, wav = self.controllable_ui.read(prompt, -24.)
26
+ return sr, float2pcm(wav)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
  if __name__ == '__main__':
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ