Spaces: Running on T4

Commit: try to figure out how ZeroGPU works

Files changed:
- InferenceInterfaces/ToucanTTSInterface.py (+8 -1)
- app.py (+1 -14)
InferenceInterfaces/ToucanTTSInterface.py

@@ -8,7 +8,7 @@ import pyloudnorm
 import sounddevice
 import soundfile
 import torch
-
+import spaces
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     from speechbrain.pretrained import EncoderClassifier
@@ -127,6 +127,7 @@ class ToucanTTSInterface(torch.nn.Module):

         self.lang_id = get_language_id(lang_id).to(self.device)

+    @spaces.GPU
     def forward(self,
                 text,
                 view=False,
@@ -152,6 +153,10 @@ class ToucanTTSInterface(torch.nn.Module):
         1.0 means no scaling happens, higher values increase variance of the energy curve,
         lower values decrease variance of the energy curve.
         """
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
+        self.to(device)
+
         with torch.inference_mode():
             phones = self.text2phone.string_to_tensor(text, input_phonemes=input_is_phones).to(torch.device(self.device))
             mel, durations, pitch, energy = self.phone2mel(phones,
@@ -223,6 +228,8 @@ class ToucanTTSInterface(torch.nn.Module):
             if return_plot_as_filepath:
                 plt.savefig("tmp.png")
                 return wave, sr, "tmp.png"
+        self.to("cpu")
+        self.device = "cpu"
         return wave, sr

     def read_to_file(self,
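Taken together, the changes in this file are the standard ZeroGPU recipe: import the `spaces` package, decorate the GPU-bound entry point with `@spaces.GPU`, move the module to CUDA only inside the decorated call, and park it back on the CPU before returning, since ZeroGPU attaches a GPU only for the duration of that call. A minimal sketch of the same pattern in isolation (`TinyModel` is a made-up stand-in for ToucanTTSInterface, not code from this repo):

import spaces  # Hugging Face ZeroGPU helper; intended to be a pass-through off Spaces
import torch

class TinyModel(torch.nn.Module):
    """Illustrative stand-in for a model whose forward pass needs a GPU."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 4)

    @spaces.GPU  # a GPU is attached only while this call is running
    def forward(self, x):
        # CUDA is unavailable outside the decorated call, so the move
        # to the GPU has to happen here rather than in __init__.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.to(device)
        out = self.layer(x.to(device)).cpu()
        self.to("cpu")  # hand the GPU back before the allocation ends
        return out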
app.py

@@ -35,7 +35,6 @@ class ControllableInterface(torch.nn.Module):
         self.model.device = "cpu"
         self.wgan.to("cpu")
         self.wgan.device = "cpu"
-        self._modules = []

     def read(self,
              prompt,
@@ -123,7 +122,6 @@ class ControllableInterface(torch.nn.Module):



-@spaces.GPU
 def read(prompt,
          language,
          voice_seed,
@@ -133,13 +131,7 @@ def read(prompt,
          emb1,
          emb2
          ):
-
-    controllable_ui.to("cuda")
-    controllable_ui.device = "cuda"
-    controllable_ui.model.device = "cuda"
-    controllable_ui.wgan.device = "cuda"
-    try:
-        sr, wav, fig = controllable_ui.read(prompt,
+    sr, wav, fig = controllable_ui.read(prompt,
                                         language.split(" ")[-1].split("(")[1].split(")")[0],
                                         language.split(" ")[-1].split("(")[1].split(")")[0],
                                         voice_seed,
@@ -154,11 +146,6 @@ def read(prompt,
                                         0.,
                                         0.,
                                         -24.)
-    finally:
-        controllable_ui.to("cpu")
-        controllable_ui.device = "cpu"
-        controllable_ui.model.device = "cpu"
-        controllable_ui.wgan.device = "cpu"
     return (sr, float2pcm(wav)), fig
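The app.py side is the mirror image: with `@spaces.GPU` now living on `ToucanTTSInterface.forward`, the Gradio handler no longer needs the decorator, the manual `.to("cuda")` / `.to("cpu")` shuffling, or the try/finally guard. After this commit the handler reduces to roughly the following (signature trimmed to three arguments for brevity; `controllable_ui` and `float2pcm` are defined elsewhere in app.py):

def read(prompt, language, voice_seed):
    # No device bookkeeping here any more: the @spaces.GPU decorator on
    # ToucanTTSInterface.forward acquires and releases the GPU per call.
    sr, wav, fig = controllable_ui.read(prompt,
                                        language.split(" ")[-1].split("(")[1].split(")")[0],
                                        language.split(" ")[-1].split("(")[1].split(")")[0],
                                        voice_seed)
    return (sr, float2pcm(wav)), fig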