Spaces:

Flux9665
/

EnglishToucan

Running on Zero

File size: 3,535 Bytes

0ebcf15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20636e5
0ebcf15
23208c6
 
 
0ebcf15
 
 
23208c6
 
0ebcf15
 
 
 
 
 
 
 
 
 
 
23208c6
0ebcf15
 
23208c6
0ebcf15
 
 
 
23208c6
0ebcf15
 
 
 
23208c6
 
0ebcf15

import gradio as gr
import torch.cuda

from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm


class TTSWebUI:
    """Gradio front end around a ``ControllableInterface`` speech synthesizer.

    Constructing an instance builds the synthesis backend, assembles a
    ``gr.Interface`` whose callback is :meth:`read`, and immediately calls
    ``launch()`` on it.
    """

    def __init__(self, gpu_id="cpu", title="Controllable Text-to-Speech with IMS Toucan", article="", available_artificial_voices=1000):
        """Build the backend and the Gradio interface, then launch the interface.

        Args:
            gpu_id: Forwarded unchanged to ``ControllableInterface`` as ``gpu_id``.
            title: Title shown on the Gradio page.
            article: Forwarded to ``gr.Interface`` as ``article``.
            available_artificial_voices: Forwarded to ``ControllableInterface``
                and also used as the upper bound of the voice-seed slider.
        """
        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices)
        # The seed slider's maximum is available_artificial_voices, so the preferred
        # default of 279 is capped to stay inside the slider's range when fewer
        # voices are configured. With the default of 1000 this is still 279.
        default_voice_seed = min(279, available_artificial_voices)
        # The order of the input components must match the parameter order of
        # self.read, because Gradio passes the component values positionally.
        self.iface = gr.Interface(fn=self.read,
                                  inputs=[gr.Textbox(lines=2,
                                                     placeholder="write what you want the synthesis to read here...",
                                                     value="What I cannot create, I do not understand.",
                                                     label="Text input"),
                                          gr.Audio(type="filepath", show_label=True, container=True, label="Voice to Clone (if left empty, will use an artificial voice instead)"),
                                          gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.4, label="Prosody Creativity"),
                                          gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                                    value=default_voice_seed,
                                                    label="Random Seed for the artificial Voice"),
                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity of artificial Voice"),
                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth of artificial Voice")
                                          ],
                                  outputs=[gr.Audio(type="numpy", label="Speech"),
                                           gr.Image(label="Visualization")],
                                  title=title,
                                  theme="default",
                                  allow_flagging="never",
                                  article=article)
        self.iface.launch()

    def read(self,
             prompt,
             audio,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             emb2
             ):
        """Synthesize ``prompt`` through the backend; this is the Gradio callback.

        Args:
            prompt: Value of the "Text input" textbox.
            audio: Value of the "Voice to Clone" audio input; the component is
                declared with ``type="filepath"``.
            prosody_creativity: Value of the "Prosody Creativity" slider.
            duration_scaling_factor: Value of the "Duration Scale" slider.
            voice_seed: Value of the "Random Seed for the artificial Voice" slider.
            emb1: Value of the "Femininity / Masculinity of artificial Voice" slider.
            emb2: Value of the "Voice Depth of artificial Voice" slider.

        Returns:
            A pair ``((sr, pcm), fig)``. The first element feeds the "Speech"
            audio output, with ``pcm`` being ``float2pcm(wav)``; ``fig`` feeds
            the "Visualization" image output. ``sr`` is presumably the sample
            rate of ``wav`` -- confirm against ``ControllableInterface.read``.
        """
        # Note that the backend takes voice_seed before the prosody/duration
        # values, which differs from the order of the UI inputs.
        # NOTE(review): the literal arguments are fixed settings for backend
        # controls this UI does not expose; their meaning is defined by
        # ControllableInterface.read, which is not visible here.
        sr, wav, fig = self.controllable_ui.read(prompt,
                                                 audio,
                                                 voice_seed,
                                                 prosody_creativity,
                                                 duration_scaling_factor,
                                                 1.,
                                                 1.,
                                                 1.,
                                                 emb1,
                                                 emb2,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 -24.)
        return (sr, float2pcm(wav)), fig


if __name__ == '__main__':
    # Use CUDA when torch reports it as available, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    TTSWebUI(gpu_id=device)