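"""
Gradio demo for controllable multilingual text-to-speech with IMS Toucan.

A GAN over speaker embeddings provides an "artificial voice" for each random
seed, and sliders expose duration, pitch, energy, and embedding controls.
"""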
import os

import spaces  # Hugging Face Spaces helper; imported early for its side effects on Spaces hardware

from run_model_downloader import download_models

# Fetch the pretrained checkpoints on first startup.
if not os.path.exists("Models/ToucanTTS_Meta/best.pt"):
    download_models()

import multiprocessing

multiprocessing.set_start_method("spawn", force=True)

import gradio as gr
import torch

from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
from Utility.storage_config import MODELS_DIR
from Utility.utils import float2pcm
class ControllableInterface(torch.nn.Module):

    def __init__(self, available_artificial_voices=1000):
        super().__init__()
        # The TTS model and the GAN that generates artificial speaker embeddings.
        self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta", language="eng")
        self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device="cpu")
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""
        self.device = "cpu"
        self.model.to("cpu")
        self.model.device = "cpu"
        self.wgan.to("cpu")
        self.wgan.device = "cpu"
    def read(self,
             prompt,
             language,
             accent,
             voice_seed,
             duration_scaling_factor,
             pause_duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb_slider_1,
             emb_slider_2,
             emb_slider_3,
             emb_slider_4,
             emb_slider_5,
             emb_slider_6,
             loudness_in_db
             ):
        # Re-load the TTS interface only when the requested language changes.
        if self.current_language != language:
            self.model = ToucanTTSInterface(device="cpu", tts_model_path="Meta", language=language)
            self.current_language = language

        # Derive a speaker embedding from the seed and the six control sliders.
        self.wgan.set_latent(voice_seed)
        controllability_vector = torch.tensor([emb_slider_1,
                                               emb_slider_2,
                                               emb_slider_3,
                                               emb_slider_4,
                                               emb_slider_5,
                                               emb_slider_6], dtype=torch.float32)
        embedding = self.wgan.modify_embed(controllability_vector)
        self.model.set_utterance_embedding(embedding=embedding)

        # Guard against inputs that are too long for the demo.
        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > 1800:
            raise gr.Error("Your input text is too long for this demo, please use a shorter prompt.")

        print(prompt)  # log the request text
        wav, sr, fig = self.model(prompt,
                                  input_is_phones=False,
                                  duration_scaling_factor=duration_scaling_factor,
                                  pitch_variance_scale=pitch_variance_scale,
                                  energy_variance_scale=energy_variance_scale,
                                  pause_duration_scaling_factor=pause_duration_scaling_factor,
                                  return_plot_as_filepath=True,
                                  loudness_in_db=loudness_in_db)
        return sr, wav, fig
title = "🚧UNDER CONSTRUCTION🚧 Controllable Text-to-Speech for over 7000 Languages"
article = "Check out the IMS Toucan TTS Toolkit at https://github.com/DigitalPhonetics/IMS-Toucan"
available_artificial_voices = 1000

# Build the dropdown choices, e.g. "English Text (eng)", from the ISO-639-3 mapping.
path_to_iso_list = "Preprocessing/multilinguality/iso_to_fullname.json"
iso_to_name = load_json_from_path(path_to_iso_list)
text_selection = [f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name]

controllable_ui = ControllableInterface(available_artificial_voices=available_artificial_voices)
def read(prompt,
         language,
         voice_seed,
         duration_scaling_factor,
         pitch_variance_scale,
         energy_variance_scale,
         emb1,
         emb2
         ):
    # Extract the ISO code from a dropdown label such as "English Text (eng)".
    iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]
    with torch.no_grad():
        sr, wav, fig = controllable_ui.read(prompt,
                                            iso_code,  # language
                                            iso_code,  # accent
                                            voice_seed,
                                            duration_scaling_factor,
                                            1.,  # pause_duration_scaling_factor
                                            pitch_variance_scale,
                                            energy_variance_scale,
                                            emb1,
                                            emb2,
                                            0.,  # the remaining embedding sliders are not exposed in this demo
                                            0.,
                                            0.,
                                            0.,
                                            -24.)  # loudness_in_db
    return (sr, float2pcm(wav)), fig
iface = gr.Interface(fn=read,
                     inputs=[gr.Textbox(lines=2,
                                        placeholder="write what you want the synthesis to read here...",
                                        value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
                                        label="Text input"),
                             gr.Dropdown(text_selection,
                                         type="value",
                                         value='English Text (eng)',
                                         label="Select the Language of the Text (type on your keyboard to find it quickly)"),
                             gr.Slider(minimum=0, maximum=available_artificial_voices, step=1,
                                       value=279,
                                       label="Random Seed for the artificial Voice"),
                             gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Duration Scale"),
                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                             gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Femininity / Masculinity"),
                             gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
                             ],
                     outputs=[gr.Audio(type="numpy", label="Speech"),
                              gr.Image(label="Visualization")],
                     title=title,
                     theme="default",
                     allow_flagging="never",
                     article=article)

iface.launch()
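# A minimal sketch of querying this app programmatically once it is running,
# assuming the default local URL and the `gradio_client` package; the argument
# order mirrors the Interface inputs above:
#
#     from gradio_client import Client
#     client = Client("http://127.0.0.1:7860/")
#     audio, plot = client.predict(
#         "Hello world!",        # text input
#         "English Text (eng)",  # language dropdown value
#         279,                   # voice seed
#         1.0, 1.0, 1.0,         # duration / pitch / energy scales
#         0.0, 0.0,              # femininity-masculinity, voice depth
#     )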