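"""la vitci voksa: Lojban text-to-speech Gradio app built on VITS and Nix-TTS."""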
import os
import sys
from subprocess import call
from pathlib import Path  # only used by the commented-out download helper below

import gradio as gr
import numpy as np
import torch
def run_cmd(command):
    """Run a shell command, exiting the process on Ctrl-C."""
    try:
        # print(command)
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)
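# Build the Cython monotonic-align extension that VITS needs, then flatten the
# build output into vits/monotonic_align so it is importable in place.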
current = os.getcwd()
full = os.path.join(current, "vits", "monotonic_align")
os.chdir(full)
run_cmd("python3 setup.py build_ext --inplace")
run_cmd("mv vits/monotonic_align/* ./")
run_cmd("rm -rf vits")
# run_cmd(f"mv {current}/lfs/*.pth {current}/pretrained/")
# run_cmd("apt-get install espeak -y")
# run_cmd("gdown 'https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT'")
os.chdir(current)  # back to the project root
from lojban.lojban import lojban2ipa

sys.path.insert(0, './vits')
import vits.commons as commons
import vits.utils as utils
from vits.models import SynthesizerTrn
from vits.text import _clean_text
from vits.text.symbols import symbols

sys.path.insert(0, './nix_tts_simple')
from nix_tts_simple.tts import generate_voice
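# Map UI language names to language codes; "jbo" and "ipa" get special
# handling in text_to_sequence, everything else goes through the VITS cleaners.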
language_id_lookup = {
    "Lojban": "jbo",
    "Transcription": "ipa",
    "English": "en",
    "German": "de",
    "Greek": "el",
    "Spanish": "es",
    "Finnish": "fi",
    "Russian": "ru",
    "Hungarian": "hu",
    "Dutch": "nl",
    "French": "fr",
    "Polish": "pl",
    "Portuguese": "pt",
    "Italian": "it",
}
# def download_pretrained():
#     if not all(Path(file).exists() for file in ["pretrained_ljs.pth", "pretrained_vctk.pth"]):
#         url = "https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT"
#         gdown.download_folder(url, quiet=True, use_cookies=False)

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
def text_to_sequence(text, language, cleaner_names, mode):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
        language: "jbo" (Lojban), "ipa" (raw transcription), or a code handled by the cleaners
        cleaner_names: names of the cleaner functions to run the text through
        mode: transcription mode passed to lojban2ipa
    Returns:
        Tuple of the cleaned (IPA) text and the list of symbol IDs
    '''
    sequence = []
    if language == 'jbo':
        clean_text = lojban2ipa(text, mode)
    elif language == 'ipa':
        clean_text = text
    else:
        clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        # Skip characters outside the VITS symbol set rather than raising KeyError.
        if symbol not in _symbol_to_id:
            continue
        sequence.append(_symbol_to_id[symbol])
    return clean_text, sequence
def get_text(text, language, hps, mode):
    ipa_text, text_norm = text_to_sequence(
        text, language, hps.data.text_cleaners, mode)
    if hps.data.add_blank:
        # Interleave a blank token (id 0) between symbols, as the model expects.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm_tensor = torch.LongTensor(text_norm)
    return ipa_text, text_norm_tensor
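# Instantiate both VITS synthesizers and load their pretrained weights:
# single-speaker LJ Speech ("LJS") and multi-speaker VCTK.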
def load_checkpoints():
    hps = utils.get_hparams_from_file(current + "/vits/configs/ljs_base.json")
    model = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model)
    _ = model.eval()
    _ = utils.load_checkpoint(current + "/pretrained/vits/pretrained_ljs.pth", model, None)

    hps_vctk = utils.get_hparams_from_file(current + "/vits/configs/vctk_base.json")
    net_g_vctk = SynthesizerTrn(
        len(symbols),
        hps_vctk.data.filter_length // 2 + 1,
        hps_vctk.train.segment_size // hps_vctk.data.hop_length,
        n_speakers=hps_vctk.data.n_speakers,
        **hps_vctk.model)
    _ = net_g_vctk.eval()  # put the VCTK model (not the LJS one) into eval mode
    _ = utils.load_checkpoint(current + "/pretrained/vits/pretrained_vctk.pth", net_g_vctk, None)
    return model, hps, net_g_vctk, hps_vctk
def float2pcm(sig, dtype='int16'):
    """Convert floating point signal with a range from -1 to 1 to PCM.
    Any signal values outside the interval [-1.0, 1.0) are clipped.
    No dithering is used.
    Note that there are different possibilities for scaling floating
    point numbers to PCM numbers, this function implements just one of
    them. For an overview of alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input array, must have floating point type.
    dtype : data type, optional
        Desired (integer) data type.

    Returns
    -------
    numpy.ndarray
        Integer data, scaled and clipped to the range of the given
        *dtype*.

    See Also
    --------
    pcm2float, dtype
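
    Examples
    --------
    A quick sanity check (int16 spans -32768..32767, so 0.5 maps to 16384):

    >>> float2pcm(np.array([0.0, 0.5, -1.0]))
    array([     0,  16384, -32768], dtype=int16)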
""" | |
sig = np.asarray(sig) | |
if sig.dtype.kind != 'f': | |
raise TypeError("'sig' must be a float array") | |
dtype = np.dtype(dtype) | |
if dtype.kind not in 'iu': | |
raise TypeError("'dtype' must be an integer type") | |
i = np.iinfo(dtype) | |
abs_max = 2 ** (i.bits - 1) | |
offset = i.min + abs_max | |
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) | |
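# Dispatch to one of three backends: Nix-TTS (Lojban only, deterministic or
# stochastic decoder), the single-speaker LJS VITS model, or the multi-speaker
# VCTK VITS model, where `voice` is an integer VCTK speaker id.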
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
    if len(text.strip()) == 0:
        return []
    language = language.split()[0]
    # Fall back to Lojban for any language name missing from the lookup table.
    language = language_id_lookup.get(language, "jbo")
    if voice == 'Nix-Deterministic' and language == 'jbo':
        return generate_voice(lojban2ipa(text, 'nix'), current + "/pretrained/nix-tts/nix-ljspeech-v0.1")
    elif voice == 'Nix-Stochastic' and language == 'jbo':
        return generate_voice(lojban2ipa(text, 'nix'), current + "/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
    elif voice == 'LJS':
        ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
                                noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
        return [ipa_text, (hps.data.sampling_rate, float2pcm(audio))]
    else:
        ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            sid = torch.LongTensor([voice])  # VCTK speaker id
            audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                     noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
        return [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
# download_pretrained()
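# Load every checkpoint once at startup so each request only runs inference.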
model, hps, model_vctk, hps_vctk = load_checkpoints()
defaults = {
    "text": "coi munje",
    "language": "Lojban",
    "noise_scale": .667,
    "noise_scale_w": .8,
    "speed": 1.8,  # length_scale: larger values mean slower speech
    "voice": "LJS",
    # Full default row used to pad the example rows below.
    "example": ["", "Lojban", 0.667, 0.8, 1.8, "LJS"],
}
css = """ | |
h1 {font-size:200%;} | |
h2 {font-size:120%;} | |
a {color: #0020c5;text-decoration: underline;} | |
img {display: inline-block;height:32px;} | |
""" | |
def conditionally_hide_widgets(voice):
    visible = not str(voice).startswith("Nix")
    return gr.update(visible=visible), gr.update(visible=visible), gr.update(visible=visible)
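# Assemble the Gradio UI: inputs and synthesis controls on the left, the IPA
# transcription and generated audio on the right.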
with gr.Blocks(css=css) as demo:
    title = "<h1>la vitci voksa - <i><img src='/file/assets/jbolanci.png'/>Lojban text-to-speech</i></h1>"
    gr.HTML(title)
    description = "<h2>VITS & Nix-TTS text-to-speech adapted to Lojban. Join the <a href='https://discord.gg/BVm4EYR'>Lojban Discord live chat</a> to discuss further.</h2>"
    gr.HTML(description)
    article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/jaywalnut310/vits'>Github Repo</a></p>"
    gr.HTML(article)
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=4, value=defaults["text"], label="Input text",
                                    placeholder="add your text, or click one of the examples to load them")
            langs = gr.Radio([
                'Lojban',
                'English',
                'Transcription',
            ], value=defaults["language"], label="Language")
            voices = gr.Radio(["LJS", 0, 1, 2, 3, 4, "Nix-Deterministic", "Nix-Stochastic"],
                              value=defaults["voice"], label="Voice")
            noise_scale = gr.Slider(label="Noise scale", minimum=0, maximum=2,
                                    step=0.1, value=defaults["noise_scale"])
            noise_scale_w = gr.Slider(label="Noise scale W", minimum=0, maximum=2,
                                      step=0.1, value=defaults["noise_scale_w"])
            slowness = gr.Slider(label="Slowness", minimum=0.1, maximum=3,
                                 step=0.1, value=defaults["speed"])
            inputs = [input_text, langs, noise_scale, noise_scale_w, slowness, voices]
            # events
            vits_inputs = [noise_scale, noise_scale_w, slowness]
            voices.change(fn=conditionally_hide_widgets, inputs=voices, outputs=vits_inputs)
        with gr.Column():
            ipa_block = gr.Textbox(label="International Phonetic Alphabet")
            audio = gr.Audio(type="numpy", label="Output audio")
            outputs = [ipa_block, audio]
    btn = gr.Button("Vocalize")
    btn.click(fn=inference, inputs=inputs, outputs=outputs, api_name="cupra")
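    # Pad each example row with values from defaults["example"] so every row
    # supplies all six inputs expected by `inference`.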
    examples = [el + defaults["example"][len(el):] for el in [
        ["coi ro do ma nuzba", "Lojban"],
        ["mi djica lo nu do zvati ti", "Lojban", 0.667, 0.8, 1.8, 4],
        ["mu xagji sofybakni cu zvati le purdi", "Lojban", 0.667, 0.8, 1.8, "Nix-Deterministic"],
        ["ni'o le pa tirxu be me'e zo .teris. pu ki kansa le za'u pendo be le nei le ka xabju le foldi be loi spati", "Lojban"],
        [", miː dʒˈiːʃaː loːnʊuː doː zvˈaːtiː tiː.", "Transcription"],
        ["We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.", "English"],
    ]]
    gr.Examples(examples, inputs, fn=inference, outputs=outputs,
                cache_examples=True, run_on_click=True)

demo.launch(server_name="0.0.0.0")