import os
import sys
from subprocess import call
from pathlib import Path  # used by the commented-out download_pretrained() helper

import torch
import gradio as gr

from libs.audio import wav2ogg, float2pcm
def run_cmd(command):
    """Run a shell command, exiting cleanly on Ctrl-C."""
    try:
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)
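# Illustrative usage (a sketch; any shell one-liner works the same way):
#   run_cmd("python3 --version")
# call() blocks until the command exits, and shell=True hands the string to
# the shell, which is what lets globs like "vits/monotonic_align/*" below expand.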
current = os.getcwd()
full = current + "/vits/monotonic_align"
os.chdir(full)
run_cmd("python3 setup.py build_ext --inplace")
run_cmd("mv vits/monotonic_align/* ./")
run_cmd("rm -rf vits")
# run_cmd(f"mv {current}/lfs/*.pth {current}/pretrained/")
# run_cmd("apt-get install espeak -y")
# run_cmd("gdown 'https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT'")
os.chdir("../..")
from lojban.lojban import lojban2ipa

sys.path.insert(0, './vits')
import vits.commons as commons
import vits.utils as utils
from vits.models import SynthesizerTrn
from vits.text import _clean_text
from vits.text.symbols import symbols

sys.path.insert(0, './nix_tts_simple')
from nix_tts_simple.tts import generate_voice
language_id_lookup = {
    "Lojban": "jbo",
    "Transcription": "ipa",
    "English": "en",
    "German": "de",
    "Greek": "el",
    "Spanish": "es",
    "Finnish": "fi",
    "Russian": "ru",
    "Hungarian": "hu",
    "Dutch": "nl",
    "French": "fr",
    "Polish": "pl",
    "Portuguese": "pt",
    "Italian": "it",
}
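# A minimal sketch of how this table is consumed in inference() below: the UI
# passes a display name, and anything unrecognized falls back to Lojban.
#   language_id_lookup.get("English", "jbo")  # -> "en"
#   language_id_lookup.get("Klingon", "jbo")  # -> "jbo" (hypothetical input)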
# def download_pretrained():
#     if not all(Path(file).exists() for file in ["pretrained_ljs.pth", "pretrained_vctk.pth"]):
#         url = "https://drive.google.com/uc?id=1q86w74Ygw2hNzYP9cWkeClGT5X25PvBT"
#         gdown.download_folder(url, quiet=True, use_cookies=False)

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
def text_to_sequence(text, language, cleaner_names, mode):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
      language: language code from language_id_lookup ("jbo", "ipa", "en", ...)
      cleaner_names: names of the cleaner functions to run the text through
      mode: transliteration mode passed to lojban2ipa() (e.g. "VITS" or "nix")
    Returns:
      Tuple of the cleaned/IPA text and the list of integer symbol IDs
    '''
    sequence = []
    if language == 'jbo':
        clean_text = lojban2ipa(text, mode)
    elif language == 'ipa':
        clean_text = text
    else:
        clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
    return clean_text, sequence
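# Illustrative call (a sketch; the actual IDs depend on vits.text.symbols):
#   ipa, ids = text_to_sequence("coi", "jbo", ["english_cleaners2"], "VITS")
# Lojban input is first transliterated to IPA via lojban2ipa(), then each
# IPA symbol is looked up in _symbol_to_id one character at a time.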
def get_text(text, language, hps, mode):
    ipa_text, text_norm = text_to_sequence(
        text, language, hps.data.text_cleaners, mode)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm_tensor = torch.LongTensor(text_norm)
    return ipa_text, text_norm_tensor
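# With add_blank=True, commons.intersperse(ids, 0) pads a blank token (ID 0)
# around every symbol before the IDs become a LongTensor, e.g.:
#   [5, 9] -> [0, 5, 0, 9, 0]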
def load_checkpoints():
    # Single-speaker model trained on LJ Speech.
    hps = utils.get_hparams_from_file(current + "/vits/configs/ljs_base.json")
    model = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model)
    _ = model.eval()
    _ = utils.load_checkpoint(current + "/pretrained/vits/pretrained_ljs.pth", model, None)
    # Multi-speaker model trained on VCTK.
    hps_vctk = utils.get_hparams_from_file(current + "/vits/configs/vctk_base.json")
    net_g_vctk = SynthesizerTrn(
        len(symbols),
        hps_vctk.data.filter_length // 2 + 1,
        hps_vctk.train.segment_size // hps_vctk.data.hop_length,
        n_speakers=hps_vctk.data.n_speakers,
        **hps_vctk.model)
    _ = net_g_vctk.eval()  # was model.eval(); the VCTK net itself must be in eval mode
    _ = utils.load_checkpoint(current + "/pretrained/vits/pretrained_vctk.pth", net_g_vctk, None)
    return model, hps, net_g_vctk, hps_vctk
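# Two synthesizers stay resident: the single-speaker LJ Speech model (hps)
# and the multi-speaker VCTK model (hps_vctk, with n_speakers set). The
# trailing None tells utils.load_checkpoint() not to restore optimizer state.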
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice, file_format):
    if len(text.strip()) == 0:
        return []
    # Map the UI display name to a language code, defaulting to Lojban.
    language = language_id_lookup.get(language.split()[0], "jbo")
    result = []
    if voice == 'Nix-Deterministic' and language == 'jbo':
        result = generate_voice(lojban2ipa(text, 'nix'), current + "/pretrained/nix-tts/nix-ljspeech-v0.1")
    elif voice == 'Nix-Stochastic' and language == 'jbo':
        result = generate_voice(lojban2ipa(text, 'nix'), current + "/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
    elif voice == 'LJS':
        ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
                                noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
        result = [ipa_text, (hps.data.sampling_rate, audio)]  # was hps_vctk: the LJS model uses the LJS sampling rate
    else:
        ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            sid = torch.LongTensor([voice])  # integer speaker ID from the Voice radio
            audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                     noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
        result = [ipa_text, (hps_vctk.data.sampling_rate, audio)]
    if file_format == 'ogg':
        result = [result[0], wav2ogg(result[1][1], result[1][0], text, language)]
    else:
        result = [result[0], (result[1][0], float2pcm(result[1][1]))]
    return result
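# Illustrative call (a sketch mirroring the Gradio wiring below):
#   ipa, (rate, pcm) = inference("coi munje", "Lojban", 0.667, 0.8, 1.8, "LJS", "wav")
# For "wav" the audio tuple is (sampling_rate, int16 PCM via float2pcm);
# for "ogg" the buffer is handed to wav2ogg() from libs.audio instead.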
# download_pretrained()
model, hps, model_vctk, hps_vctk = load_checkpoints()

defaults = {
    "text": "coi munje",
    "language": "Lojban",
    "noise_scale": .667,
    "noise_scale_w": .8,
    "speed": 1.8,
    "voice": "LJS",
    "example": ["", "Lojban", 0.667, 0.8, 1.8, "LJS", "wav"],
}
css = """ | |
h1 {font-size:200%;} | |
h2 {font-size:120%;} | |
h2 a {color: #0020c5;text-decoration: underline;} | |
p a {text-decoration: underline;} | |
img {display: inline-block;height:32px;} | |
#velsku { | |
text-align: left; | |
margin: 0; | |
/* display: none; */ | |
/* justify-content: baseline; */ | |
/* align-items: flex-start; */ | |
padding: 0; | |
/* height: 28px; */ | |
width: 100%; | |
bottom: 0px; | |
left: 0px; | |
position: fixed; | |
background: white; | |
z-index: 10; | |
} | |
#velsku_sebenji { | |
padding: 0.1rem; | |
display: flex; | |
white-space: nowrap; | |
text-overflow: ellipsis; | |
box-shadow: 0 0 0 1px rgb(56 136 233), 0 0 0 2px rgb(34 87 213), | |
0 0 0 3px rgb(38 99 224), 0 0 0 4px rgb(25 65 165); | |
margin-top: 3px; | |
} | |
.velsku_pamei { | |
white-space: nowrap; | |
overflow: hidden; | |
text-overflow: ellipsis; | |
} | |
.velsku_pixra { | |
max-height: 20px; | |
margin-right: 0.2rem; | |
} | |
#velsku a { | |
color: #2b79e0; | |
text-decoration: none; | |
} | |
""" | |
def conditionally_hide_widgets(voice):
    # Nix-TTS does not use the VITS sampling knobs, so hide them for Nix voices.
    visible = not str(voice).startswith("Nix")
    return gr.update(visible=visible), gr.update(visible=visible), gr.update(visible=visible)
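# The three gr.update() values line up positionally with vits_inputs =
# [noise_scale, noise_scale_w, slowness] wired up below, so the VITS-only
# sliders disappear whenever a Nix-TTS voice is selected.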
title = "<h1>la vitci voksa - <i><img src='/file/assets/jbolanci.png'/>Lojban text-to-speech</i></h1>"
description = "<h2>VITS & Nix-TTS text-to-speech adapted to Lojban. Join the <a href='https://discord.gg/BVm4EYR'>Lojban Discord live chat</a> to discuss further.</h2>"
article = "<p style='text-align: center'><a href='https://github.com/jaywalnut310/vits'>VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/rendchevi/nix-tts'>Nix-TTS: Lightweight and End-to-end Text-to-Speech via Module-wise Distillation</a></p>"
scripts = """
async () => {
  const script = document.createElement("script");
  script.onload = () => console.log("socket.io loaded");
  script.src = "https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.5.1/socket.io.js";
  document.head.appendChild(script);
  document.addEventListener("DOMContentLoaded", () => {
    var socket1Chat_connected;
    var socket1Chat = io("wss://jbotcan.org:9091", {
      transports: ["polling", "websocket"],
    });
    socket1Chat.on("connect", function () {
      console.log(socket1Chat);
      socket1Chat_connected = true;
    });
    socket1Chat.on("connect_error", function () {
      console.log("1chat connection error");
    });
    function trimSocketChunk(text) {
      return text;
    }
    // Shared renderer for both live messages and the latest history entry.
    function renderMessage(i) {
      const msg = {
        d: trimSocketChunk(i.chunk),
        s: i.channelId,
        w: i.author,
      };
      const velsku = document.getElementById("velsku_sebenji");
      velsku.innerHTML = `<img src="https://la-lojban.github.io/sutysisku/pixra/nunsku.svg" class="velsku_pixra"/> <span class="velsku_pamei">[${msg.s}] ${msg.w}: ${msg.d}</span>`;
    }
    socket1Chat.on("sentFrom", function (data) {
      if (!socket1Chat_connected) return;
      renderMessage(data.data);
    });
    socket1Chat.on("history", function (data) {
      if (!socket1Chat_connected) return;
      const i = data.slice(-1)[0];
      if (!i) return;
      renderMessage(i);
    });
  });
}
"""
chat = """
<div id="velsku" class="noselect">
  <a id="velsku_sebenji" href="https://discord.gg/4KhzRzpmVr" target="_blank">
    <img src="https://la-lojban.github.io/sutysisku/pixra/nunsku.svg" class="velsku_pixra"/>
    Live chat on Discord
  </a>
</div>
"""
with gr.Blocks(css=css) as demo:
    gr.HTML(title)
    gr.HTML(description)
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=4, value=defaults["text"], label="Input text",
                                    placeholder="add your text, or click one of the examples to load them")
            langs = gr.Radio([
                'Lojban',
                'English',
                'Transcription',
            ], value=defaults["language"], label="Language")
            voices = gr.Radio(["LJS", 0, 1, 2, 3, 4, "Nix-Deterministic", "Nix-Stochastic"],
                              value=defaults["voice"], label="Voice")
            noise_scale = gr.Slider(label="Noise scale", minimum=0, maximum=2,
                                    step=0.1, value=defaults["noise_scale"])
            noise_scale_w = gr.Slider(label="Noise scale W", minimum=0, maximum=2,
                                      step=0.1, value=defaults["noise_scale_w"])
            slowness = gr.Slider(label="Slowness", minimum=0.1, maximum=3,
                                 step=0.1, value=defaults["speed"])
            file_format = gr.Radio(["wav", "ogg"], value="wav", label="File format")
            inputs = [input_text, langs, noise_scale, noise_scale_w, slowness, voices, file_format]
            # events
            vits_inputs = [noise_scale, noise_scale_w, slowness]
            voices.change(fn=conditionally_hide_widgets, inputs=voices, outputs=vits_inputs)
        with gr.Column():
            ipa_block = gr.Textbox(label="International Phonetic Alphabet")
            audio = gr.Audio(type="numpy", label="Output audio")
            outputs = [ipa_block, audio]
    btn = gr.Button("Vocalize")
    btn.click(fn=inference, inputs=inputs, outputs=outputs, api_name="cupra")
    # Pad each example row out to the full 7-element input list with defaults.
    examples = list(map(lambda el: el + defaults["example"][len(el):], [
        ["coi ro do ma nuzba", "Lojban"],
        ["mi djica lo nu do zvati ti", "Lojban", 0.667, 0.8, 1.8, 4],
        ["mu xagji sofybakni cu zvati le purdi", "Lojban", 0.667, 0.8, 1.8, "Nix-Deterministic"],
        ["ni'o le pa tirxu be me'e zo .teris. pu ki kansa le za'u pendo be le nei le ka xabju le foldi be loi spati", "Lojban"],
        [", miː dʒˈiːʃaː loːnʊuː doː zvˈaːtiː tiː.", "Transcription"],
        ["We propose VITS, Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech.", "English"],
    ]))
    gr.Examples(examples, inputs, fn=inference, outputs=outputs, cache_examples=True, run_on_click=True)
    gr.HTML(article)
    gr.HTML(chat)
    # gr.HTML(scripts)

demo.launch()