Spaces:
Running
Running
save Nix-Stochastic as ogg
Browse files- .gitignore +1 -0
- app.py +8 -42
- dev.sh +1 -0
- libs/audio.py +55 -0
- packages.txt +2 -1
.gitignore
CHANGED
|
@@ -6,6 +6,7 @@ __pycache__
|
|
| 6 |
build
|
| 7 |
.ipynb_checkpoints
|
| 8 |
.*.swp
|
|
|
|
| 9 |
lfs/*
|
| 10 |
cache/*
|
| 11 |
nix-tts/*
|
|
|
|
| 6 |
build
|
| 7 |
.ipynb_checkpoints
|
| 8 |
.*.swp
|
| 9 |
+
*.so
|
| 10 |
lfs/*
|
| 11 |
cache/*
|
| 12 |
nix-tts/*
|
app.py
CHANGED
|
@@ -15,6 +15,7 @@ from scipy.io.wavfile import write
|
|
| 15 |
import gradio as gr
|
| 16 |
import scipy.io.wavfile
|
| 17 |
import numpy as np
|
|
|
|
| 18 |
|
| 19 |
def run_cmd(command):
|
| 20 |
try:
|
|
@@ -133,53 +134,18 @@ def load_checkpoints():
|
|
| 133 |
|
| 134 |
return model, hps, net_g_vctk, hps_vctk
|
| 135 |
|
| 136 |
-
|
| 137 |
-
def float2pcm(sig, dtype='int16'):
|
| 138 |
-
"""Convert floating point signal with a range from -1 to 1 to PCM.
|
| 139 |
-
Any signal values outside the interval [-1.0, 1.0) are clipped.
|
| 140 |
-
No dithering is used.
|
| 141 |
-
Note that there are different possibilities for scaling floating
|
| 142 |
-
point numbers to PCM numbers, this function implements just one of
|
| 143 |
-
them. For an overview of alternatives see
|
| 144 |
-
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
|
| 145 |
-
Parameters
|
| 146 |
-
----------
|
| 147 |
-
sig : array_like
|
| 148 |
-
Input array, must have floating point type.
|
| 149 |
-
dtype : data type, optional
|
| 150 |
-
Desired (integer) data type.
|
| 151 |
-
Returns
|
| 152 |
-
-------
|
| 153 |
-
numpy.ndarray
|
| 154 |
-
Integer data, scaled and clipped to the range of the given
|
| 155 |
-
*dtype*.
|
| 156 |
-
See Also
|
| 157 |
-
--------
|
| 158 |
-
pcm2float, dtype
|
| 159 |
-
"""
|
| 160 |
-
sig = np.asarray(sig)
|
| 161 |
-
if sig.dtype.kind != 'f':
|
| 162 |
-
raise TypeError("'sig' must be a float array")
|
| 163 |
-
dtype = np.dtype(dtype)
|
| 164 |
-
if dtype.kind not in 'iu':
|
| 165 |
-
raise TypeError("'dtype' must be an integer type")
|
| 166 |
-
|
| 167 |
-
i = np.iinfo(dtype)
|
| 168 |
-
abs_max = 2 ** (i.bits - 1)
|
| 169 |
-
offset = i.min + abs_max
|
| 170 |
-
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
|
| 171 |
-
|
| 172 |
-
|
| 173 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
| 174 |
if len(text.strip())==0:
|
| 175 |
return []
|
| 176 |
language = language.split()[0]
|
| 177 |
language = language_id_lookup[language] if bool(
|
| 178 |
language_id_lookup[language]) else "jbo"
|
|
|
|
| 179 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
| 180 |
-
|
| 181 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
| 182 |
-
|
|
|
|
| 183 |
elif voice == 'LJS':
|
| 184 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
| 185 |
with torch.no_grad():
|
|
@@ -187,7 +153,7 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
| 187 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 188 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
| 189 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
| 190 |
-
|
| 191 |
else:
|
| 192 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
| 193 |
with torch.no_grad():
|
|
@@ -196,8 +162,8 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
| 196 |
sid = torch.LongTensor([voice])
|
| 197 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
| 198 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
| 199 |
-
|
| 200 |
-
|
| 201 |
|
| 202 |
# download_pretrained()
|
| 203 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
import scipy.io.wavfile
|
| 17 |
import numpy as np
|
| 18 |
+
from libs.audio import wav2ogg, float2pcm
|
| 19 |
|
| 20 |
def run_cmd(command):
|
| 21 |
try:
|
|
|
|
| 134 |
|
| 135 |
return model, hps, net_g_vctk, hps_vctk
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
| 138 |
if len(text.strip())==0:
|
| 139 |
return []
|
| 140 |
language = language.split()[0]
|
| 141 |
language = language_id_lookup[language] if bool(
|
| 142 |
language_id_lookup[language]) else "jbo"
|
| 143 |
+
result = []
|
| 144 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
| 145 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-v0.1")
|
| 146 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
| 147 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
|
| 148 |
+
result = [result[0], wav2ogg(result[1][1], result[1][0], text, language)]
|
| 149 |
elif voice == 'LJS':
|
| 150 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
| 151 |
with torch.no_grad():
|
|
|
|
| 153 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 154 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
| 155 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
| 156 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
| 157 |
else:
|
| 158 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
| 159 |
with torch.no_grad():
|
|
|
|
| 162 |
sid = torch.LongTensor([voice])
|
| 163 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
| 164 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
| 165 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
| 166 |
+
return result
|
| 167 |
|
| 168 |
# download_pretrained()
|
| 169 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
dev.sh
CHANGED
|
@@ -8,6 +8,7 @@ docker rm -f jboselvoha 2> /dev/null
|
|
| 8 |
# -p 7860:7860 \
|
| 9 |
# jboselvoha
|
| 10 |
docker run -d -it --name jboselvoha \
|
|
|
|
| 11 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
| 12 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
| 13 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
|
|
|
| 8 |
# -p 7860:7860 \
|
| 9 |
# jboselvoha
|
| 10 |
docker run -d -it --name jboselvoha \
|
| 11 |
+
-v $(pwd)/libs:/home/user/app/libs:Z \
|
| 12 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
| 13 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
| 14 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
libs/audio.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pydub
|
| 3 |
+
from re import sub
|
| 4 |
+
|
| 5 |
+
def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal in [-1.0, 1.0) to integer PCM.

    Values outside [-1.0, 1.0) are clipped to the target range; no
    dithering is applied.  This implements one of several common
    float-to-PCM scaling conventions; for alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input samples; must have a floating point dtype.
    dtype : data type, optional
        Desired signed or unsigned integer output type.

    Returns
    -------
    numpy.ndarray
        Samples scaled and clipped to the full range of *dtype*.

    See Also
    --------
    pcm2float, dtype
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    full_scale = 2 ** (info.bits - 1)
    # Shift so that unsigned targets are centred correctly (zero for signed).
    midpoint = info.min + full_scale
    scaled = samples * full_scale + midpoint
    return np.clip(scaled, info.min, info.max).astype(target)
|
| 39 |
+
|
| 40 |
+
def strip_text(text: str) -> str:
    """Return *text* with everything except ASCII letters, digits and spaces removed."""
    cleaned = sub(r"[^a-zA-Z0-9 ]", "", text)
    return cleaned
|
| 42 |
+
|
| 43 |
+
def wav2ogg(x, sr, text, language, normalized=True):
    """Encode a numpy audio signal as an Ogg Vorbis file under /tmp.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples; 1-D mono, or stereo with shape (n, 2).
    sr : int
        Sample rate in Hz.
    text : str
        Source text; sanitised via strip_text() and used in the file name.
    language : str
        Language tag, used as the file-name prefix.
    normalized : bool, optional
        When True (default), *x* holds floats in [-1, 1) and is scaled
        to signed 16-bit PCM; otherwise *x* is cast to int16 as-is.

    Returns
    -------
    str
        Path of the written .ogg file.
    """
    # Fix: the original printed the raw arguments before the docstring,
    # which both leaked debug output and demoted the docstring to a no-op
    # string statement; the docstring also claimed "MP3" for an Ogg export.
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # float samples in [-1, 1) -> full-scale 16-bit PCM
        y = np.int16(x * 2 ** 15)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    path = f"/tmp/{language}-{strip_text(text)}.ogg"
    song.export(path, format="ogg", codec="libvorbis")
    return path
|
packages.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
libsndfile1
|
| 2 |
-
espeak
|
|
|
|
|
|
| 1 |
libsndfile1
|
| 2 |
+
espeak
|
| 3 |
+
ffmpeg
|