Spaces:
Running
Running
save Nix-Stochastic as ogg
Browse files- .gitignore +1 -0
- app.py +8 -42
- dev.sh +1 -0
- libs/audio.py +55 -0
- packages.txt +2 -1
.gitignore
CHANGED
|
@@ -6,6 +6,7 @@ __pycache__
|
|
| 6 |
build
|
| 7 |
.ipynb_checkpoints
|
| 8 |
.*.swp
|
|
|
|
| 9 |
lfs/*
|
| 10 |
cache/*
|
| 11 |
nix-tts/*
|
|
|
|
| 6 |
build
|
| 7 |
.ipynb_checkpoints
|
| 8 |
.*.swp
|
| 9 |
+
*.so
|
| 10 |
lfs/*
|
| 11 |
cache/*
|
| 12 |
nix-tts/*
|
app.py
CHANGED
|
@@ -15,6 +15,7 @@ from scipy.io.wavfile import write
|
|
| 15 |
import gradio as gr
|
| 16 |
import scipy.io.wavfile
|
| 17 |
import numpy as np
|
|
|
|
| 18 |
|
| 19 |
def run_cmd(command):
|
| 20 |
try:
|
|
@@ -133,53 +134,18 @@ def load_checkpoints():
|
|
| 133 |
|
| 134 |
return model, hps, net_g_vctk, hps_vctk
|
| 135 |
|
| 136 |
-
|
| 137 |
-
def float2pcm(sig, dtype='int16'):
|
| 138 |
-
"""Convert floating point signal with a range from -1 to 1 to PCM.
|
| 139 |
-
Any signal values outside the interval [-1.0, 1.0) are clipped.
|
| 140 |
-
No dithering is used.
|
| 141 |
-
Note that there are different possibilities for scaling floating
|
| 142 |
-
point numbers to PCM numbers, this function implements just one of
|
| 143 |
-
them. For an overview of alternatives see
|
| 144 |
-
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
|
| 145 |
-
Parameters
|
| 146 |
-
----------
|
| 147 |
-
sig : array_like
|
| 148 |
-
Input array, must have floating point type.
|
| 149 |
-
dtype : data type, optional
|
| 150 |
-
Desired (integer) data type.
|
| 151 |
-
Returns
|
| 152 |
-
-------
|
| 153 |
-
numpy.ndarray
|
| 154 |
-
Integer data, scaled and clipped to the range of the given
|
| 155 |
-
*dtype*.
|
| 156 |
-
See Also
|
| 157 |
-
--------
|
| 158 |
-
pcm2float, dtype
|
| 159 |
-
"""
|
| 160 |
-
sig = np.asarray(sig)
|
| 161 |
-
if sig.dtype.kind != 'f':
|
| 162 |
-
raise TypeError("'sig' must be a float array")
|
| 163 |
-
dtype = np.dtype(dtype)
|
| 164 |
-
if dtype.kind not in 'iu':
|
| 165 |
-
raise TypeError("'dtype' must be an integer type")
|
| 166 |
-
|
| 167 |
-
i = np.iinfo(dtype)
|
| 168 |
-
abs_max = 2 ** (i.bits - 1)
|
| 169 |
-
offset = i.min + abs_max
|
| 170 |
-
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
|
| 171 |
-
|
| 172 |
-
|
| 173 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
| 174 |
if len(text.strip())==0:
|
| 175 |
return []
|
| 176 |
language = language.split()[0]
|
| 177 |
language = language_id_lookup[language] if bool(
|
| 178 |
language_id_lookup[language]) else "jbo"
|
|
|
|
| 179 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
| 180 |
-
|
| 181 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
| 182 |
-
|
|
|
|
| 183 |
elif voice == 'LJS':
|
| 184 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
| 185 |
with torch.no_grad():
|
|
@@ -187,7 +153,7 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
| 187 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 188 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
| 189 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
| 190 |
-
|
| 191 |
else:
|
| 192 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
| 193 |
with torch.no_grad():
|
|
@@ -196,8 +162,8 @@ def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
|
| 196 |
sid = torch.LongTensor([voice])
|
| 197 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
| 198 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
| 199 |
-
|
| 200 |
-
|
| 201 |
|
| 202 |
# download_pretrained()
|
| 203 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
import scipy.io.wavfile
|
| 17 |
import numpy as np
|
| 18 |
+
from libs.audio import wav2ogg, float2pcm
|
| 19 |
|
| 20 |
def run_cmd(command):
|
| 21 |
try:
|
|
|
|
| 134 |
|
| 135 |
return model, hps, net_g_vctk, hps_vctk
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
def inference(text, language, noise_scale, noise_scale_w, length_scale, voice):
|
| 138 |
if len(text.strip())==0:
|
| 139 |
return []
|
| 140 |
language = language.split()[0]
|
| 141 |
language = language_id_lookup[language] if bool(
|
| 142 |
language_id_lookup[language]) else "jbo"
|
| 143 |
+
result = []
|
| 144 |
if voice == 'Nix-Deterministic' and language == 'jbo':
|
| 145 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-v0.1")
|
| 146 |
elif voice == 'Nix-Stochastic' and language == 'jbo':
|
| 147 |
+
result = generate_voice(lojban2ipa(text,'nix'), current+"/pretrained/nix-tts/nix-ljspeech-sdp-v0.1")
|
| 148 |
+
result = [result[0], wav2ogg(result[1][1], result[1][0], text, language)]
|
| 149 |
elif voice == 'LJS':
|
| 150 |
ipa_text, stn_tst = get_text(text, language, hps, mode="VITS")
|
| 151 |
with torch.no_grad():
|
|
|
|
| 153 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
| 154 |
audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale,
|
| 155 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
|
| 156 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
| 157 |
else:
|
| 158 |
ipa_text, stn_tst = get_text(text, language, hps_vctk, mode="VITS")
|
| 159 |
with torch.no_grad():
|
|
|
|
| 162 |
sid = torch.LongTensor([voice])
|
| 163 |
audio = model_vctk.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
| 164 |
noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
| 165 |
+
result = [ipa_text, (hps_vctk.data.sampling_rate, float2pcm(audio))]
|
| 166 |
+
return result
|
| 167 |
|
| 168 |
# download_pretrained()
|
| 169 |
model, hps, model_vctk, hps_vctk = load_checkpoints()
|
dev.sh
CHANGED
|
@@ -8,6 +8,7 @@ docker rm -f jboselvoha 2> /dev/null
|
|
| 8 |
# -p 7860:7860 \
|
| 9 |
# jboselvoha
|
| 10 |
docker run -d -it --name jboselvoha \
|
|
|
|
| 11 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
| 12 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
| 13 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
|
|
|
| 8 |
# -p 7860:7860 \
|
| 9 |
# jboselvoha
|
| 10 |
docker run -d -it --name jboselvoha \
|
| 11 |
+
-v $(pwd)/libs:/home/user/app/libs:Z \
|
| 12 |
-v $(pwd)/assets:/home/user/app/assets:Z \
|
| 13 |
-v $(pwd)/pretrained/nix-tts:/home/user/app/pretrained/nix-tts/:Z \
|
| 14 |
-v $(pwd)/lfs/vits:/home/user/app/pretrained/vits/:Z \
|
libs/audio.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pydub
|
| 3 |
+
from re import sub
|
| 4 |
+
|
| 5 |
+
def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal in [-1.0, 1.0) to integer PCM.

    Values outside [-1.0, 1.0) are clipped to the target range; no
    dithering is applied.  This implements one of several common
    float-to-PCM scaling conventions; for alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input samples; must have a floating point dtype.
    dtype : data type, optional
        Desired signed or unsigned integer output type.

    Returns
    -------
    numpy.ndarray
        Samples scaled and clipped to the full range of *dtype*.

    See Also
    --------
    pcm2float, dtype
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    info = np.iinfo(target)
    full_scale = 2 ** (info.bits - 1)
    # Shift so that unsigned targets are centred correctly (zero for signed).
    midpoint = info.min + full_scale
    scaled = samples * full_scale + midpoint
    return np.clip(scaled, info.min, info.max).astype(target)
|
| 39 |
+
|
| 40 |
+
def strip_text(text: str) -> str:
    """Return *text* with everything except ASCII letters, digits and spaces removed."""
    cleaned = sub(r"[^a-zA-Z0-9 ]", "", text)
    return cleaned
|
| 42 |
+
|
| 43 |
+
def wav2ogg(x, sr, text, language, normalized=True):
    """Encode a numpy audio signal as an Ogg Vorbis file under /tmp.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples; 1-D mono, or stereo with shape (n, 2).
    sr : int
        Sample rate in Hz.
    text : str
        Source text; sanitised via strip_text() and used in the file name.
    language : str
        Language tag, used as the file-name prefix.
    normalized : bool, optional
        When True (default), *x* holds floats in [-1, 1) and is scaled
        to signed 16-bit PCM; otherwise *x* is cast to int16 as-is.

    Returns
    -------
    str
        Path of the written .ogg file.
    """
    # Fix: the original printed the raw arguments before the docstring,
    # which both leaked debug output and demoted the docstring to a no-op
    # string statement; the docstring also claimed "MP3" for an Ogg export.
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:  # float samples in [-1, 1) -> full-scale 16-bit PCM
        y = np.int16(x * 2 ** 15)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    path = f"/tmp/{language}-{strip_text(text)}.ogg"
    song.export(path, format="ogg", codec="libvorbis")
    return path
|
packages.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
libsndfile1
|
| 2 |
-
espeak
|
|
|
|
|
|
| 1 |
libsndfile1
|
| 2 |
+
espeak
|
| 3 |
+
ffmpeg
|