Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files
- app.py +33 -45
- requirements.txt +0 -1
app.py
CHANGED
@@ -2,7 +2,6 @@ from huggingface_hub import snapshot_download
|
|
2 |
from katsu import Katsu
|
3 |
from models import build_model
|
4 |
import gradio as gr
|
5 |
-
import noisereduce as nr
|
6 |
import numpy as np
|
7 |
import os
|
8 |
import phonemizer
|
@@ -112,33 +111,15 @@ def tokenize(ps):
|
|
112 |
# ⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.
|
113 |
CHOICES = {
|
114 |
'🇺🇸 🚺 American Female ⭐': 'af',
|
115 |
-
'🇺🇸 🚺 American Female 1': 'af_1',
|
116 |
-
'🇺🇸 🚺 Alloy 🧪': 'af_alloy',
|
117 |
'🇺🇸 🚺 Bella': 'af_bella',
|
118 |
-
'🇺🇸 🚺 Jessica 🧪': 'af_jessica',
|
119 |
-
'🇺🇸 🚺 Nicole': 'af_nicole',
|
120 |
-
'🇺🇸 🚺 Nova 🧪': 'af_nova',
|
121 |
-
'🇺🇸 🚺 River 🧪': 'af_river',
|
122 |
'🇺🇸 🚺 Sarah': 'af_sarah',
|
123 |
'🇺🇸 🚺 Sky 🧪': 'af_sky',
|
124 |
-
'🇺🇸 🚹 Adam': 'am_adam',
|
125 |
-
'🇺🇸 🚹 Echo 🧪': 'am_echo',
|
126 |
-
'🇺🇸 🚹 Eric 🧪': 'am_eric',
|
127 |
-
'🇺🇸 🚹 Liam 🧪': 'am_liam',
|
128 |
'🇺🇸 🚹 Michael': 'am_michael',
|
129 |
-
'
|
130 |
-
'
|
131 |
-
'🇬🇧 🚺 Alice 🧪': 'bf_alice',
|
132 |
-
'🇬🇧 🚺 Lily 🧪': 'bf_lily',
|
133 |
-
'🇬🇧 🚹 British Male 0': 'bm_0',
|
134 |
-
'🇬🇧 🚹 British Male 1': 'bm_1',
|
135 |
-
'🇬🇧 🚹 British Male 2': 'bm_2',
|
136 |
-
'🇬🇧 🚹 Daniel 🧪': 'bm_daniel',
|
137 |
-
'🇬🇧 🚹 Fable 🧪': 'bm_fable',
|
138 |
-
'🇬🇧 🚹 George 🧪': 'bm_george',
|
139 |
-
'🇯🇵 🚺 Japanese Female 0': 'jf_0',
|
140 |
}
|
141 |
-
VOICES = {k: torch.load(os.path.join(snapshot, '
|
142 |
|
143 |
np_log_99 = np.log(99)
|
144 |
def s_curve(p):
|
@@ -155,7 +136,7 @@ SAMPLE_RATE = 24000
|
|
155 |
@spaces.GPU(duration=10)
|
156 |
@torch.no_grad()
|
157 |
def forward(tokens, voice, speed):
|
158 |
-
ref_s = VOICES[voice]
|
159 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
160 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
161 |
text_mask = length_to_mask(input_lengths).to(device)
|
@@ -178,7 +159,7 @@ def forward(tokens, voice, speed):
|
|
178 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
179 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
180 |
|
181 |
-
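def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):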
|
182 |
if voice not in VOICES:
|
183 |
# Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
184 |
voice = 'af'
|
@@ -194,8 +175,6 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
|
|
194 |
except gr.exceptions.Error as e:
|
195 |
raise gr.Error(e)
|
196 |
return (None, '')
|
197 |
-
if reduce_noise > 0:
|
198 |
-
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
199 |
opening_cut = int(opening_cut / speed)
|
200 |
if opening_cut > 0:
|
201 |
out = out[opening_cut:]
|
@@ -216,6 +195,9 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
|
|
216 |
out = np.concatenate([out, np.zeros(pad_after)])
|
217 |
return ((SAMPLE_RATE, out), ps)
|
218 |
|
|
|
|
|
|
|
219 |
with gr.Blocks() as basic_tts:
|
220 |
with gr.Row():
|
221 |
gr.Markdown('Generate speech for one segment of text (up to 510 tokens) using Kokoro, a TTS model with 80 million parameters.')
|
@@ -234,12 +216,12 @@ with gr.Blocks() as basic_tts:
|
|
234 |
phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
|
235 |
phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
|
236 |
with gr.Column():
|
237 |
-
audio = gr.Audio(interactive=False, label='Output Audio')
|
238 |
with gr.Accordion('Output Tokens', open=True):
|
239 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
240 |
with gr.Accordion('Audio Settings', open=False):
|
241 |
with gr.Row():
|
242 |
-
|
243 |
with gr.Row():
|
244 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
245 |
with gr.Row():
|
@@ -257,15 +239,18 @@ with gr.Blocks() as basic_tts:
|
|
257 |
pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
|
258 |
with gr.Column():
|
259 |
pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
|
260 |
-
|
|
|
|
|
261 |
|
262 |
@spaces.GPU
|
263 |
@torch.no_grad()
|
264 |
def lf_forward(token_lists, voice, speed):
|
265 |
-
|
266 |
-
s = ref_s[:, 128:]
|
267 |
outs = []
|
268 |
for tokens in token_lists:
|
|
|
|
|
269 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
270 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
271 |
text_mask = length_to_mask(input_lengths).to(device)
|
@@ -340,7 +325,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
|
|
340 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
341 |
return [(i, *row) for i, row in enumerate(segments)]
|
342 |
|
343 |
-
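def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000, pad_between=10000):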
|
344 |
token_lists = list(map(tokenize, segments['Tokens']))
|
345 |
wavs = []
|
346 |
opening_cut = int(opening_cut / speed)
|
@@ -357,8 +342,6 @@ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000,
|
|
357 |
raise gr.Error(e)
|
358 |
break
|
359 |
for out in outs:
|
360 |
-
if reduce_noise > 0:
|
361 |
-
out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
|
362 |
if opening_cut > 0:
|
363 |
out = out[opening_cut:]
|
364 |
if closing_cut > 0:
|
@@ -415,8 +398,6 @@ with gr.Blocks() as lf_tts:
|
|
415 |
with gr.Column():
|
416 |
audio = gr.Audio(interactive=False, label='Output Audio')
|
417 |
with gr.Accordion('Audio Settings', open=False):
|
418 |
-
with gr.Row():
|
419 |
-
reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
|
420 |
with gr.Row():
|
421 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
422 |
with gr.Row():
|
@@ -440,7 +421,7 @@ with gr.Blocks() as lf_tts:
|
|
440 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
441 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
442 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
443 |
-
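generate_btn.click(lf_generate, inputs=[segments, voice, speed, reduce_noise, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])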
|
444 |
|
445 |
with gr.Blocks() as about:
|
446 |
gr.Markdown("""
|
@@ -453,11 +434,6 @@ The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](http
|
|
453 |
Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
|
454 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
455 |
|
456 |
-
### Updates
|
457 |
-
This Space and the underlying Kokoro model are both under development and subject to change.<br/>
|
458 |
-
Last model update: 2024 Nov 15<br/>
|
459 |
-
Model trained by: Raven (@rzvzn on Discord)
|
460 |
-
|
461 |
### Licenses
|
462 |
Inference code: MIT<br/>
|
463 |
espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
|
@@ -471,6 +447,9 @@ Random Japanese texts: CC0 public domain<sup>[6]</sup>
|
|
471 |
4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
|
472 |
5. Quotable Data | https://github.com/quotable-io/data/blob/master/data/quotes.json
|
473 |
6. Common Voice Japanese sentences | https://github.com/common-voice/common-voice/tree/main/server/data/ja
|
|
|
|
|
|
|
474 |
""")
|
475 |
|
476 |
with gr.Blocks() as api_info:
|
@@ -499,10 +478,19 @@ print(out_ps)
|
|
499 |
Note that this Space and the underlying Kokoro model are both under development and subject to change. Reliability is not guaranteed. Hugging Face and/or Gradio might enforce their own rate limits.
|
500 |
""")
|
501 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
502 |
with gr.Blocks() as app:
|
503 |
gr.TabbedInterface(
|
504 |
-
[basic_tts, lf_tts, about, api_info],
|
505 |
-
['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API'],
|
506 |
)
|
507 |
|
508 |
if __name__ == '__main__':
|
|
|
2 |
from katsu import Katsu
|
3 |
from models import build_model
|
4 |
import gradio as gr
|
|
|
5 |
import numpy as np
|
6 |
import os
|
7 |
import phonemizer
|
|
|
111 |
# ⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.
|
112 |
CHOICES = {
|
113 |
'🇺🇸 🚺 American Female ⭐': 'af',
|
|
|
|
|
114 |
'🇺🇸 🚺 Bella': 'af_bella',
|
|
|
|
|
|
|
|
|
115 |
'🇺🇸 🚺 Sarah': 'af_sarah',
|
116 |
'🇺🇸 🚺 Sky 🧪': 'af_sky',
|
117 |
+
'🇺🇸 🚹 Adam 🧪': 'am_adam',
|
|
|
|
|
|
|
118 |
'🇺🇸 🚹 Michael': 'am_michael',
|
119 |
+
'🇬🇧 🚹 Lewis': 'bm_lewis',
|
120 |
+
'🇯🇵 🚺 Japanese Female 🧪': 'jf_0',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
}
|
122 |
+
VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
|
123 |
|
124 |
np_log_99 = np.log(99)
|
125 |
def s_curve(p):
|
|
|
136 |
@spaces.GPU(duration=10)
|
137 |
@torch.no_grad()
|
138 |
def forward(tokens, voice, speed):
|
139 |
+
ref_s = VOICES[voice][len(tokens)]
|
140 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
141 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
142 |
text_mask = length_to_mask(input_lengths).to(device)
|
|
|
159 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
160 |
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
161 |
|
162 |
+
def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
|
163 |
if voice not in VOICES:
|
164 |
# Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
|
165 |
voice = 'af'
|
|
|
175 |
except gr.exceptions.Error as e:
|
176 |
raise gr.Error(e)
|
177 |
return (None, '')
|
|
|
|
|
178 |
opening_cut = int(opening_cut / speed)
|
179 |
if opening_cut > 0:
|
180 |
out = out[opening_cut:]
|
|
|
195 |
out = np.concatenate([out, np.zeros(pad_after)])
|
196 |
return ((SAMPLE_RATE, out), ps)
|
197 |
|
198 |
+
def toggle_autoplay(autoplay):
|
199 |
+
return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
|
200 |
+
|
201 |
with gr.Blocks() as basic_tts:
|
202 |
with gr.Row():
|
203 |
gr.Markdown('Generate speech for one segment of text (up to 510 tokens) using Kokoro, a TTS model with 80 million parameters.')
|
|
|
216 |
phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
|
217 |
phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
|
218 |
with gr.Column():
|
219 |
+
audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
|
220 |
with gr.Accordion('Output Tokens', open=True):
|
221 |
out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
|
222 |
with gr.Accordion('Audio Settings', open=False):
|
223 |
with gr.Row():
|
224 |
+
autoplay = gr.Checkbox(value=True, label='Autoplay')
|
225 |
with gr.Row():
|
226 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
227 |
with gr.Row():
|
|
|
239 |
pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
|
240 |
with gr.Column():
|
241 |
pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
|
242 |
+
autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
|
243 |
+
text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
|
244 |
+
generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
|
245 |
|
246 |
@spaces.GPU
|
247 |
@torch.no_grad()
|
248 |
def lf_forward(token_lists, voice, speed):
|
249 |
+
voicepack = VOICES[voice]
|
|
|
250 |
outs = []
|
251 |
for tokens in token_lists:
|
252 |
+
ref_s = voicepack[len(tokens)]
|
253 |
+
s = ref_s[:, 128:]
|
254 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
255 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
256 |
text_mask = length_to_mask(input_lengths).to(device)
|
|
|
325 |
segments = [row for t in texts for row in recursive_split(t, voice)]
|
326 |
return [(i, *row) for i, row in enumerate(segments)]
|
327 |
|
328 |
+
def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000, pad_between=10000):
|
329 |
token_lists = list(map(tokenize, segments['Tokens']))
|
330 |
wavs = []
|
331 |
opening_cut = int(opening_cut / speed)
|
|
|
342 |
raise gr.Error(e)
|
343 |
break
|
344 |
for out in outs:
|
|
|
|
|
345 |
if opening_cut > 0:
|
346 |
out = out[opening_cut:]
|
347 |
if closing_cut > 0:
|
|
|
398 |
with gr.Column():
|
399 |
audio = gr.Audio(interactive=False, label='Output Audio')
|
400 |
with gr.Accordion('Audio Settings', open=False):
|
|
|
|
|
401 |
with gr.Row():
|
402 |
speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
|
403 |
with gr.Row():
|
|
|
421 |
segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
|
422 |
segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
|
423 |
segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
|
424 |
+
generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
|
425 |
|
426 |
with gr.Blocks() as about:
|
427 |
gr.Markdown("""
|
|
|
434 |
Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
|
435 |
The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
|
436 |
|
|
|
|
|
|
|
|
|
|
|
437 |
### Licenses
|
438 |
Inference code: MIT<br/>
|
439 |
espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
|
|
|
447 |
4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
|
448 |
5. Quotable Data | https://github.com/quotable-io/data/blob/master/data/quotes.json
|
449 |
6. Common Voice Japanese sentences | https://github.com/common-voice/common-voice/tree/main/server/data/ja
|
450 |
+
|
451 |
+
### Contact
|
452 |
+
@rzvzn on Discord
|
453 |
""")
|
454 |
|
455 |
with gr.Blocks() as api_info:
|
|
|
478 |
Note that this Space and the underlying Kokoro model are both under development and subject to change. Reliability is not guaranteed. Hugging Face and/or Gradio might enforce their own rate limits.
|
479 |
""")
|
480 |
|
481 |
+
with gr.Blocks() as version_info:
|
482 |
+
gr.Markdown("""
|
483 |
+
| Model Version | Date | Validation losses (mel/dur/f0) |
|
484 |
+
| ------- | ---- | ------------------------------ |
|
485 |
+
| v0.19 | 2024 Nov 22 | 0.261 / 0.627 / 1.897 |
|
486 |
+
| v0.16 | 2024 Nov 15 | 0.263 / 0.646 / 1.934 |
|
487 |
+
| v0.14 | 2024 Nov 12 | 0.262 / 0.642 / 1.889 |
|
488 |
+
""")
|
489 |
+
|
490 |
with gr.Blocks() as app:
|
491 |
gr.TabbedInterface(
|
492 |
+
[basic_tts, lf_tts, about, api_info, version_info],
|
493 |
+
['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API', '📝 Version History'],
|
494 |
)
|
495 |
|
496 |
if __name__ == '__main__':
|
requirements.txt
CHANGED
@@ -2,7 +2,6 @@ fugashi
|
|
2 |
gradio
|
3 |
mojimoji
|
4 |
munch
|
5 |
-
noisereduce
|
6 |
phonemizer
|
7 |
pypdf
|
8 |
scipy
|
|
|
2 |
gradio
|
3 |
mojimoji
|
4 |
munch
|
|
|
5 |
phonemizer
|
6 |
pypdf
|
7 |
scipy
|