Update app.py

app.py CHANGED
@@ -8,13 +8,11 @@ from data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-print('debug1')
 from edit_utils_zh import parse_edit_zh
 from edit_utils_en import parse_edit_en
 from edit_utils_zh import parse_tts_zh
 from edit_utils_en import parse_tts_en
 from inference_scale import inference_one_sample
-print('debug2')
 import librosa
 import soundfile as sf
 from models import ssr
@@ -25,7 +23,6 @@ import uuid
 import spaces
 import nltk
 nltk.download('punkt')
-print('debug3')
 
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
@@ -460,101 +457,101 @@ def get_app():
                                           choices=[None, "base.en", "small.en", "medium.en", "large"])
         align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
-[95 lines removed; their content is not rendered in this diff view]
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
+                with gr.Group():
+                    original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
+                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
+                    with gr.Accordion("Word start time", open=False):
+                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
+                    with gr.Accordion("Word end time", open=False):
+                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
+
+                transcribe_btn = gr.Button(value="Transcribe")
+                align_btn = gr.Button(value="Align")
+
+            with gr.Column(scale=3):
+                with gr.Group():
+                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
+
+                with gr.Row():
+                    mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
+
+                run_btn = gr.Button(value="Run")
+
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio")
+                with gr.Accordion("Inference transcript", open=False):
+                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
+                                                      info="Inference was performed on this transcript.")
+                with gr.Group(visible=False) as long_tts_sentence_editor:
+                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
+                                                    info="Select sentence you want to regenerate")
+                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
+                    rerun_btn = gr.Button(value="Rerun")
+
+        with gr.Row():
+            with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
+                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
+                                           info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
+                seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
+                kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
+                                   info="set to 0 to use less VRAM, but with slower inference")
+                aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
+                                    info="set to 1 to use cfg")
+                cfg_coef = gr.Number(label="cfg_coef", value=1.5,
+                                     info="cfg guidance scale, 1.5 is a good value")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
+                top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
+
+        success_output = gr.HTML()
+        audio_tensors = gr.State()
+        transcribe_state = gr.State(value={"words_info": demo_words_info})
+
+        load_models_btn.click(fn=load_models,
+                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
+                              outputs=[models_selector])
+
+
+        transcribe_btn.click(fn=transcribe,
+                             inputs=[seed, input_audio],
+                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+        align_btn.click(fn=align,
+                        inputs=[seed, original_transcript, input_audio],
+                        outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+
+        run_btn.click(fn=run,
+                      inputs=[
+                          seed, sub_amount, ssrspeech_model_choice,
+                          codec_audio_sr, codec_sr,
+                          top_k, top_p, temperature,
+                          stop_repetition,
+                          kvcache, silence_tokens, aug_text, cfg_coef,
+                          input_audio, transcribe_state, original_transcript, transcript,
+                          mode, sentence_selector, audio_tensors
+                      ],
+                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
+
+        sentence_selector.change(fn=load_sentence,
+                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
+                                 outputs=[sentence_audio])
+        rerun_btn.click(fn=run,
+                        inputs=[
+                            seed, sub_amount, ssrspeech_model_choice,
+                            codec_audio_sr, codec_sr,
+                            top_k, top_p, temperature,
+                            stop_repetition,
+                            kvcache, silence_tokens, aug_text, cfg_coef,
+                            input_audio, transcribe_state, original_transcript, transcript,
+                            gr.State(value="Rerun"), sentence_selector, audio_tensors
+                        ],
+                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
 
     return app
 
@@ -577,6 +574,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
 
     app = get_app()
-    print('debug5')
     app.queue().launch(share=args.share, server_port=args.port)
-    print('debug6')