OpenSound committed
Commit 4927550 · 1 Parent(s): 97bf543

Update app.py

Files changed (1)
  1. app.py +95 -100
app.py CHANGED
@@ -8,13 +8,11 @@ from data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-print('debug1')
 from edit_utils_zh import parse_edit_zh
 from edit_utils_en import parse_edit_en
 from edit_utils_zh import parse_tts_zh
 from edit_utils_en import parse_tts_en
 from inference_scale import inference_one_sample
-print('debug2')
 import librosa
 import soundfile as sf
 from models import ssr
@@ -25,7 +23,6 @@ import uuid
 import spaces
 import nltk
 nltk.download('punkt')
-print('debug3')
 
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
@@ -460,101 +457,101 @@ def get_app():
                                                   choices=[None, "base.en", "small.en", "medium.en", "large"])
             align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
-        # with gr.Row():
-        #     with gr.Column(scale=2):
-        #         input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
-        #         with gr.Group():
-        #             original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-        #                                              info="Use whisperx model to get the transcript. Fix and align it if necessary.")
-        #             with gr.Accordion("Word start time", open=False):
-        #                 transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-        #             with gr.Accordion("Word end time", open=False):
-        #                 transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
-
-        #         transcribe_btn = gr.Button(value="Transcribe")
-        #         align_btn = gr.Button(value="Align")
-
-        #     with gr.Column(scale=3):
-        #         with gr.Group():
-        #             transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
-
-        #             with gr.Row():
-        #                 mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
-
-        #             run_btn = gr.Button(value="Run")
-
-        #     with gr.Column(scale=2):
-        #         output_audio = gr.Audio(label="Output Audio")
-        #         with gr.Accordion("Inference transcript", open=False):
-        #             inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
-        #                                               info="Inference was performed on this transcript.")
-        #         with gr.Group(visible=False) as long_tts_sentence_editor:
-        #             sentence_selector = gr.Dropdown(label="Sentence", value=None,
-        #                                             info="Select sentence you want to regenerate")
-        #             sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-        #             rerun_btn = gr.Button(value="Rerun")
-
-        # with gr.Row():
-        #     with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
-        #         stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
-        #                                    info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
-        #         seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
-        #         kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
-        #                            info="set to 0 to use less VRAM, but with slower inference")
-        #         aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
-        #                             info="set to 1 to use cfg")
-        #         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
-        #                              info="cfg guidance scale, 1.5 is a good value")
-        #         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
-        #         top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
-        #         temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
-        #         top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-        #         codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-        #         codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
-        #         silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
-
-        # success_output = gr.HTML()
-        # audio_tensors = gr.State()
-        # transcribe_state = gr.State(value={"words_info": demo_words_info})
-
-        # load_models_btn.click(fn=load_models,
-        #                       inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
-        #                       outputs=[models_selector])
-
-
-        # transcribe_btn.click(fn=transcribe,
-        #                      inputs=[seed, input_audio],
-        #                      outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-        # align_btn.click(fn=align,
-        #                 inputs=[seed, original_transcript, input_audio],
-        #                 outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-
-        # run_btn.click(fn=run,
-        #               inputs=[
-        #                   seed, sub_amount, ssrspeech_model_choice,
-        #                   codec_audio_sr, codec_sr,
-        #                   top_k, top_p, temperature,
-        #                   stop_repetition,
-        #                   kvcache, silence_tokens, aug_text, cfg_coef,
-        #                   input_audio, transcribe_state, original_transcript, transcript,
-        #                   mode, sentence_selector, audio_tensors
-        #               ],
-        #               outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
-
-        # sentence_selector.change(fn=load_sentence,
-        #                          inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-        #                          outputs=[sentence_audio])
-        # rerun_btn.click(fn=run,
-        #                 inputs=[
-        #                     seed, sub_amount, ssrspeech_model_choice,
-        #                     codec_audio_sr, codec_sr,
-        #                     top_k, top_p, temperature,
-        #                     stop_repetition,
-        #                     kvcache, silence_tokens, aug_text, cfg_coef,
-        #                     input_audio, transcribe_state, original_transcript, transcript,
-        #                     gr.State(value="Rerun"), sentence_selector, audio_tensors
-        #                 ],
-        #                 outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
+                with gr.Group():
+                    original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
+                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
+                    with gr.Accordion("Word start time", open=False):
+                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
+                    with gr.Accordion("Word end time", open=False):
+                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
+
+                transcribe_btn = gr.Button(value="Transcribe")
+                align_btn = gr.Button(value="Align")
+
+            with gr.Column(scale=3):
+                with gr.Group():
+                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
+
+                    with gr.Row():
+                        mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
+
+                    run_btn = gr.Button(value="Run")
+
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio")
+                with gr.Accordion("Inference transcript", open=False):
+                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
+                                                      info="Inference was performed on this transcript.")
+                with gr.Group(visible=False) as long_tts_sentence_editor:
+                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
+                                                    info="Select sentence you want to regenerate")
+                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
+                    rerun_btn = gr.Button(value="Rerun")
+
+        with gr.Row():
+            with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
+                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
+                                           info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
+                seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
+                kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
+                                   info="set to 0 to use less VRAM, but with slower inference")
+                aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
+                                    info="set to 1 to use cfg")
+                cfg_coef = gr.Number(label="cfg_coef", value=1.5,
+                                     info="cfg guidance scale, 1.5 is a good value")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
+                top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
+
+        success_output = gr.HTML()
+        audio_tensors = gr.State()
+        transcribe_state = gr.State(value={"words_info": demo_words_info})
+
+        load_models_btn.click(fn=load_models,
+                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
+                              outputs=[models_selector])
+
+
+        transcribe_btn.click(fn=transcribe,
+                             inputs=[seed, input_audio],
+                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+        align_btn.click(fn=align,
+                        inputs=[seed, original_transcript, input_audio],
+                        outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+
+        run_btn.click(fn=run,
+                      inputs=[
+                          seed, sub_amount, ssrspeech_model_choice,
+                          codec_audio_sr, codec_sr,
+                          top_k, top_p, temperature,
+                          stop_repetition,
+                          kvcache, silence_tokens, aug_text, cfg_coef,
+                          input_audio, transcribe_state, original_transcript, transcript,
+                          mode, sentence_selector, audio_tensors
+                      ],
+                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
+
+        sentence_selector.change(fn=load_sentence,
+                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
+                                 outputs=[sentence_audio])
+        rerun_btn.click(fn=run,
+                        inputs=[
+                            seed, sub_amount, ssrspeech_model_choice,
+                            codec_audio_sr, codec_sr,
+                            top_k, top_p, temperature,
+                            stop_repetition,
+                            kvcache, silence_tokens, aug_text, cfg_coef,
+                            input_audio, transcribe_state, original_transcript, transcript,
+                            gr.State(value="Rerun"), sentence_selector, audio_tensors
+                        ],
+                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
 
     return app
 
@@ -577,6 +574,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
 
     app = get_app()
-    print('debug5')
     app.queue().launch(share=args.share, server_port=args.port)
-    print('debug6')
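
For context on the pattern this commit re-enables (not part of the diff itself): each `gr.Button.click(fn=..., inputs=[...], outputs=[...])` call binds a Python handler to a UI event, and `gr.State` carries values such as `audio_tensors` and `transcribe_state` between handlers without rendering them. A minimal runnable sketch of that wiring, with hypothetical handler and component names:

```python
# Minimal sketch of the Gradio Blocks event-wiring pattern used in app.py.
# The handler and component names here are illustrative, not from the repo.
import gradio as gr

def transcribe(audio_path, state):
    # Placeholder handler: the real app runs whisperX ASR here.
    state = dict(state or {}, last_audio=audio_path)
    return f"transcript of {audio_path}", state

with gr.Blocks() as app:
    input_audio = gr.Audio(type="filepath", label="Input Audio")
    transcript = gr.Textbox(label="Transcript", interactive=False)
    transcribe_state = gr.State(value={})  # hidden, passed between events
    transcribe_btn = gr.Button(value="Transcribe")

    # inputs/outputs map component values to the handler's arguments
    # and return values, positionally.
    transcribe_btn.click(fn=transcribe,
                         inputs=[input_audio, transcribe_state],
                         outputs=[transcript, transcribe_state])

if __name__ == "__main__":
    app.queue().launch()
```

Passing `gr.State(value="Rerun")` as an input, as the rerun handler above does, is a way to reuse one handler (`run`) for two buttons while pinning the mode argument to a constant.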