OpenSound committed
Commit 4927550 · 1 Parent(s): 97bf543

Update app.py

Files changed (1)
  1. app.py +95 -100
app.py CHANGED
@@ -8,13 +8,11 @@ from data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-print('debug1')
 from edit_utils_zh import parse_edit_zh
 from edit_utils_en import parse_edit_en
 from edit_utils_zh import parse_tts_zh
 from edit_utils_en import parse_tts_en
 from inference_scale import inference_one_sample
-print('debug2')
 import librosa
 import soundfile as sf
 from models import ssr
@@ -25,7 +23,6 @@ import uuid
 import spaces
 import nltk
 nltk.download('punkt')
-print('debug3')
 
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
@@ -460,101 +457,101 @@ def get_app():
                                                   choices=[None, "base.en", "small.en", "medium.en", "large"])
             align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
-        # with gr.Row():
-        #     with gr.Column(scale=2):
-        #         input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
-        #         with gr.Group():
-        #             original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-        #                                              info="Use whisperx model to get the transcript. Fix and align it if necessary.")
-        #             with gr.Accordion("Word start time", open=False):
-        #                 transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-        #             with gr.Accordion("Word end time", open=False):
-        #                 transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
-
-        #         transcribe_btn = gr.Button(value="Transcribe")
-        #         align_btn = gr.Button(value="Align")
-
-        #     with gr.Column(scale=3):
-        #         with gr.Group():
-        #             transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
-
-        #             with gr.Row():
-        #                 mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
-
-        #             run_btn = gr.Button(value="Run")
-
-        #     with gr.Column(scale=2):
-        #         output_audio = gr.Audio(label="Output Audio")
-        #         with gr.Accordion("Inference transcript", open=False):
-        #             inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
-        #                                               info="Inference was performed on this transcript.")
-        #         with gr.Group(visible=False) as long_tts_sentence_editor:
-        #             sentence_selector = gr.Dropdown(label="Sentence", value=None,
-        #                                             info="Select sentence you want to regenerate")
-        #             sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-        #             rerun_btn = gr.Button(value="Rerun")
-
-        # with gr.Row():
-        #     with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
-        #         stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
-        #                                    info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
-        #         seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
-        #         kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
-        #                            info="set to 0 to use less VRAM, but with slower inference")
-        #         aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
-        #                             info="set to 1 to use cfg")
-        #         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
-        #                              info="cfg guidance scale, 1.5 is a good value")
-        #         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
-        #         top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
-        #         temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
-        #         top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-        #         codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-        #         codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
-        #         silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
-
-        # success_output = gr.HTML()
-        # audio_tensors = gr.State()
-        # transcribe_state = gr.State(value={"words_info": demo_words_info})
-
-        # load_models_btn.click(fn=load_models,
-        #                       inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
-        #                       outputs=[models_selector])
-
-
-        # transcribe_btn.click(fn=transcribe,
-        #                      inputs=[seed, input_audio],
-        #                      outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-        # align_btn.click(fn=align,
-        #                 inputs=[seed, original_transcript, input_audio],
-        #                 outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-
-        # run_btn.click(fn=run,
-        #               inputs=[
-        #                   seed, sub_amount, ssrspeech_model_choice,
-        #                   codec_audio_sr, codec_sr,
-        #                   top_k, top_p, temperature,
-        #                   stop_repetition,
-        #                   kvcache, silence_tokens, aug_text, cfg_coef,
-        #                   input_audio, transcribe_state, original_transcript, transcript,
-        #                   mode, sentence_selector, audio_tensors
-        #               ],
-        #               outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
-
-        # sentence_selector.change(fn=load_sentence,
-        #                          inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-        #                          outputs=[sentence_audio])
-        # rerun_btn.click(fn=run,
-        #                 inputs=[
-        #                     seed, sub_amount, ssrspeech_model_choice,
-        #                     codec_audio_sr, codec_sr,
-        #                     top_k, top_p, temperature,
-        #                     stop_repetition,
-        #                     kvcache, silence_tokens, aug_text, cfg_coef,
-        #                     input_audio, transcribe_state, original_transcript, transcript,
-        #                     gr.State(value="Rerun"), sentence_selector, audio_tensors
-        #                 ],
-        #                 outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
+                with gr.Group():
+                    original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
+                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
+                    with gr.Accordion("Word start time", open=False):
+                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
+                    with gr.Accordion("Word end time", open=False):
+                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
+
+                transcribe_btn = gr.Button(value="Transcribe")
+                align_btn = gr.Button(value="Align")
+
+            with gr.Column(scale=3):
+                with gr.Group():
+                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
+
+                    with gr.Row():
+                        mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
+
+                    run_btn = gr.Button(value="Run")
+
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio")
+                with gr.Accordion("Inference transcript", open=False):
+                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
+                                                      info="Inference was performed on this transcript.")
+                with gr.Group(visible=False) as long_tts_sentence_editor:
+                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
+                                                    info="Select sentence you want to regenerate")
+                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
+                    rerun_btn = gr.Button(value="Rerun")
+
+        with gr.Row():
+            with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
+                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
+                                           info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
+                seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
+                kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
+                                   info="set to 0 to use less VRAM, but with slower inference")
+                aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
+                                    info="set to 1 to use cfg")
+                cfg_coef = gr.Number(label="cfg_coef", value=1.5,
+                                     info="cfg guidance scale, 1.5 is a good value")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
+                top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
+
+        success_output = gr.HTML()
+        audio_tensors = gr.State()
+        transcribe_state = gr.State(value={"words_info": demo_words_info})
+
+        load_models_btn.click(fn=load_models,
+                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
+                              outputs=[models_selector])
+
+
+        transcribe_btn.click(fn=transcribe,
+                             inputs=[seed, input_audio],
+                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+        align_btn.click(fn=align,
+                        inputs=[seed, original_transcript, input_audio],
+                        outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
+
+        run_btn.click(fn=run,
+                      inputs=[
+                          seed, sub_amount, ssrspeech_model_choice,
+                          codec_audio_sr, codec_sr,
+                          top_k, top_p, temperature,
+                          stop_repetition,
+                          kvcache, silence_tokens, aug_text, cfg_coef,
+                          input_audio, transcribe_state, original_transcript, transcript,
+                          mode, sentence_selector, audio_tensors
+                      ],
+                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
+
+        sentence_selector.change(fn=load_sentence,
+                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
+                                 outputs=[sentence_audio])
+        rerun_btn.click(fn=run,
+                        inputs=[
+                            seed, sub_amount, ssrspeech_model_choice,
+                            codec_audio_sr, codec_sr,
+                            top_k, top_p, temperature,
+                            stop_repetition,
+                            kvcache, silence_tokens, aug_text, cfg_coef,
+                            input_audio, transcribe_state, original_transcript, transcript,
+                            gr.State(value="Rerun"), sentence_selector, audio_tensors
+                        ],
+                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
 
     return app
 
@@ -577,6 +574,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
 
     app = get_app()
-    print('debug5')
     app.queue().launch(share=args.share, server_port=args.port)
-    print('debug6')
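
For context on the pattern this commit re-enables (not part of the diff itself): each `gr.Button.click(fn=..., inputs=[...], outputs=[...])` call binds a Python handler to a UI event, and `gr.State` carries values such as `audio_tensors` and `transcribe_state` between handlers without rendering them. A minimal runnable sketch of that wiring, with hypothetical handler and component names:

```python
# Minimal sketch of the Gradio Blocks event-wiring pattern used in app.py.
# The handler and component names here are illustrative, not from the repo.
import gradio as gr

def transcribe(audio_path, state):
    # Placeholder handler: the real app runs whisperX ASR here.
    state = dict(state or {}, last_audio=audio_path)
    return f"transcript of {audio_path}", state

with gr.Blocks() as app:
    input_audio = gr.Audio(type="filepath", label="Input Audio")
    transcript = gr.Textbox(label="Transcript", interactive=False)
    transcribe_state = gr.State(value={})  # hidden, passed between events
    transcribe_btn = gr.Button(value="Transcribe")

    # inputs/outputs map component values to the handler's arguments
    # and return values, positionally.
    transcribe_btn.click(fn=transcribe,
                         inputs=[input_audio, transcribe_state],
                         outputs=[transcript, transcribe_state])

if __name__ == "__main__":
    app.queue().launch()
```

Passing `gr.State(value="Rerun")` as an input, as the rerun handler above does, is a way to reuse one handler (`run`) for two buttons while pinning the mode argument to a constant.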