Update app.py
app.py CHANGED
@@ -186,7 +186,7 @@ def transcribe_en(audio_path):
     _, segments = align_en(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
-
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -203,6 +203,7 @@ def transcribe_zh(audio_path):
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
     converter = opencc.OpenCC('t2s')
     state["transcript"] = converter.convert(state["transcript"])
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -215,7 +216,7 @@ def align_en(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
     return state, segments


@@ -226,7 +227,7 @@ def align_zh(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
     return state, segments


@@ -337,7 +338,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,

     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -386,6 +387,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)

     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)

@@ -423,7 +425,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-
+
     torchaudio.save(audio_path, new_audio, codec_audio_sr)

     [new_transcript, new_segments, _, _] = transcribe_en(audio_path)
@@ -438,7 +440,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -536,7 +538,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -588,6 +590,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)

     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)

@@ -627,7 +630,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-
+
    torchaudio.save(audio_path, new_audio, codec_audio_sr)

     [new_transcript, new_segments, _,_] = transcribe_zh(audio_path)
@@ -645,7 +648,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -724,7 +727,7 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed1 = gr.Number(label="seed", value=
+                seed1 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
@@ -780,7 +783,7 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed2 = gr.Number(label="seed", value=
+                seed2 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
@@ -834,13 +837,13 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed3 = gr.Number(label="seed", value=
+                seed3 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                       info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                cfg_stride3 = gr.Number(label="cfg_stride", value=
-                                        info="cfg stride,
+                cfg_stride3 = gr.Number(label="cfg_stride", value=1,
+                                        info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                 prompt_length3 = gr.Number(label="prompt_length", value=3,
                                            info="used for tts prompt, will automatically cut the prompt audio to this length")
                 sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -888,13 +891,13 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed4 = gr.Number(label="seed", value=
+                seed4 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                       info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                cfg_stride4 = gr.Number(label="cfg_stride", value=
-                                        info="cfg stride,
+                cfg_stride4 = gr.Number(label="cfg_stride", value=1,
+                                        info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                 prompt_length4 = gr.Number(label="prompt_length", value=3,
                                            info="used for tts prompt, will automatically cut the prompt audio to this length")
                 sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
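The change repeated across the transcribe/align/edit/TTS handlers is the same: a torch.cuda.empty_cache() call just before each return. A minimal, self-contained sketch of that pattern follows; the helper names here are illustrative and not taken from app.py. empty_cache() only hands cached, no-longer-referenced CUDA blocks back to the driver, which helps keep the footprint small between requests on a shared GPU (this Space runs on Zero).

    import torch

    def gpu_step(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for the real GPU work (transcription, alignment, codec inference).
        if torch.cuda.is_available():
            return (x.cuda() * 2).cpu()
        return x * 2

    def handler(x: torch.Tensor) -> torch.Tensor:
        out = gpu_step(x)
        # Same pattern as the commit: release unused cached CUDA memory
        # before returning control to Gradio. Tensors still referenced
        # (like `out`) are unaffected.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return out

    print(handler(torch.ones(4)))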
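The other functional change, in run_tts_en and run_tts_zh, writes the trimmed 16 kHz prompt to a sibling *_tmp file instead of overwriting the uploaded audio. A sketch of that step, with the wrapper function being a hypothetical convenience rather than code from app.py:

    import librosa
    import soundfile as sf

    def write_trimmed_prompt(audio_path: str, cut_length: float) -> str:
        # Keep only the first cut_length seconds, resampled to 16 kHz.
        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
        # Same renaming as the commit: "prompt.wav" -> "prompt_tmp.wav".
        # Note that str.replace rewrites every '.', so this assumes the
        # path contains a single dot (the extension).
        tmp_path = audio_path.replace('.', '_tmp.')
        sf.write(tmp_path, audio, 16000)
        return tmp_path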