OpenSound committed
Commit 525eeaf · verified · 1 Parent(s): bc21a92

Update app.py

Files changed (1):
  1. app.py +20 -17
app.py CHANGED
@@ -186,7 +186,7 @@ def transcribe_en(audio_path):
     _, segments = align_en(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
-
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -203,6 +203,7 @@ def transcribe_zh(audio_path):
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
     converter = opencc.OpenCC('t2s')
     state["transcript"] = converter.convert(state["transcript"])
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -215,7 +216,7 @@ def align_en(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
     return state, segments
 
 
@@ -226,7 +227,7 @@ def align_zh(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
    return state, segments
 
 
@@ -337,7 +338,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
 
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message
 
@@ -386,6 +387,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)
 
     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)
 
@@ -423,7 +425,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-    audio_path = audio_path.replace('.','_tmp.')
+
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
 
     [new_transcript, new_segments, _, _] = transcribe_en(audio_path)
@@ -438,7 +440,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message
 
@@ -536,7 +538,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message
 
@@ -588,6 +590,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)
 
     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
 
@@ -627,7 +630,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-    audio_path = audio_path.replace('.','_tmp.')
+
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
 
     [new_transcript, new_segments, _,_] = transcribe_zh(audio_path)
@@ -645,7 +648,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message
 
@@ -724,7 +727,7 @@ if __name__ == "__main__":
 
     with gr.Row():
         with gr.Accordion("Advanced Settings", open=False):
-            seed1 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+            seed1 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
             aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                  info="set to 1 to use classifer-free guidance, change if you don't like the results")
             cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
@@ -780,7 +783,7 @@ if __name__ == "__main__":
 
     with gr.Row():
         with gr.Accordion("Advanced Settings", open=False):
-            seed2 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+            seed2 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
             aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                  info="set to 1 to use classifer-free guidance, change if you don't like the results")
             cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
@@ -834,13 +837,13 @@ if __name__ == "__main__":
 
     with gr.Row():
         with gr.Accordion("Advanced Settings", open=False):
-            seed3 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+            seed3 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
             aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                  info="set to 1 to use classifer-free guidance, change if you don't like the results")
             cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                   info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-            cfg_stride3 = gr.Number(label="cfg_stride", value=3,
-                                    info="cfg stride, 3 is a good value for Mandarin, change if you don't like the results")
+            cfg_stride3 = gr.Number(label="cfg_stride", value=1,
+                                    info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
             prompt_length3 = gr.Number(label="prompt_length", value=3,
                                        info="used for tts prompt, will automatically cut the prompt audio to this length")
             sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -888,13 +891,13 @@ if __name__ == "__main__":
 
     with gr.Row():
         with gr.Accordion("Advanced Settings", open=False):
-            seed4 = gr.Number(label="seed", value=2024, precision=0, info="random seeds always works :)")
+            seed4 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
             aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                  info="set to 1 to use classifer-free guidance, change if you don't like the results")
             cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                   info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-            cfg_stride4 = gr.Number(label="cfg_stride", value=3,
-                                    info="cfg stride, 3 is a good value for Mandarin, change if you don't like the results")
+            cfg_stride4 = gr.Number(label="cfg_stride", value=1,
+                                    info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
             prompt_length4 = gr.Number(label="prompt_length", value=3,
                                        info="used for tts prompt, will automatically cut the prompt audio to this length")
             sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
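Note on the recurring change: each transcription, alignment, and inference path now ends with torch.cuda.empty_cache(), which returns PyTorch's cached-but-unused GPU blocks to the driver, presumably to keep peak memory down when several models share one GPU. A minimal sketch of the pattern (run_step, model, and batch are generic placeholders, not this app's actual functions):

    import torch

    def run_step(model, batch):
        # Forward pass without an autograd graph, to avoid keeping activations.
        with torch.no_grad():
            output = model(batch)
        # Move the result off the GPU so it no longer pins device memory.
        output = output.cpu()
        # empty_cache() only trims the allocator's cache; tensors that are
        # still referenced stay allocated. Call it after the step completes.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return output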
 
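Note on the path handling in run_tts_en / run_tts_zh: the rename audio_path = audio_path.replace('.','_tmp.') moves from after inference to before sf.write, so the trimmed prompt is written to a sibling *_tmp.* file and the user's uploaded audio is left untouched; since audio_path then already points at the temp copy, the old in-place rename before torchaudio.save is dropped. A minimal sketch of the idea (write_trimmed_copy is a hypothetical helper, not part of app.py):

    import soundfile as sf

    def write_trimmed_copy(audio_path, audio, sr=16000):
        # 'clip.wav' -> 'clip_tmp.wav': keep the original upload intact.
        # Caveat: str.replace substitutes EVERY dot, so this assumes the
        # only dot in the path is the extension separator.
        tmp_path = audio_path.replace('.', '_tmp.')
        sf.write(tmp_path, audio, sr)
        return tmp_path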