Update app.py
app.py CHANGED
@@ -186,7 +186,7 @@ def transcribe_en(audio_path):
     _, segments = align_en(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
-
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -203,6 +203,7 @@ def transcribe_zh(audio_path):
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
     converter = opencc.OpenCC('t2s')
     state["transcript"] = converter.convert(state["transcript"])
+    torch.cuda.empty_cache()
     return [
         state["transcript"], state['segments'],
         state, success_message
@@ -215,7 +216,7 @@ def align_en(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
     return state, segments


@@ -226,7 +227,7 @@ def align_zh(segments, audio_path):
     audio = load_audio(audio_path)
     segments = align_func(segments, align_model, metadata, audio, device, return_char_alignments=False)["segments"]
     state = get_transcribe_state(segments)
-
+    torch.cuda.empty_cache()
     return state, segments


@@ -337,7 +338,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,

     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -386,6 +387,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)

     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_en(audio_path)

@@ -423,7 +425,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-
+
     torchaudio.save(audio_path, new_audio, codec_audio_sr)

     [new_transcript, new_segments, _, _] = transcribe_en(audio_path)
@@ -438,7 +440,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -536,7 +538,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     # torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -588,6 +590,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     cut_length = min(item['end'], cut_length)

     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+    audio_path = audio_path.replace('.','_tmp.')
     sf.write(audio_path, audio, 16000)
     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)

@@ -627,7 +630,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-
+
    torchaudio.save(audio_path, new_audio, codec_audio_sr)

     [new_transcript, new_segments, _,_] = transcribe_zh(audio_path)
@@ -645,7 +648,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-
+    torch.cuda.empty_cache()
     success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
     return output_audio, success_message

@@ -724,7 +727,7 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed1 = gr.Number(label="seed", value=
+                seed1 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text1 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef1 = gr.Number(label="cfg_coef", value=1.5,
@@ -780,7 +783,7 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed2 = gr.Number(label="seed", value=
+                seed2 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text2 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef2 = gr.Number(label="cfg_coef", value=1.5,
@@ -834,13 +837,13 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed3 = gr.Number(label="seed", value=
+                seed3 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text3 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                       info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                cfg_stride3 = gr.Number(label="cfg_stride", value=
-                                        info="cfg stride,
+                cfg_stride3 = gr.Number(label="cfg_stride", value=1,
+                                        info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                 prompt_length3 = gr.Number(label="prompt_length", value=3,
                                            info="used for tts prompt, will automatically cut the prompt audio to this length")
                 sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -888,13 +891,13 @@ if __name__ == "__main__":

         with gr.Row():
             with gr.Accordion("Advanced Settings", open=False):
-                seed4 = gr.Number(label="seed", value=
+                seed4 = gr.Number(label="seed", value=1234, precision=0, info="random seeds always works :)")
                 aug_text4 = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                      info="set to 1 to use classifer-free guidance, change if you don't like the results")
                 cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                       info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                cfg_stride4 = gr.Number(label="cfg_stride", value=
-                                        info="cfg stride,
+                cfg_stride4 = gr.Number(label="cfg_stride", value=1,
+                                        info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                 prompt_length4 = gr.Number(label="prompt_length", value=3,
                                            info="used for tts prompt, will automatically cut the prompt audio to this length")
                 sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
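The change repeated across the transcribe/align/edit/TTS handlers is the same: a torch.cuda.empty_cache() call just before each return. A minimal, self-contained sketch of that pattern follows; the helper names here are illustrative and not taken from app.py. empty_cache() only hands cached, no-longer-referenced CUDA blocks back to the driver, which helps keep the footprint small between requests on a shared GPU (this Space runs on Zero).

    import torch

    def gpu_step(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for the real GPU work (transcription, alignment, codec inference).
        if torch.cuda.is_available():
            return (x.cuda() * 2).cpu()
        return x * 2

    def handler(x: torch.Tensor) -> torch.Tensor:
        out = gpu_step(x)
        # Same pattern as the commit: release unused cached CUDA memory
        # before returning control to Gradio. Tensors still referenced
        # (like `out`) are unaffected.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return out

    print(handler(torch.ones(4)))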
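The other functional change, in run_tts_en and run_tts_zh, writes the trimmed 16 kHz prompt to a sibling *_tmp file instead of overwriting the uploaded audio. A sketch of that step, with the wrapper function being a hypothetical convenience rather than code from app.py:

    import librosa
    import soundfile as sf

    def write_trimmed_prompt(audio_path: str, cut_length: float) -> str:
        # Keep only the first cut_length seconds, resampled to 16 kHz.
        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
        # Same renaming as the commit: "prompt.wav" -> "prompt_tmp.wav".
        # Note that str.replace rewrites every '.', so this assumes the
        # path contains a single dot (the extension).
        tmp_path = audio_path.replace('.', '_tmp.')
        sf.write(tmp_path, audio, 16000)
        return tmp_path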