CosyVoice-300M

Runtime error

wenmengzhou committed on Jul 20, 2024

Commit

73c1b13

verified ·

1 Parent(s): 94b950a

Update css/custom.py

Files changed (1) hide show

css/custom.py CHANGED Viewed

@@ -14,6 +14,8 @@ def custom():
     @spaces.GPU
     def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
                        _synthetic_input_textbox, _seed):
         print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)
         if _synthetic_input_textbox == '':
             gr.Warning('合成文本为空，您是否忘记输入合成文本？')
@@ -24,11 +26,18 @@ def custom():
         else:
             model = cosyvoice
         prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
         if _language_radio == 'cross' or _prompt_input_textbox == '':
             output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
         else:
             output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)
         audio_data = postprocess(output['tts_speech']).numpy().flatten()
         return (target_sr, audio_data)
     with gr.Column():

     @spaces.GPU
     def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
                        _synthetic_input_textbox, _seed):
+        import time
+        t1 = time.time()
         print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)
         if _synthetic_input_textbox == '':
             gr.Warning('合成文本为空，您是否忘记输入合成文本？')
         else:
             model = cosyvoice
         prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
+        t2 = time.time()
         if _language_radio == 'cross' or _prompt_input_textbox == '':
             output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
         else:
             output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)
+        t3 = time.time()
         audio_data = postprocess(output['tts_speech']).numpy().flatten()
+        t4 = time.time()
+        print(f'load and preprocess time: {t2-t1}s')
+        print(f'inference time: {t3-t2}s')
+        print(f'postprocess time: {t4-t3}s')
         return (target_sr, audio_data)
     with gr.Column():