Spaces:

tanbw
/

CosyVoice

Running on Zero

App Files Community

CosyVoice committed on Jul 22

Commit

2895d99

•

2 Parent(s): ead9644 4042a65

Merge pull request #182 from v3ucn/speed_change_sox_version

Browse files

Files changed (3) hide show

README.md +1 -1
cosyvoice/utils/file_utils.py +12 -0
webui.py +15 -6

README.md CHANGED Viewed

@@ -156,4 +156,4 @@ You can also scan the QR code to join our official Dingding chat group.
 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
 ## Disclaimer
-The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

 5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
 ## Disclaimer
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.

cosyvoice/utils/file_utils.py CHANGED Viewed

@@ -39,3 +39,15 @@ def load_wav(wav, target_sr):
         assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
     return speech

         assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
     return speech
+# Apply a SoX effect chain to a waveform to change its speed.
+#
+# Args:
+#     waveform: audio tensor handed unchanged to
+#         torchaudio.sox_effects.apply_effects_tensor
+#         (presumably shaped (channels, time) -- TODO confirm against torchaudio docs).
+#     sample_rate: sample rate of `waveform`; it is also formatted into the
+#         "rate" effect below, so the output is requested at this same rate.
+#     speed_factor: argument of the "tempo" effect. Annotated as str because
+#         it is placed directly into the effects list, so callers must
+#         stringify numeric values first.
+#
+# Returns:
+#     The (augmented_waveform, new_sample_rate) pair exactly as returned by
+#     apply_effects_tensor.
+def speed_change(waveform, sample_rate, speed_factor: str):
+    effects = [
+        ["tempo", speed_factor],  # "tempo" effect scaled by speed_factor (NOTE(review): SoX docs say tempo keeps pitch -- confirm)
+        ["rate", f"{sample_rate}"]
+    ]
+    augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
+        waveform,
+        sample_rate,
+        effects
+    )
+    return augmented_waveform, new_sample_rate

webui.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import logging
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 from cosyvoice.cli.cosyvoice import CosyVoice
-from cosyvoice.utils.file_utils import load_wav
 logging.basicConfig(level=logging.DEBUG,
                     format='%(asctime)s %(levelname)s %(message)s')
@@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
-def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
@@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
         logging.info('get instruct inference request')
         set_all_random_seed(seed)
         output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
-    audio_data = output['tts_speech'].numpy().flatten()
     return (target_sr, audio_data)
 def main():
@@ -141,7 +150,7 @@ def main():
         gr.Markdown("#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作")
         tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型，提供舒适自然的语音合成能力。")
         with gr.Row():
             mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
             instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
@@ -162,7 +171,7 @@ def main():
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
-                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed],
                               outputs=[audio_output])
         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
     demo.queue(max_size=4, default_concurrency_limit=2)

+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 from cosyvoice.cli.cosyvoice import CosyVoice
+from cosyvoice.utils.file_utils import load_wav, speed_change
 logging.basicConfig(level=logging.DEBUG,
                     format='%(asctime)s %(levelname)s %(message)s')
 def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
+def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor):
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
         logging.info('get instruct inference request')
         set_all_random_seed(seed)
         output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
+    if speed_factor != 1.0:
+        try:
+            audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor))
+            audio_data = audio_data.numpy().flatten()
+        except Exception as e:
+            print(f"Failed to change speed of audio: \n{e}")
+    else:
+        audio_data = output['tts_speech'].numpy().flatten()
     return (target_sr, audio_data)
 def main():
         gr.Markdown("#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作")
         tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型，提供舒适自然的语音合成能力。")
+        speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
         with gr.Row():
             mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
             instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
+                              inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor],
                               outputs=[audio_output])
         mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
     demo.queue(max_size=4, default_concurrency_limit=2)