Spaces: Running on Zero
Merge pull request #182 from v3ucn/speed_change_sox_version
Browse files
- README.md +1 -1
- cosyvoice/utils/file_utils.py +12 -0
- webui.py +15 -6
README.md
CHANGED
@@ -156,4 +156,4 @@ You can also scan the QR code to join our official Dingding chat group.
|
|
156 |
5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
|
157 |
|
158 |
## Disclaimer
|
159 |
-
The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
|
|
|
156 |
5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
|
157 |
|
158 |
## Disclaimer
|
159 |
+
The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
|
cosyvoice/utils/file_utils.py
CHANGED
@@ -39,3 +39,15 @@ def load_wav(wav, target_sr):
|
|
39 |
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
40 |
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
41 |
return speech
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
|
40 |
speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
|
41 |
return speech
|
42 |
+
|
43 |
+
def speed_change(waveform, sample_rate, speed_factor: str):
|
44 |
+
effects = [
|
45 |
+
["tempo", speed_factor], # speed_factor
|
46 |
+
["rate", f"{sample_rate}"]
|
47 |
+
]
|
48 |
+
augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
|
49 |
+
waveform,
|
50 |
+
sample_rate,
|
51 |
+
effects
|
52 |
+
)
|
53 |
+
return augmented_waveform, new_sample_rate
|
webui.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
|
2 |
#
|
3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
# you may not use this file except in compliance with the License.
|
@@ -28,7 +28,7 @@ import logging
|
|
28 |
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
29 |
|
30 |
from cosyvoice.cli.cosyvoice import CosyVoice
|
31 |
-
from cosyvoice.utils.file_utils import load_wav
|
32 |
|
33 |
logging.basicConfig(level=logging.DEBUG,
|
34 |
format='%(asctime)s %(levelname)s %(message)s')
|
@@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成
|
|
66 |
def change_instruction(mode_checkbox_group):
|
67 |
return instruct_dict[mode_checkbox_group]
|
68 |
|
69 |
-
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
|
70 |
if prompt_wav_upload is not None:
|
71 |
prompt_wav = prompt_wav_upload
|
72 |
elif prompt_wav_record is not None:
|
@@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
|
|
132 |
logging.info('get instruct inference request')
|
133 |
set_all_random_seed(seed)
|
134 |
output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
return (target_sr, audio_data)
|
137 |
|
138 |
def main():
|
@@ -141,7 +150,7 @@ def main():
|
|
141 |
gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
|
142 |
|
143 |
tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
|
144 |
-
|
145 |
with gr.Row():
|
146 |
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
|
147 |
instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
|
@@ -162,7 +171,7 @@ def main():
|
|
162 |
|
163 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
164 |
generate_button.click(generate_audio,
|
165 |
-
inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed],
|
166 |
outputs=[audio_output])
|
167 |
mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
|
168 |
demo.queue(max_size=4, default_concurrency_limit=2)
|
|
|
1 |
+
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
|
2 |
#
|
3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
# you may not use this file except in compliance with the License.
|
|
|
28 |
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
29 |
|
30 |
from cosyvoice.cli.cosyvoice import CosyVoice
|
31 |
+
from cosyvoice.utils.file_utils import load_wav, speed_change
|
32 |
|
33 |
logging.basicConfig(level=logging.DEBUG,
|
34 |
format='%(asctime)s %(levelname)s %(message)s')
|
|
|
66 |
def change_instruction(mode_checkbox_group):
|
67 |
return instruct_dict[mode_checkbox_group]
|
68 |
|
69 |
+
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor):
|
70 |
if prompt_wav_upload is not None:
|
71 |
prompt_wav = prompt_wav_upload
|
72 |
elif prompt_wav_record is not None:
|
|
|
132 |
logging.info('get instruct inference request')
|
133 |
set_all_random_seed(seed)
|
134 |
output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
|
135 |
+
|
136 |
+
if speed_factor != 1.0:
|
137 |
+
try:
|
138 |
+
audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor))
|
139 |
+
audio_data = audio_data.numpy().flatten()
|
140 |
+
except Exception as e:
|
141 |
+
print(f"Failed to change speed of audio: \n{e}")
|
142 |
+
else:
|
143 |
+
audio_data = output['tts_speech'].numpy().flatten()
|
144 |
+
|
145 |
return (target_sr, audio_data)
|
146 |
|
147 |
def main():
|
|
|
150 |
gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
|
151 |
|
152 |
tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
|
153 |
+
speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
|
154 |
with gr.Row():
|
155 |
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
|
156 |
instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
|
|
|
171 |
|
172 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
173 |
generate_button.click(generate_audio,
|
174 |
+
inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor],
|
175 |
outputs=[audio_output])
|
176 |
mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
|
177 |
demo.queue(max_size=4, default_concurrency_limit=2)
|