CosyVoice committed on
Commit
2895d99
2 Parent(s): ead9644 4042a65

Merge pull request #182 from v3ucn/speed_change_sox_version

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. cosyvoice/utils/file_utils.py +12 -0
  3. webui.py +15 -6
README.md CHANGED
@@ -156,4 +156,4 @@ You can also scan the QR code to join our official Dingding chat group.
156
  5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
157
 
158
  ## Disclaimer
159
- The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
 
156
  5. We borrowed a lot of code from [WeNet](https://github.com/wenet-e2e/wenet).
157
 
158
  ## Disclaimer
159
+ The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
cosyvoice/utils/file_utils.py CHANGED
@@ -39,3 +39,15 @@ def load_wav(wav, target_sr):
39
  assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
40
  speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
41
  return speech
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  assert sample_rate > target_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
40
  speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
41
  return speech
42
+
43
def speed_change(waveform, sample_rate, speed_factor: str):
    """Change the speed of ``waveform`` using the SoX ``tempo`` effect.

    Args:
        waveform: Audio tensor handed unchanged to
            ``torchaudio.sox_effects.apply_effects_tensor``
            (presumably a 2-D CPU tensor, as that API expects -- TODO confirm
            against the caller).
        sample_rate: Sample rate of ``waveform``. It is also the argument of
            the trailing ``rate`` effect, so the output is requested at the
            same rate as the input.
        speed_factor: Tempo multiplier. Annotated as ``str`` for backward
            compatibility, but any value that ``float()`` can parse (for
            example a ``float`` coming straight from a UI slider) is accepted.

    Returns:
        The ``(augmented_waveform, new_sample_rate)`` pair returned by
        ``apply_effects_tensor``.

    Raises:
        ValueError: If ``speed_factor`` is not numeric, or is not a finite
            number greater than zero.
    """
    try:
        factor = float(speed_factor)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"speed_factor must be a number, got {speed_factor!r}"
        ) from err

    # The chained comparison is False for NaN and +inf as well as for
    # values <= 0, so one check rejects every unusable factor.
    if not 0 < factor < float("inf"):
        raise ValueError(
            f"speed_factor must be a finite number greater than 0, got {speed_factor!r}"
        )

    # SoX effect arguments have to be strings; normalising through float()
    # lets callers pass either "1.5" or 1.5.
    effects = [
        ["tempo", str(factor)],
        ["rate", f"{sample_rate}"],
    ]
    augmented_waveform, new_sample_rate = torchaudio.sox_effects.apply_effects_tensor(
        waveform,
        sample_rate,
        effects,
    )
    return augmented_waveform, new_sample_rate
webui.py CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
  # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ import logging
28
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
29
 
30
  from cosyvoice.cli.cosyvoice import CosyVoice
31
- from cosyvoice.utils.file_utils import load_wav
32
 
33
  logging.basicConfig(level=logging.DEBUG,
34
  format='%(asctime)s %(levelname)s %(message)s')
@@ -66,7 +66,7 @@ instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成
66
  def change_instruction(mode_checkbox_group):
67
  return instruct_dict[mode_checkbox_group]
68
 
69
- def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed):
70
  if prompt_wav_upload is not None:
71
  prompt_wav = prompt_wav_upload
72
  elif prompt_wav_record is not None:
@@ -132,7 +132,16 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
132
  logging.info('get instruct inference request')
133
  set_all_random_seed(seed)
134
  output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
135
- audio_data = output['tts_speech'].numpy().flatten()
 
 
 
 
 
 
 
 
 
136
  return (target_sr, audio_data)
137
 
138
  def main():
@@ -141,7 +150,7 @@ def main():
141
  gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
142
 
143
  tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
144
-
145
  with gr.Row():
146
  mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
147
  instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
@@ -162,7 +171,7 @@ def main():
162
 
163
  seed_button.click(generate_seed, inputs=[], outputs=seed)
164
  generate_button.click(generate_audio,
165
- inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed],
166
  outputs=[audio_output])
167
  mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
168
  demo.queue(max_size=4, default_concurrency_limit=2)
 
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
  # you may not use this file except in compliance with the License.
 
28
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
29
 
30
  from cosyvoice.cli.cosyvoice import CosyVoice
31
+ from cosyvoice.utils.file_utils import load_wav, speed_change
32
 
33
  logging.basicConfig(level=logging.DEBUG,
34
  format='%(asctime)s %(levelname)s %(message)s')
 
66
  def change_instruction(mode_checkbox_group):
67
  return instruct_dict[mode_checkbox_group]
68
 
69
+ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor):
70
  if prompt_wav_upload is not None:
71
  prompt_wav = prompt_wav_upload
72
  elif prompt_wav_record is not None:
 
132
  logging.info('get instruct inference request')
133
  set_all_random_seed(seed)
134
  output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
135
+
136
+ if speed_factor != 1.0:
137
+ try:
138
+ audio_data, sample_rate = speed_change(output["tts_speech"], target_sr, str(speed_factor))
139
+ audio_data = audio_data.numpy().flatten()
140
+ except Exception as e:
141
+ print(f"Failed to change speed of audio: \n{e}")
142
+ else:
143
+ audio_data = output['tts_speech'].numpy().flatten()
144
+
145
  return (target_sr, audio_data)
146
 
147
  def main():
 
150
  gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
151
 
152
  tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
153
+ speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
154
  with gr.Row():
155
  mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
156
  instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
 
171
 
172
  seed_button.click(generate_seed, inputs=[], outputs=seed)
173
  generate_button.click(generate_audio,
174
+ inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor],
175
  outputs=[audio_output])
176
  mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
177
  demo.queue(max_size=4, default_concurrency_limit=2)