CosyVoice committed on
Commit
ee98842
2 Parent(s): 18599be 6620129

Merge branch 'inference_streaming' into flow_tensorrt

README.md CHANGED
@@ -4,6 +4,36 @@
 
 For `SenseVoice`, visit [SenseVoice repo](https://github.com/FunAudioLLM/SenseVoice) and [SenseVoice space](https://www.modelscope.cn/studios/iic/SenseVoice).
 
+## Roadmap
+
+- [x] 2024/07
+
+    - [x] Flow matching training support
+    - [x] WeTextProcessing support when ttsfrd is not available
+    - [x] FastAPI server and client
+
+- [ ] 2024/08
+
+    - [ ] Repetition-Aware Sampling (RAS) inference for LLM stability
+    - [ ] Streaming inference mode support, including KV cache and SDPA for RTF optimization
+
+- [ ] 2024/09
+
+    - [ ] 50 Hz LLM model which supports 10 languages
+
+- [ ] 2024/10
+
+    - [ ] 50 Hz LLaMA-based LLM model which supports LoRA fine-tuning
+
+- [ ] TBD
+
+    - [ ] Support more instruction modes
+    - [ ] Voice conversion
+    - [ ] Music generation
+    - [ ] Training script sample based on Mandarin
+    - [ ] CosyVoice-500M trained with more multilingual data
+    - [ ] More...
+
 ## Install
 
 **Clone and install**
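
Since this merge lands the streaming inference mode, a minimal consumption sketch may help. The yielded `{'tts_speech': ...}` dicts and the `stream` flag follow `cosyvoice/cli/model.py` below; the model directory, the `inference_sft` signature, and the speaker id are assumptions taken from the public CosyVoice examples, not from this diff.

```python
# Hedged sketch: consume streaming synthesis chunk by chunk.
# The {'tts_speech': tensor} dicts match cosyvoice/cli/model.py in this diff;
# model dir, inference_sft signature, and speaker id are assumptions.
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
for i, chunk in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型。', '中文女', stream=True)):
    # each chunk arrives as soon as enough speech tokens have been decoded
    torchaudio.save('sft_stream_{}.wav'.format(i), chunk['tts_speech'], 22050)
```
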
cosyvoice/cli/cosyvoice.py CHANGED
@@ -43,7 +43,6 @@ class CosyVoice:
         if load_jit:
             self.model.load_jit('{}/llm.text_encoder.fp16.zip'.format(model_dir),
                                 '{}/llm.llm.fp16.zip'.format(model_dir))
-
         if load_trt:
             self.model.load_trt(model_dir, use_fp16)
 
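
The `load_jit`/`load_trt` switches touched here are constructor options. A hedged usage sketch follows; the flag names come from this diff, but the exact constructor signature and defaults are assumptions.

```python
# Hedged sketch: flag names from this diff; signature details assumed.
from cosyvoice.cli.cosyvoice import CosyVoice

# TorchScript llm modules plus a TensorRT flow estimator, both in fp16
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M',
                      load_jit=True, load_trt=True, use_fp16=True)
```
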
cosyvoice/cli/model.py CHANGED
@@ -137,7 +137,6 @@ class CosyVoiceModel:
         self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid], self.mel_overlap_dict[this_uuid], self.hift_cache_dict[this_uuid] = [], False, None, None
         p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
         p.start()
-        p.join()
         if stream is True:
             token_hop_len = self.token_min_hop_len
             while True:
@@ -158,7 +157,7 @@ class CosyVoiceModel:
                 token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
                 if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
                     break
-            # p.join()
+            p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
             this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
             with self.flow_hift_context:
@@ -171,7 +170,7 @@ class CosyVoiceModel:
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            # p.join()
+            p.join()
             this_tts_speech_token = torch.concat(self.tts_speech_token_dict[this_uuid], dim=1)
             with self.flow_hift_context:
                 this_tts_speech = self.token2wav(token=this_tts_speech_token,
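
Moving `p.join()` out of the hot path is what makes the mode actually stream: the LLM thread keeps producing speech tokens while the flow/HiFT stack consumes them hop by hop, and the main thread only waits for the producer before draining the tail. A standalone sketch of the pattern, using only the stdlib rather than the repo's API:

```python
# Standalone producer/consumer sketch of the pattern behind this change:
# joining right after start() would serialize LLM decoding and vocoding;
# joining only before the tail keeps them overlapped.
import queue
import threading
import time

tokens = queue.Queue()

def llm_job():  # stands in for the real llm_job filling the token dict
    for t in range(20):
        tokens.put(t)
        time.sleep(0.01)

p = threading.Thread(target=llm_job)
p.start()                              # note: no join here
while True:
    try:
        hop = [tokens.get(timeout=0.1)]  # consume a hop as it arrives
    except queue.Empty:
        break
    print('token2wav on hop', hop)
p.join()                               # wait only before handling the remainder
```
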
cosyvoice/flow/flow.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import random
 from typing import Dict, Optional
 import torch
 import torch.nn as nn
@@ -77,6 +78,11 @@ class MaskedDiffWithXvec(torch.nn.Module):
 
         # get conditions
         conds = torch.zeros(feat.shape, device=token.device)
+        for i, j in enumerate(feat_len):
+            if random.random() < 0.5:
+                continue
+            index = random.randint(0, int(0.3 * j))
+            conds[i, :index] = feat[i, :index]
         conds = conds.transpose(1, 2)
 
         mask = (~make_pad_mask(feat_len)).to(h)
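
The new loop trains the flow model to continue from a mel prefix: each sample stays unconditional with probability 0.5, otherwise a random prefix of up to 30% of its valid frames is copied into `conds`. A standalone replay of that logic on dummy tensors (shapes are assumptions for illustration):

```python
# Standalone replay of the prefix-conditioning added above, on dummy data.
import random

import torch

B, T, D = 4, 100, 80                        # batch, frames, mel bins (dummy shapes)
feat = torch.randn(B, T, D)
feat_len = torch.tensor([100, 80, 60, 40])  # valid frames per sample

conds = torch.zeros_like(feat)
for i, j in enumerate(feat_len):
    if random.random() < 0.5:               # 50%: train unconditionally
        continue
    index = random.randint(0, int(0.3 * j))
    conds[i, :index] = feat[i, :index]      # expose a prefix as the condition
conds = conds.transpose(1, 2)               # (B, D, T), as the estimator expects
```
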
cosyvoice/flow/flow_matching.py CHANGED
@@ -78,10 +78,10 @@ class ConditionalCFM(BASECFM):
         sol = []
 
         for step in range(1, len(t_span)):
-            dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
+            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
             # Classifier-Free Guidance inference introduced in VoiceBox
             if self.inference_cfg_rate > 0:
-                cfg_dphi_dt = self.forward_estimator(
+                cfg_dphi_dt = self.estimator(
                     x, mask,
                     torch.zeros_like(mu), t,
                     torch.zeros_like(spks) if spks is not None else None,
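
Both estimator calls feed the VoiceBox-style classifier-free guidance mix: the conditional velocity is amplified and the unconditional one subtracted. A hedged sketch of the blend; the formula mirrors Matcha-TTS's `ConditionalCFM`, from which this class derives, so read the exact expression from the surrounding code:

```python
# Hedged sketch of the CFG blend the two estimator calls above feed into.
import torch

def cfg_blend(dphi_dt: torch.Tensor, cfg_dphi_dt: torch.Tensor,
              inference_cfg_rate: float = 0.7) -> torch.Tensor:
    # amplify the conditional velocity, subtract the unconditional one
    return (1.0 + inference_cfg_rate) * dphi_dt - inference_cfg_rate * cfg_dphi_dt
```
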
cosyvoice/transformer/encoder.py CHANGED
@@ -299,7 +299,7 @@ class BaseEncoder(torch.nn.Module):
                rate.
             3. Currently, nn.Sequential is used to stack all the convolution
                layers in subsampling, we need to rewrite it to make it work
-               with cache, which is not prefered.
+               with cache, which is not preferred.
         Args:
             xs (torch.Tensor): (1, max_len, dim)
             chunk_size (int): decoding chunk size
examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml ADDED
@@ -0,0 +1,198 @@
+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1986]
+__set_seed2: !apply:numpy.random.seed [1986]
+__set_seed3: !apply:torch.manual_seed [1986]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+# fixed params
+sample_rate: 22050
+text_encoder_input_size: 512
+llm_input_size: 1024
+llm_output_size: 1024
+spk_embed_dim: 192
+
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:cosyvoice.llm.llm.TransformerLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    text_token_size: 51866
+    speech_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    spk_embed_dim: !ref <spk_embed_dim>
+    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        input_size: !ref <text_encoder_input_size>
+        output_size: 1024
+        attention_heads: 8
+        linear_units: 2048
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        use_cnn_module: False
+        macaron_style: False
+        use_dynamic_chunk: False
+        use_dynamic_left_chunk: False
+        static_chunk_size: 1
+    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
+        input_size: !ref <llm_input_size>
+        output_size: !ref <llm_output_size>
+        attention_heads: 8
+        linear_units: 2048
+        num_blocks: 7
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: 'linear_legacy'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        static_chunk_size: 1
+
+flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
+    input_size: 512
+    output_size: 80
+    spk_embed_dim: !ref <spk_embed_dim>
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 50
+    only_mask_loss: True
+    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 512
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
+        channels: 80
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        n_spks: 1
+        spk_emb_dim: 80
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+            in_channels: 320
+            out_channels: 80
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+
+hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+
+# processor functions
+parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+get_tokenizer: !name:whisper.tokenizer.get_tokenizer
+    multilingual: True
+    num_languages: 100
+    language: 'en'
+    task: 'transcribe'
+allowed_special: 'all'
+tokenize: !name:cosyvoice.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:cosyvoice.dataset.processor.filter
+    max_length: 40960
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:cosyvoice.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False
+compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:cosyvoice.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:cosyvoice.dataset.processor.sort
+    sort_size: 500 # sort_size should be less than shuffle_size
+batch: !name:cosyvoice.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 12000
+padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.002 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 25000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
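
The `!new:`/`!name:`/`!ref` tags are HyperPyYAML syntax, so the whole model graph can be instantiated from this file in one call. A sketch using standard `hyperpyyaml` usage; the relative path assumes you run from this example directory:

```python
# Sketch: instantiate the configured modules via HyperPyYAML.
from hyperpyyaml import load_hyperpyyaml

with open('conf/cosyvoice.fromscratch.yaml', 'r') as f:
    configs = load_hyperpyyaml(f)

llm, flow, hift = configs['llm'], configs['flow'], configs['hift']
print(type(llm).__name__, type(flow).__name__, type(hift).__name__)
```
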
examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml ADDED
@@ -0,0 +1,198 @@
+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1986]
+__set_seed2: !apply:numpy.random.seed [1986]
+__set_seed3: !apply:torch.manual_seed [1986]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+# fixed params
+sample_rate: 22050
+text_encoder_input_size: 512
+llm_input_size: 1024
+llm_output_size: 1024
+spk_embed_dim: 192
+
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:cosyvoice.llm.llm.TransformerLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    text_token_size: 51866
+    speech_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    spk_embed_dim: !ref <spk_embed_dim>
+    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        input_size: !ref <text_encoder_input_size>
+        output_size: 1024
+        attention_heads: 16
+        linear_units: 4096
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        use_cnn_module: False
+        macaron_style: False
+        use_dynamic_chunk: False
+        use_dynamic_left_chunk: False
+        static_chunk_size: 1
+    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
+        input_size: !ref <llm_input_size>
+        output_size: !ref <llm_output_size>
+        attention_heads: 16
+        linear_units: 4096
+        num_blocks: 14
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: 'linear_legacy'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        static_chunk_size: 1
+
+flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
+    input_size: 512
+    output_size: 80
+    spk_embed_dim: !ref <spk_embed_dim>
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 50
+    only_mask_loss: True
+    encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 8
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 512
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
+        channels: 80
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        n_spks: 1
+        spk_emb_dim: 80
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+            in_channels: 320
+            out_channels: 80
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 12
+            num_heads: 8
+            act_fn: 'gelu'
+
+hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+
+# processor functions
+parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+get_tokenizer: !name:whisper.tokenizer.get_tokenizer
+    multilingual: True
+    num_languages: 100
+    language: 'en'
+    task: 'transcribe'
+allowed_special: 'all'
+tokenize: !name:cosyvoice.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:cosyvoice.dataset.processor.filter
+    max_length: 40960
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:cosyvoice.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 8000
+    center: False
+compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:cosyvoice.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:cosyvoice.dataset.processor.sort
+    sort_size: 500 # sort_size should be less than shuffle_size
+batch: !name:cosyvoice.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 2000
+padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.001 # change to 1e-5 during sft
+    scheduler: warmuplr # change to constantlr during sft
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
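
The inline comments mark the knobs to flip for sft. One hedged way to apply them without editing the file is HyperPyYAML's `overrides` argument; the mechanism is standard `hyperpyyaml`, though whether `cosyvoice/bin/train.py` exposes it is an assumption.

```python
# Hedged sketch: apply the sft tweaks noted in the comments above via
# load-time overrides instead of editing the file.
from hyperpyyaml import load_hyperpyyaml

sft_overrides = """
train_conf:
    optim_conf:
        lr: 1.e-5
    scheduler: constantlr
"""
with open('conf/cosyvoice.yaml', 'r') as f:
    configs = load_hyperpyyaml(f, overrides=sft_overrides)
```
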
examples/magicdata-read/cosyvoice/conf/ds_stage2.json ADDED
@@ -0,0 +1,42 @@
+{
+    "train_micro_batch_size_per_gpu": 1,
+    "gradient_accumulation_steps": 1,
+    "steps_per_print": 100,
+    "gradient_clipping": 5,
+    "fp16": {
+        "enabled": false,
+        "auto_cast": false,
+        "loss_scale": 0,
+        "initial_scale_power": 16,
+        "loss_scale_window": 256,
+        "hysteresis": 2,
+        "consecutive_hysteresis": false,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": false
+    },
+    "zero_force_ds_cpu_optimizer": false,
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "none",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "overlap_comm": false,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 5e8,
+        "contiguous_gradients": true
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 0.001,
+            "weight_decay": 0.0001,
+            "torch_adam": true,
+            "adam_w_mode": true
+        }
+    }
+}
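
This ZeRO stage-2 config is only consumed when `train_engine` is set to `deepspeed` in `run.sh` below. A hedged sketch of how such a JSON is typically handed to DeepSpeed; the real wiring lives in `cosyvoice/bin/train.py` and may differ.

```python
# Hedged sketch of standard DeepSpeed usage with a JSON config like this one.
import deepspeed
import torch

model = torch.nn.Linear(80, 80)  # stand-in for the real llm module
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config='conf/ds_stage2.json')
```
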
examples/magicdata-read/cosyvoice/cosyvoice ADDED
@@ -0,0 +1 @@
+../../../cosyvoice
examples/magicdata-read/cosyvoice/local/download_and_untar.sh ADDED
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/68 dev_set"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev_set, test_set,"
+  echo "  train_set."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1
+fi
+
+part_ok=false
+list="dev_set test_set train_set"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1
+fi
+
+if [ -f $data/.$part.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0
+fi
+
+
+# Sizes of the archive files in bytes. These are from some older versions.
+sizes_old="1035537823 2201936013 52627842921"
+# sizes_new is the archive file sizes of the final release. Some of these sizes are of
+# things we probably won't download.
+sizes_new="3886385"
+
+if [ -f $data/$part.tar.gz ]; then
+  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tar.gz
+  else
+    echo "$data/$part.tar.gz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tar.gz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1
+  fi
+  full_url=$url/$part.tar.gz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  if ! wget -P $data --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1
+  fi
+fi
+
+if ! tar -C $data -xvzf $data/$part.tar.gz; then
+  echo "$0: error un-tarring archive $data/$part.tar.gz"
+  exit 1
+fi
+
+touch $data/.$part.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+  rm $data/$part.tar.gz
+fi
examples/magicdata-read/cosyvoice/local/prepare_data.py ADDED
@@ -0,0 +1,50 @@
+import argparse
+import logging
+import os
+from tqdm import tqdm
+
+
+logger = logging.getLogger()
+
+def main():
+    utt2wav, utt2text, utt2spk, spk2utt = {}, {}, {}, {}
+    with open(os.path.join(args.src_dir, "TRANS.txt"), "r") as f:
+        lines = f.readlines()[1:]
+        lines = [l.split('\t') for l in lines]
+    for wav, spk, content in tqdm(lines):
+        wav, spk, content = wav.strip(), spk.strip(), content.strip()
+        content = content.replace('[FIL]', '')
+        content = content.replace('[SPK]', '')
+        wav = os.path.join(args.src_dir, spk, wav)
+        if not os.path.exists(wav):
+            continue
+        utt = os.path.basename(wav).replace('.wav', '')
+        utt2wav[utt] = wav
+        utt2text[utt] = content
+        utt2spk[utt] = spk
+        if spk not in spk2utt:
+            spk2utt[spk] = []
+        spk2utt[spk].append(utt)
+
+    with open('{}/wav.scp'.format(args.des_dir), 'w') as f:
+        for k, v in utt2wav.items():
+            f.write('{} {}\n'.format(k, v))
+    with open('{}/text'.format(args.des_dir), 'w') as f:
+        for k, v in utt2text.items():
+            f.write('{} {}\n'.format(k, v))
+    with open('{}/utt2spk'.format(args.des_dir), 'w') as f:
+        for k, v in utt2spk.items():
+            f.write('{} {}\n'.format(k, v))
+    with open('{}/spk2utt'.format(args.des_dir), 'w') as f:
+        for k, v in spk2utt.items():
+            f.write('{} {}\n'.format(k, ' '.join(v)))
+    return
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--src_dir',
+                        type=str)
+    parser.add_argument('--des_dir',
+                        type=str)
+    args = parser.parse_args()
+    main()
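
For one utterance, the four Kaldi-style manifests written above would look roughly like this (paths and pairings are illustrative, not taken from the corpus):

```
data/train/wav.scp : 38_5718_20170915093303 /path/to/train/38_5718/38_5718_20170915093303.wav
data/train/text    : 38_5718_20170915093303 叫他明天早上差五分儿九点去机场
data/train/utt2spk : 38_5718_20170915093303 38_5718
data/train/spk2utt : 38_5718 38_5718_20170915093303 ...
```
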
examples/magicdata-read/cosyvoice/path.sh ADDED
@@ -0,0 +1,3 @@
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../:../../../third_party/Matcha-TTS:$PYTHONPATH
examples/magicdata-read/cosyvoice/run.sh ADDED
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+. ./path.sh || exit 1;
+
+stage=-1
+stop_stage=3
+
+data_url=www.openslr.org/resources/68
+data_dir=/mnt/hengwu.zty/data/tts/openslr/magicdata-read
+pretrained_model_dir=../../../pretrained_models/CosyVoice-300M
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "Data Download"
+  for part in dev_set test_set train_set; do
+    local/download_and_untar.sh ${data_dir} ${data_url} ${part}
+  done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt"
+  for x in dev test train; do
+    mkdir -p data/$x
+    python local/prepare_data.py --src_dir $data_dir/$x --des_dir data/$x
+  done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir"
+  for x in dev test train; do
+    tools/extract_embedding.py --dir data/$x \
+      --onnx_path $pretrained_model_dir/campplus.onnx
+  done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir"
+  for x in dev test train; do
+    tools/extract_speech_token.py --dir data/$x \
+      --onnx_path $pretrained_model_dir/speech_tokenizer_v1.onnx
+  done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt"
+  for x in dev test train; do
+    mkdir -p data/$x/parquet
+    tools/make_parquet_list.py --num_utts_per_parquet 1000 \
+      --num_processes 10 \
+      --src_dir data/$x \
+      --des_dir data/$x/parquet
+  done
+fi
+
+# inference
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "Run inference. Please make sure utt in tts_text is in prompt_data"
+  for mode in sft zero_shot; do
+    python cosyvoice/bin/inference.py --mode $mode \
+      --gpu 0 \
+      --config conf/cosyvoice.yaml \
+      --prompt_data data/test/parquet/data.list \
+      --prompt_utt2data data/test/parquet/utt2data.list \
+      --tts_text `pwd`/tts_text.json \
+      --llm_model $pretrained_model_dir/llm.pt \
+      --flow_model $pretrained_model_dir/flow.pt \
+      --hifigan_model $pretrained_model_dir/hift.pt \
+      --result_dir `pwd`/exp/cosyvoice/test/$mode
+  done
+fi
+
+# train llm
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+job_id=1986
+dist_backend="nccl"
+num_workers=2
+prefetch=100
+train_engine=torch_ddp
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  echo "Run train. We only support llm training for now. If you want to train from scratch, please use conf/cosyvoice.fromscratch.yaml"
+  if [ $train_engine == 'deepspeed' ]; then
+    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
+  fi
+  cp data/train/parquet/data.list data/train.data.list
+  cp data/dev/parquet/data.list data/dev.data.list
+  for model in llm; do
+    torchrun --nnodes=1 --nproc_per_node=$num_gpus \
+      --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+      cosyvoice/bin/train.py \
+      --train_engine $train_engine \
+      --config conf/cosyvoice.yaml \
+      --train_data data/train.data.list \
+      --cv_data data/dev.data.list \
+      --model $model \
+      --checkpoint $pretrained_model_dir/$model.pt \
+      --model_dir `pwd`/exp/cosyvoice/$model/$train_engine \
+      --tensorboard_dir `pwd`/tensorboard/cosyvoice/$model/$train_engine \
+      --ddp.dist_backend $dist_backend \
+      --num_workers ${num_workers} \
+      --prefetch ${prefetch} \
+      --pin_memory \
+      --deepspeed_config ./conf/ds_stage2.json \
+      --deepspeed.save_states model+optimizer
+  done
+fi
examples/magicdata-read/cosyvoice/tools ADDED
@@ -0,0 +1 @@
+../../../tools
examples/magicdata-read/cosyvoice/tts_text.json ADDED
@@ -0,0 +1,18 @@
+{
+    "38_5718_20170915093303": [
+        "我想这出最好歌曲把歌词发到网上请别人帮我作曲急急",
+        "叫他明天早上差五分儿九点去机场"
+    ],
+    "38_5721_20170915091235": [
+        "变温室调到零下两度档",
+        "交谈中请勿轻信汇款信息陌生电话请勿使用外挂软件"
+    ],
+    "38_5733_20170915130323": [
+        "这是老鹰乐队的一首经典歌曲",
+        "我急用这段音乐我自己找到一段但是有现场杂音"
+    ],
+    "38_5836_20170916221414": [
+        "给我播一个陶喆的专辑",
+        "这套餐好贵呀我发这么多短信贵死了"
+    ]
+}
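
Each key is a prompt utterance id that must exist in `--prompt_data` (see stage 4 of `run.sh` above); each value lists the sentences to synthesize with that prompt's voice. A tiny loading sketch:

```python
# Sketch: tts_text.json maps prompt utt ids to the sentences synthesized
# with that prompt's voice; ids must exist in --prompt_data (run.sh, stage 4).
import json

with open('tts_text.json', 'r') as f:
    tts_text = json.load(f)
for utt, texts in tts_text.items():
    print(utt, '->', len(texts), 'sentences')
```
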