kevinwang676 commited on
Commit
fe5241b
·
verified ·
1 Parent(s): 384dc5c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gradio/certificate.pem +31 -0
  2. cosyvoice/utils/common.py +166 -0
  3. cosyvoice/utils/losses.py +20 -0
  4. examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +257 -0
  5. examples/libritts/cosyvoice/conf/ds_stage2.json +42 -0
  6. examples/libritts/cosyvoice/tts_text.json +5 -0
  7. examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml +203 -0
  8. examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml +203 -0
  9. runtime/python/fastapi/client.py +92 -0
  10. third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md +22 -0
  11. third_party/Matcha-TTS/.github/dependabot.yml +17 -0
  12. third_party/Matcha-TTS/.github/release-drafter.yml +44 -0
  13. third_party/Matcha-TTS/.pylintrc +525 -0
  14. third_party/Matcha-TTS/LICENSE +21 -0
  15. third_party/Matcha-TTS/MANIFEST.in +14 -0
  16. third_party/Matcha-TTS/README.md +278 -0
  17. third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml +17 -0
  18. third_party/Matcha-TTS/configs/callbacks/none.yaml +0 -0
  19. third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml +4 -0
  20. third_party/Matcha-TTS/configs/debug/default.yaml +35 -0
  21. third_party/Matcha-TTS/configs/debug/overfit.yaml +13 -0
  22. third_party/Matcha-TTS/configs/debug/profiler.yaml +15 -0
  23. third_party/Matcha-TTS/configs/eval.yaml +18 -0
  24. third_party/Matcha-TTS/configs/experiment/ljspeech.yaml +14 -0
  25. third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml +18 -0
  26. third_party/Matcha-TTS/configs/experiment/multispeaker.yaml +14 -0
  27. third_party/Matcha-TTS/configs/extras/default.yaml +8 -0
  28. third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml +52 -0
  29. third_party/Matcha-TTS/configs/local/.gitkeep +0 -0
  30. third_party/Matcha-TTS/configs/logger/aim.yaml +28 -0
  31. third_party/Matcha-TTS/configs/logger/tensorboard.yaml +10 -0
  32. third_party/Matcha-TTS/configs/logger/wandb.yaml +16 -0
  33. third_party/Matcha-TTS/configs/model/cfm/default.yaml +3 -0
  34. third_party/Matcha-TTS/configs/model/decoder/default.yaml +7 -0
  35. third_party/Matcha-TTS/configs/model/encoder/default.yaml +18 -0
  36. third_party/Matcha-TTS/configs/model/matcha.yaml +15 -0
  37. third_party/Matcha-TTS/configs/model/optimizer/adam.yaml +4 -0
  38. third_party/Matcha-TTS/configs/paths/default.yaml +18 -0
  39. third_party/Matcha-TTS/configs/train.yaml +51 -0
  40. third_party/Matcha-TTS/configs/trainer/ddp.yaml +9 -0
  41. third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml +7 -0
  42. third_party/Matcha-TTS/configs/trainer/gpu.yaml +5 -0
  43. third_party/Matcha-TTS/configs/trainer/mps.yaml +5 -0
  44. third_party/Matcha-TTS/matcha/VERSION +1 -0
  45. third_party/Matcha-TTS/matcha/__init__.py +0 -0
  46. third_party/Matcha-TTS/matcha/cli.py +418 -0
  47. third_party/Matcha-TTS/matcha/hifigan/LICENSE +21 -0
  48. third_party/Matcha-TTS/matcha/hifigan/meldataset.py +217 -0
  49. third_party/Matcha-TTS/matcha/onnx/infer.py +168 -0
  50. third_party/Matcha-TTS/matcha/text/__init__.py +53 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
cosyvoice/utils/common.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Unility functions for Transformer."""
17
+
18
+ import random
19
+ from typing import List
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+ IGNORE_ID = -1
25
+
26
+
27
+ def pad_list(xs: List[torch.Tensor], pad_value: int):
28
+ """Perform padding for the list of tensors.
29
+
30
+ Args:
31
+ xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
32
+ pad_value (float): Value for padding.
33
+
34
+ Returns:
35
+ Tensor: Padded tensor (B, Tmax, `*`).
36
+
37
+ Examples:
38
+ >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
39
+ >>> x
40
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
41
+ >>> pad_list(x, 0)
42
+ tensor([[1., 1., 1., 1.],
43
+ [1., 1., 0., 0.],
44
+ [1., 0., 0., 0.]])
45
+
46
+ """
47
+ max_len = max([len(item) for item in xs])
48
+ batchs = len(xs)
49
+ ndim = xs[0].ndim
50
+ if ndim == 1:
51
+ pad_res = torch.zeros(batchs,
52
+ max_len,
53
+ dtype=xs[0].dtype,
54
+ device=xs[0].device)
55
+ elif ndim == 2:
56
+ pad_res = torch.zeros(batchs,
57
+ max_len,
58
+ xs[0].shape[1],
59
+ dtype=xs[0].dtype,
60
+ device=xs[0].device)
61
+ elif ndim == 3:
62
+ pad_res = torch.zeros(batchs,
63
+ max_len,
64
+ xs[0].shape[1],
65
+ xs[0].shape[2],
66
+ dtype=xs[0].dtype,
67
+ device=xs[0].device)
68
+ else:
69
+ raise ValueError(f"Unsupported ndim: {ndim}")
70
+ pad_res.fill_(pad_value)
71
+ for i in range(batchs):
72
+ pad_res[i, :len(xs[i])] = xs[i]
73
+ return pad_res
74
+
75
+
76
+ def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
77
+ ignore_label: int) -> torch.Tensor:
78
+ """Calculate accuracy.
79
+
80
+ Args:
81
+ pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
82
+ pad_targets (LongTensor): Target label tensors (B, Lmax).
83
+ ignore_label (int): Ignore label id.
84
+
85
+ Returns:
86
+ torch.Tensor: Accuracy value (0.0 - 1.0).
87
+
88
+ """
89
+ pad_pred = pad_outputs.view(pad_targets.size(0), pad_targets.size(1),
90
+ pad_outputs.size(1)).argmax(2)
91
+ mask = pad_targets != ignore_label
92
+ numerator = torch.sum(
93
+ pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
94
+ denominator = torch.sum(mask)
95
+ return (numerator / denominator).detach()
96
+
97
+
98
+ def get_padding(kernel_size, dilation=1):
99
+ return int((kernel_size * dilation - dilation) / 2)
100
+
101
+
102
+ def init_weights(m, mean=0.0, std=0.01):
103
+ classname = m.__class__.__name__
104
+ if classname.find("Conv") != -1:
105
+ m.weight.data.normal_(mean, std)
106
+
107
+
108
+ # Repetition Aware Sampling in VALL-E 2
109
+ def ras_sampling(weighted_scores, decoded_tokens, sampling, top_p=0.8, top_k=25, win_size=10, tau_r=0.1):
110
+ top_ids = nucleus_sampling(weighted_scores, top_p=top_p, top_k=top_k)
111
+ rep_num = (torch.tensor(decoded_tokens[-win_size:]).to(weighted_scores.device) == top_ids).sum().item()
112
+ if rep_num >= win_size * tau_r:
113
+ top_ids = random_sampling(weighted_scores, decoded_tokens, sampling)
114
+ return top_ids
115
+
116
+
117
+ def nucleus_sampling(weighted_scores, top_p=0.8, top_k=25):
118
+ prob, indices = [], []
119
+ cum_prob = 0.0
120
+ sorted_value, sorted_idx = weighted_scores.softmax(dim=0).sort(descending=True, stable=True)
121
+ for i in range(len(sorted_idx)):
122
+ # sampling both top-p and numbers.
123
+ if cum_prob < top_p and len(prob) < top_k:
124
+ cum_prob += sorted_value[i]
125
+ prob.append(sorted_value[i])
126
+ indices.append(sorted_idx[i])
127
+ else:
128
+ break
129
+ prob = torch.tensor(prob).to(weighted_scores)
130
+ indices = torch.tensor(indices, dtype=torch.long).to(weighted_scores.device)
131
+ top_ids = indices[prob.multinomial(1, replacement=True)]
132
+ return top_ids
133
+
134
+
135
+ def random_sampling(weighted_scores, decoded_tokens, sampling):
136
+ top_ids = weighted_scores.softmax(dim=0).multinomial(1, replacement=True)
137
+ return top_ids
138
+
139
+
140
+ def fade_in_out(fade_in_mel, fade_out_mel, window):
141
+ device = fade_in_mel.device
142
+ fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
143
+ mel_overlap_len = int(window.shape[0] / 2)
144
+ if fade_in_mel.device == torch.device('cpu'):
145
+ fade_in_mel = fade_in_mel.clone()
146
+ fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
147
+ fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
148
+ return fade_in_mel.to(device)
149
+
150
+
151
+ def set_all_random_seed(seed):
152
+ random.seed(seed)
153
+ np.random.seed(seed)
154
+ torch.manual_seed(seed)
155
+ torch.cuda.manual_seed_all(seed)
156
+
157
+
158
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
159
+ assert mask.dtype == torch.bool
160
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
161
+ mask = mask.to(dtype)
162
+ # attention mask bias
163
+ # NOTE(Mddct): torch.finfo jit issues
164
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
165
+ mask = (1.0 - mask) * -1.0e+10
166
+ return mask
cosyvoice/utils/losses.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+
5
+ def tpr_loss(disc_real_outputs, disc_generated_outputs, tau):
6
+ loss = 0
7
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
8
+ m_DG = torch.median((dr - dg))
9
+ L_rel = torch.mean((((dr - dg) - m_DG) ** 2)[dr < dg + m_DG])
10
+ loss += tau - F.relu(tau - L_rel)
11
+ return loss
12
+
13
+
14
+ def mel_loss(real_speech, generated_speech, mel_transforms):
15
+ loss = 0
16
+ for transform in mel_transforms:
17
+ mel_r = transform(real_speech)
18
+ mel_g = transform(generated_speech)
19
+ loss += F.l1_loss(mel_g, mel_r)
20
+ return loss
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 22050
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1024
11
+ llm_output_size: 1024
12
+ spk_embed_dim: 192
13
+
14
+ # model params
15
+ # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
16
+ # for system/third_party class/function, we do not require this.
17
+ llm: !new:cosyvoice.llm.llm.TransformerLM
18
+ text_encoder_input_size: !ref <text_encoder_input_size>
19
+ llm_input_size: !ref <llm_input_size>
20
+ llm_output_size: !ref <llm_output_size>
21
+ text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
22
+ speech_token_size: 4096
23
+ length_normalized_loss: True
24
+ lsm_weight: 0
25
+ spk_embed_dim: !ref <spk_embed_dim>
26
+ text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
27
+ input_size: !ref <text_encoder_input_size>
28
+ output_size: 1024
29
+ attention_heads: 8
30
+ linear_units: 2048
31
+ num_blocks: 3
32
+ dropout_rate: 0.1
33
+ positional_dropout_rate: 0.1
34
+ attention_dropout_rate: 0.0
35
+ normalize_before: True
36
+ input_layer: 'linear'
37
+ pos_enc_layer_type: 'rel_pos_espnet'
38
+ selfattention_layer_type: 'rel_selfattn'
39
+ use_cnn_module: False
40
+ macaron_style: False
41
+ use_dynamic_chunk: False
42
+ use_dynamic_left_chunk: False
43
+ static_chunk_size: 1
44
+ llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
45
+ input_size: !ref <llm_input_size>
46
+ output_size: !ref <llm_output_size>
47
+ attention_heads: 8
48
+ linear_units: 2048
49
+ num_blocks: 7
50
+ dropout_rate: 0.1
51
+ positional_dropout_rate: 0.1
52
+ attention_dropout_rate: 0.0
53
+ input_layer: 'linear_legacy'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ static_chunk_size: 1
57
+ sampling: !name:cosyvoice.utils.common.ras_sampling
58
+ top_p: 0.8
59
+ top_k: 25
60
+ win_size: 10
61
+ tau_r: 0.1
62
+
63
+ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
64
+ input_size: 512
65
+ output_size: 80
66
+ spk_embed_dim: !ref <spk_embed_dim>
67
+ output_type: 'mel'
68
+ vocab_size: 4096
69
+ input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
70
+ only_mask_loss: True
71
+ encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
72
+ output_size: 512
73
+ attention_heads: 4
74
+ linear_units: 1024
75
+ num_blocks: 3
76
+ dropout_rate: 0.1
77
+ positional_dropout_rate: 0.1
78
+ attention_dropout_rate: 0.1
79
+ normalize_before: True
80
+ input_layer: 'linear'
81
+ pos_enc_layer_type: 'rel_pos_espnet'
82
+ selfattention_layer_type: 'rel_selfattn'
83
+ input_size: 512
84
+ use_cnn_module: False
85
+ macaron_style: False
86
+ length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
87
+ channels: 80
88
+ sampling_ratios: [1, 1, 1, 1]
89
+ decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
90
+ in_channels: 240
91
+ n_spks: 1
92
+ spk_emb_dim: 80
93
+ cfm_params: !new:omegaconf.DictConfig
94
+ content:
95
+ sigma_min: 1e-06
96
+ solver: 'euler'
97
+ t_scheduler: 'cosine'
98
+ training_cfg_rate: 0.2
99
+ inference_cfg_rate: 0.7
100
+ reg_loss_type: 'l1'
101
+ estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
102
+ in_channels: 320
103
+ out_channels: 80
104
+ channels: [256, 256]
105
+ dropout: 0.0
106
+ attention_head_dim: 64
107
+ n_blocks: 4
108
+ num_mid_blocks: 8
109
+ num_heads: 8
110
+ act_fn: 'gelu'
111
+
112
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
113
+ in_channels: 80
114
+ base_channels: 512
115
+ nb_harmonics: 8
116
+ sampling_rate: !ref <sample_rate>
117
+ nsf_alpha: 0.1
118
+ nsf_sigma: 0.003
119
+ nsf_voiced_threshold: 10
120
+ upsample_rates: [8, 8]
121
+ upsample_kernel_sizes: [16, 16]
122
+ istft_params:
123
+ n_fft: 16
124
+ hop_len: 4
125
+ resblock_kernel_sizes: [3, 7, 11]
126
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
127
+ source_resblock_kernel_sizes: [7, 11]
128
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
129
+ lrelu_slope: 0.1
130
+ audio_limit: 0.99
131
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
132
+ num_class: 1
133
+ in_channels: 80
134
+ cond_channels: 512
135
+
136
+ # gan related module
137
+ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
138
+ n_fft: 1024
139
+ num_mels: 80
140
+ sampling_rate: !ref <sample_rate>
141
+ hop_size: 256
142
+ win_size: 1024
143
+ fmin: 0
144
+ fmax: null
145
+ center: False
146
+ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
147
+ generator: !ref <hift>
148
+ discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
149
+ mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
150
+ mrd: !new:cosyvoice.hifigan.discriminator.MultiResolutionDiscriminator
151
+ mel_spec_transform: [
152
+ !ref <mel_spec_transform1>
153
+ ]
154
+
155
+ # processor functions
156
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
157
+ get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
158
+ multilingual: True
159
+ num_languages: 100
160
+ language: 'en'
161
+ task: 'transcribe'
162
+ allowed_special: 'all'
163
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
164
+ get_tokenizer: !ref <get_tokenizer>
165
+ allowed_special: !ref <allowed_special>
166
+ filter: !name:cosyvoice.dataset.processor.filter
167
+ max_length: 40960
168
+ min_length: 0
169
+ token_max_length: 200
170
+ token_min_length: 1
171
+ resample: !name:cosyvoice.dataset.processor.resample
172
+ resample_rate: !ref <sample_rate>
173
+ truncate: !name:cosyvoice.dataset.processor.truncate
174
+ truncate_length: 24576 # must be a multiplier of hop_size
175
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
176
+ n_fft: 1024
177
+ num_mels: 80
178
+ sampling_rate: !ref <sample_rate>
179
+ hop_size: 256
180
+ win_size: 1024
181
+ fmin: 0
182
+ fmax: 8000
183
+ center: False
184
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
185
+ feat_extractor: !ref <feat_extractor>
186
+ compute_f0: !name:cosyvoice.dataset.processor.compute_f0
187
+ sample_rate: !ref <sample_rate>
188
+ hop_size: 256
189
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
190
+ normalize: True
191
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
192
+ shuffle_size: 1000
193
+ sort: !name:cosyvoice.dataset.processor.sort
194
+ sort_size: 500 # sort_size should be less than shuffle_size
195
+ batch: !name:cosyvoice.dataset.processor.batch
196
+ batch_type: 'dynamic'
197
+ max_frames_in_batch: 12000
198
+ padding: !name:cosyvoice.dataset.processor.padding
199
+ use_spk_embedding: False # change to True during sft
200
+
201
+ # dataset processor pipeline
202
+ data_pipeline: [
203
+ !ref <parquet_opener>,
204
+ !ref <tokenize>,
205
+ !ref <filter>,
206
+ !ref <resample>,
207
+ !ref <compute_fbank>,
208
+ !ref <parse_embedding>,
209
+ !ref <shuffle>,
210
+ !ref <sort>,
211
+ !ref <batch>,
212
+ !ref <padding>,
213
+ ]
214
+ data_pipeline_gan: [
215
+ !ref <parquet_opener>,
216
+ !ref <tokenize>,
217
+ !ref <filter>,
218
+ !ref <resample>,
219
+ !ref <truncate>,
220
+ !ref <compute_fbank>,
221
+ !ref <compute_f0>,
222
+ !ref <parse_embedding>,
223
+ !ref <shuffle>,
224
+ !ref <sort>,
225
+ !ref <batch>,
226
+ !ref <padding>,
227
+ ]
228
+
229
+ # llm flow train conf
230
+ train_conf:
231
+ optim: adam
232
+ optim_conf:
233
+ lr: 0.002 # change to 0.001 if you want to train flow from scratch
234
+ scheduler: warmuplr
235
+ scheduler_conf:
236
+ warmup_steps: 25000
237
+ max_epoch: 200
238
+ grad_clip: 5
239
+ accum_grad: 2
240
+ log_interval: 100
241
+ save_per_step: -1
242
+
243
+ # gan train conf
244
+ train_conf_gan:
245
+ optim: adam
246
+ optim_conf:
247
+ lr: 0.0002 # use small lr for gan training
248
+ scheduler: constantlr
249
+ optim_d: adam
250
+ optim_conf_d:
251
+ lr: 0.0002 # use small lr for gan training
252
+ scheduler_d: constantlr
253
+ max_epoch: 200
254
+ grad_clip: 5
255
+ accum_grad: 1 # in gan training, accum_grad must be 1
256
+ log_interval: 100
257
+ save_per_step: -1
examples/libritts/cosyvoice/conf/ds_stage2.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 1,
4
+ "steps_per_print": 100,
5
+ "gradient_clipping": 5,
6
+ "fp16": {
7
+ "enabled": false,
8
+ "auto_cast": false,
9
+ "loss_scale": 0,
10
+ "initial_scale_power": 16,
11
+ "loss_scale_window": 256,
12
+ "hysteresis": 2,
13
+ "consecutive_hysteresis": false,
14
+ "min_loss_scale": 1
15
+ },
16
+ "bf16": {
17
+ "enabled": false
18
+ },
19
+ "zero_force_ds_cpu_optimizer": false,
20
+ "zero_optimization": {
21
+ "stage": 2,
22
+ "offload_optimizer": {
23
+ "device": "none",
24
+ "pin_memory": true
25
+ },
26
+ "allgather_partitions": true,
27
+ "allgather_bucket_size": 5e8,
28
+ "overlap_comm": false,
29
+ "reduce_scatter": true,
30
+ "reduce_bucket_size": 5e8,
31
+ "contiguous_gradients" : true
32
+ },
33
+ "optimizer": {
34
+ "type": "AdamW",
35
+ "params": {
36
+ "lr": 0.001,
37
+ "weight_decay": 0.0001,
38
+ "torch_adam": true,
39
+ "adam_w_mode": true
40
+ }
41
+ }
42
+ }
examples/libritts/cosyvoice/tts_text.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "1089_134686_000002_000000": [
3
+ "hello, my name is Jack. What is your name?"
4
+ ]
5
+ }
examples/magicdata-read/cosyvoice/conf/cosyvoice.fromscratch.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 22050
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1024
11
+ llm_output_size: 1024
12
+ spk_embed_dim: 192
13
+
14
+ # model params
15
+ # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
16
+ # for system/third_party class/function, we do not require this.
17
+ llm: !new:cosyvoice.llm.llm.TransformerLM
18
+ text_encoder_input_size: !ref <text_encoder_input_size>
19
+ llm_input_size: !ref <llm_input_size>
20
+ llm_output_size: !ref <llm_output_size>
21
+ text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
22
+ speech_token_size: 4096
23
+ length_normalized_loss: True
24
+ lsm_weight: 0
25
+ spk_embed_dim: !ref <spk_embed_dim>
26
+ text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
27
+ input_size: !ref <text_encoder_input_size>
28
+ output_size: 1024
29
+ attention_heads: 8
30
+ linear_units: 2048
31
+ num_blocks: 3
32
+ dropout_rate: 0.1
33
+ positional_dropout_rate: 0.1
34
+ attention_dropout_rate: 0.0
35
+ normalize_before: True
36
+ input_layer: 'linear'
37
+ pos_enc_layer_type: 'rel_pos_espnet'
38
+ selfattention_layer_type: 'rel_selfattn'
39
+ use_cnn_module: False
40
+ macaron_style: False
41
+ use_dynamic_chunk: False
42
+ use_dynamic_left_chunk: False
43
+ static_chunk_size: 1
44
+ llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
45
+ input_size: !ref <llm_input_size>
46
+ output_size: !ref <llm_output_size>
47
+ attention_heads: 8
48
+ linear_units: 2048
49
+ num_blocks: 7
50
+ dropout_rate: 0.1
51
+ positional_dropout_rate: 0.1
52
+ attention_dropout_rate: 0.0
53
+ input_layer: 'linear_legacy'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ static_chunk_size: 1
57
+ sampling: !name:cosyvoice.utils.common.ras_sampling
58
+ top_p: 0.8
59
+ top_k: 25
60
+ win_size: 10
61
+ tau_r: 0.1
62
+
63
+ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
64
+ input_size: 512
65
+ output_size: 80
66
+ spk_embed_dim: !ref <spk_embed_dim>
67
+ output_type: 'mel'
68
+ vocab_size: 4096
69
+ input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
70
+ only_mask_loss: True
71
+ encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
72
+ output_size: 512
73
+ attention_heads: 4
74
+ linear_units: 1024
75
+ num_blocks: 3
76
+ dropout_rate: 0.1
77
+ positional_dropout_rate: 0.1
78
+ attention_dropout_rate: 0.1
79
+ normalize_before: True
80
+ input_layer: 'linear'
81
+ pos_enc_layer_type: 'rel_pos_espnet'
82
+ selfattention_layer_type: 'rel_selfattn'
83
+ input_size: 512
84
+ use_cnn_module: False
85
+ macaron_style: False
86
+ length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
87
+ channels: 80
88
+ sampling_ratios: [1, 1, 1, 1]
89
+ decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
90
+ in_channels: 240
91
+ n_spks: 1
92
+ spk_emb_dim: 80
93
+ cfm_params: !new:omegaconf.DictConfig
94
+ content:
95
+ sigma_min: 1e-06
96
+ solver: 'euler'
97
+ t_scheduler: 'cosine'
98
+ training_cfg_rate: 0.2
99
+ inference_cfg_rate: 0.7
100
+ reg_loss_type: 'l1'
101
+ estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
102
+ in_channels: 320
103
+ out_channels: 80
104
+ channels: [256, 256]
105
+ dropout: 0.0
106
+ attention_head_dim: 64
107
+ n_blocks: 4
108
+ num_mid_blocks: 8
109
+ num_heads: 8
110
+ act_fn: 'gelu'
111
+
112
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
113
+ in_channels: 80
114
+ base_channels: 512
115
+ nb_harmonics: 8
116
+ sampling_rate: !ref <sample_rate>
117
+ nsf_alpha: 0.1
118
+ nsf_sigma: 0.003
119
+ nsf_voiced_threshold: 10
120
+ upsample_rates: [8, 8]
121
+ upsample_kernel_sizes: [16, 16]
122
+ istft_params:
123
+ n_fft: 16
124
+ hop_len: 4
125
+ resblock_kernel_sizes: [3, 7, 11]
126
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
127
+ source_resblock_kernel_sizes: [7, 11]
128
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
129
+ lrelu_slope: 0.1
130
+ audio_limit: 0.99
131
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
132
+ num_class: 1
133
+ in_channels: 80
134
+ cond_channels: 512
135
+
136
+ # processor functions
137
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
138
+ get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
139
+ multilingual: True
140
+ num_languages: 100
141
+ language: 'en'
142
+ task: 'transcribe'
143
+ allowed_special: 'all'
144
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
145
+ get_tokenizer: !ref <get_tokenizer>
146
+ allowed_special: !ref <allowed_special>
147
+ filter: !name:cosyvoice.dataset.processor.filter
148
+ max_length: 40960
149
+ min_length: 0
150
+ token_max_length: 200
151
+ token_min_length: 1
152
+ resample: !name:cosyvoice.dataset.processor.resample
153
+ resample_rate: !ref <sample_rate>
154
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
155
+ n_fft: 1024
156
+ num_mels: 80
157
+ sampling_rate: !ref <sample_rate>
158
+ hop_size: 256
159
+ win_size: 1024
160
+ fmin: 0
161
+ fmax: 8000
162
+ center: False
163
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
164
+ feat_extractor: !ref <feat_extractor>
165
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
166
+ normalize: True
167
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
168
+ shuffle_size: 1000
169
+ sort: !name:cosyvoice.dataset.processor.sort
170
+ sort_size: 500 # sort_size should be less than shuffle_size
171
+ batch: !name:cosyvoice.dataset.processor.batch
172
+ batch_type: 'dynamic'
173
+ max_frames_in_batch: 12000
174
+ padding: !name:cosyvoice.dataset.processor.padding
175
+ use_spk_embedding: False # change to True during sft
176
+
177
+ # dataset processor pipeline
178
+ data_pipeline: [
179
+ !ref <parquet_opener>,
180
+ !ref <tokenize>,
181
+ !ref <filter>,
182
+ !ref <resample>,
183
+ !ref <compute_fbank>,
184
+ !ref <parse_embedding>,
185
+ !ref <shuffle>,
186
+ !ref <sort>,
187
+ !ref <batch>,
188
+ !ref <padding>,
189
+ ]
190
+
191
+ # train conf
192
+ train_conf:
193
+ optim: adam
194
+ optim_conf:
195
+ lr: 0.002 # change to 0.001 if you want to train flow from scratch
196
+ scheduler: warmuplr
197
+ scheduler_conf:
198
+ warmup_steps: 25000
199
+ max_epoch: 200
200
+ grad_clip: 5
201
+ accum_grad: 2
202
+ log_interval: 100
203
+ save_per_step: -1
examples/magicdata-read/cosyvoice/conf/cosyvoice.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1986]
3
+ __set_seed2: !apply:numpy.random.seed [1986]
4
+ __set_seed3: !apply:torch.manual_seed [1986]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
+
7
+ # fixed params
8
+ sample_rate: 22050
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1024
11
+ llm_output_size: 1024
12
+ spk_embed_dim: 192
13
+
14
+ # model params
15
+ # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
16
+ # for system/third_party class/function, we do not require this.
17
+ llm: !new:cosyvoice.llm.llm.TransformerLM
18
+ text_encoder_input_size: !ref <text_encoder_input_size>
19
+ llm_input_size: !ref <llm_input_size>
20
+ llm_output_size: !ref <llm_output_size>
21
+ text_token_size: 51866 # change to 60515 if you want to train with CosyVoice-300M-25Hz recipe
22
+ speech_token_size: 4096
23
+ length_normalized_loss: True
24
+ lsm_weight: 0
25
+ spk_embed_dim: !ref <spk_embed_dim>
26
+ text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
27
+ input_size: !ref <text_encoder_input_size>
28
+ output_size: 1024
29
+ attention_heads: 16
30
+ linear_units: 4096
31
+ num_blocks: 6
32
+ dropout_rate: 0.1
33
+ positional_dropout_rate: 0.1
34
+ attention_dropout_rate: 0.0
35
+ normalize_before: True
36
+ input_layer: 'linear'
37
+ pos_enc_layer_type: 'rel_pos_espnet'
38
+ selfattention_layer_type: 'rel_selfattn'
39
+ use_cnn_module: False
40
+ macaron_style: False
41
+ use_dynamic_chunk: False
42
+ use_dynamic_left_chunk: False
43
+ static_chunk_size: 1
44
+ llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
45
+ input_size: !ref <llm_input_size>
46
+ output_size: !ref <llm_output_size>
47
+ attention_heads: 16
48
+ linear_units: 4096
49
+ num_blocks: 14
50
+ dropout_rate: 0.1
51
+ positional_dropout_rate: 0.1
52
+ attention_dropout_rate: 0.0
53
+ input_layer: 'linear_legacy'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ static_chunk_size: 1
57
+ sampling: !name:cosyvoice.utils.common.ras_sampling
58
+ top_p: 0.8
59
+ top_k: 25
60
+ win_size: 10
61
+ tau_r: 0.1
62
+
63
+ flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
64
+ input_size: 512
65
+ output_size: 80
66
+ spk_embed_dim: !ref <spk_embed_dim>
67
+ output_type: 'mel'
68
+ vocab_size: 4096
69
+ input_frame_rate: 50 # change to 25 if you want to train with CosyVoice-300M-25Hz recipe
70
+ only_mask_loss: True
71
+ encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
72
+ output_size: 512
73
+ attention_heads: 8
74
+ linear_units: 2048
75
+ num_blocks: 6
76
+ dropout_rate: 0.1
77
+ positional_dropout_rate: 0.1
78
+ attention_dropout_rate: 0.1
79
+ normalize_before: True
80
+ input_layer: 'linear'
81
+ pos_enc_layer_type: 'rel_pos_espnet'
82
+ selfattention_layer_type: 'rel_selfattn'
83
+ input_size: 512
84
+ use_cnn_module: False
85
+ macaron_style: False
86
+ length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
87
+ channels: 80
88
+ sampling_ratios: [1, 1, 1, 1]
89
+ decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
90
+ in_channels: 240
91
+ n_spks: 1
92
+ spk_emb_dim: 80
93
+ cfm_params: !new:omegaconf.DictConfig
94
+ content:
95
+ sigma_min: 1e-06
96
+ solver: 'euler'
97
+ t_scheduler: 'cosine'
98
+ training_cfg_rate: 0.2
99
+ inference_cfg_rate: 0.7
100
+ reg_loss_type: 'l1'
101
+ estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
102
+ in_channels: 320
103
+ out_channels: 80
104
+ channels: [256, 256]
105
+ dropout: 0.0
106
+ attention_head_dim: 64
107
+ n_blocks: 4
108
+ num_mid_blocks: 12
109
+ num_heads: 8
110
+ act_fn: 'gelu'
111
+
112
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
113
+ in_channels: 80
114
+ base_channels: 512
115
+ nb_harmonics: 8
116
+ sampling_rate: !ref <sample_rate>
117
+ nsf_alpha: 0.1
118
+ nsf_sigma: 0.003
119
+ nsf_voiced_threshold: 10
120
+ upsample_rates: [8, 8]
121
+ upsample_kernel_sizes: [16, 16]
122
+ istft_params:
123
+ n_fft: 16
124
+ hop_len: 4
125
+ resblock_kernel_sizes: [3, 7, 11]
126
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
127
+ source_resblock_kernel_sizes: [7, 11]
128
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
129
+ lrelu_slope: 0.1
130
+ audio_limit: 0.99
131
+ f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
132
+ num_class: 1
133
+ in_channels: 80
134
+ cond_channels: 512
135
+
136
+ # processor functions
137
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
138
+ get_tokenizer: !name:whisper.tokenizer.get_tokenizer # change to !name:cosyvoice.tokenizer.tokenizer.get_tokenizer if you want to train with CosyVoice-300M-25Hz recipe
139
+ multilingual: True
140
+ num_languages: 100
141
+ language: 'en'
142
+ task: 'transcribe'
143
+ allowed_special: 'all'
144
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
145
+ get_tokenizer: !ref <get_tokenizer>
146
+ allowed_special: !ref <allowed_special>
147
+ filter: !name:cosyvoice.dataset.processor.filter
148
+ max_length: 40960
149
+ min_length: 0
150
+ token_max_length: 200
151
+ token_min_length: 1
152
+ resample: !name:cosyvoice.dataset.processor.resample
153
+ resample_rate: !ref <sample_rate>
154
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
155
+ n_fft: 1024
156
+ num_mels: 80
157
+ sampling_rate: !ref <sample_rate>
158
+ hop_size: 256
159
+ win_size: 1024
160
+ fmin: 0
161
+ fmax: 8000
162
+ center: False
163
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
164
+ feat_extractor: !ref <feat_extractor>
165
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
166
+ normalize: True
167
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
168
+ shuffle_size: 1000
169
+ sort: !name:cosyvoice.dataset.processor.sort
170
+ sort_size: 500 # sort_size should be less than shuffle_size
171
+ batch: !name:cosyvoice.dataset.processor.batch
172
+ batch_type: 'dynamic'
173
+ max_frames_in_batch: 2000
174
+ padding: !name:cosyvoice.dataset.processor.padding
175
+ use_spk_embedding: False # change to True during sft
176
+
177
+ # dataset processor pipeline
178
+ data_pipeline: [
179
+ !ref <parquet_opener>,
180
+ !ref <tokenize>,
181
+ !ref <filter>,
182
+ !ref <resample>,
183
+ !ref <compute_fbank>,
184
+ !ref <parse_embedding>,
185
+ !ref <shuffle>,
186
+ !ref <sort>,
187
+ !ref <batch>,
188
+ !ref <padding>,
189
+ ]
190
+
191
+ # train conf
192
+ train_conf:
193
+ optim: adam
194
+ optim_conf:
195
+ lr: 0.001 # change to 1e-5 during sft
196
+ scheduler: warmuplr # change to constantlr during sft
197
+ scheduler_conf:
198
+ warmup_steps: 2500
199
+ max_epoch: 200
200
+ grad_clip: 5
201
+ accum_grad: 2
202
+ log_interval: 100
203
+ save_per_step: -1
runtime/python/fastapi/client.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import logging
16
+ import requests
17
+ import torch
18
+ import torchaudio
19
+ import numpy as np
20
+
21
+
22
+ def main():
23
+ url = "http://{}:{}/inference_{}".format(args.host, args.port, args.mode)
24
+ if args.mode == 'sft':
25
+ payload = {
26
+ 'tts_text': args.tts_text,
27
+ 'spk_id': args.spk_id
28
+ }
29
+ response = requests.request("GET", url, data=payload, stream=True)
30
+ elif args.mode == 'zero_shot':
31
+ payload = {
32
+ 'tts_text': args.tts_text,
33
+ 'prompt_text': args.prompt_text
34
+ }
35
+ files = [('prompt_wav', ('prompt_wav', open(args.prompt_wav, 'rb'), 'application/octet-stream'))]
36
+ response = requests.request("GET", url, data=payload, files=files, stream=True)
37
+ elif args.mode == 'cross_lingual':
38
+ payload = {
39
+ 'tts_text': args.tts_text,
40
+ }
41
+ files = [('prompt_wav', ('prompt_wav', open(args.prompt_wav, 'rb'), 'application/octet-stream'))]
42
+ response = requests.request("GET", url, data=payload, files=files, stream=True)
43
+ else:
44
+ payload = {
45
+ 'tts_text': args.tts_text,
46
+ 'spk_id': args.spk_id,
47
+ 'instruct_text': args.instruct_text
48
+ }
49
+ response = requests.request("GET", url, data=payload, stream=True)
50
+ tts_audio = b''
51
+ for r in response.iter_content(chunk_size=16000):
52
+ tts_audio += r
53
+ tts_speech = torch.from_numpy(np.array(np.frombuffer(tts_audio, dtype=np.int16))).unsqueeze(dim=0)
54
+ logging.info('save response to {}'.format(args.tts_wav))
55
+ torchaudio.save(args.tts_wav, tts_speech, target_sr)
56
+ logging.info('get response')
57
+
58
+
59
+ if __name__ == "__main__":
60
+ parser = argparse.ArgumentParser()
61
+ parser.add_argument('--host',
62
+ type=str,
63
+ default='0.0.0.0')
64
+ parser.add_argument('--port',
65
+ type=int,
66
+ default='50000')
67
+ parser.add_argument('--mode',
68
+ default='sft',
69
+ choices=['sft', 'zero_shot', 'cross_lingual', 'instruct'],
70
+ help='request mode')
71
+ parser.add_argument('--tts_text',
72
+ type=str,
73
+ default='你好,我是通义千问语音合成大模型,请问有什么可以帮您的吗?')
74
+ parser.add_argument('--spk_id',
75
+ type=str,
76
+ default='中文女')
77
+ parser.add_argument('--prompt_text',
78
+ type=str,
79
+ default='希望你以后能够做的比我还好呦。')
80
+ parser.add_argument('--prompt_wav',
81
+ type=str,
82
+ default='../../../asset/zero_shot_prompt.wav')
83
+ parser.add_argument('--instruct_text',
84
+ type=str,
85
+ default='Theo \'Crimson\', is a fiery, passionate rebel leader. \
86
+ Fights with fervor for justice, but struggles with impulsiveness.')
87
+ parser.add_argument('--tts_wav',
88
+ type=str,
89
+ default='demo.wav')
90
+ args = parser.parse_args()
91
+ prompt_sr, target_sr = 16000, 22050
92
+ main()
third_party/Matcha-TTS/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## What does this PR do?
2
+
3
+ <!--
4
+ Please include a summary of the change and which issue is fixed.
5
+ Please also include relevant motivation and context.
6
+ List any dependencies that are required for this change.
7
+ List all the breaking changes introduced by this pull request.
8
+ -->
9
+
10
+ Fixes #\<issue_number>
11
+
12
+ ## Before submitting
13
+
14
+ - [ ] Did you make sure **title is self-explanatory** and **the description concisely explains the PR**?
15
+ - [ ] Did you make sure your **PR does only one thing**, instead of bundling different changes together?
16
+ - [ ] Did you list all the **breaking changes** introduced by this pull request?
17
+ - [ ] Did you **test your PR locally** with `pytest` command?
18
+ - [ ] Did you **run pre-commit hooks** with `pre-commit run -a` command?
19
+
20
+ ## Did you have fun?
21
+
22
+ Make sure you had fun coding 🙃
third_party/Matcha-TTS/.github/dependabot.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "pip" # See documentation for possible values
9
+ directory: "/" # Location of package manifests
10
+ target-branch: "dev"
11
+ schedule:
12
+ interval: "daily"
13
+ ignore:
14
+ - dependency-name: "pytorch-lightning"
15
+ update-types: ["version-update:semver-patch"]
16
+ - dependency-name: "torchmetrics"
17
+ update-types: ["version-update:semver-patch"]
third_party/Matcha-TTS/.github/release-drafter.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name-template: "v$RESOLVED_VERSION"
2
+ tag-template: "v$RESOLVED_VERSION"
3
+
4
+ categories:
5
+ - title: "🚀 Features"
6
+ labels:
7
+ - "feature"
8
+ - "enhancement"
9
+ - title: "🐛 Bug Fixes"
10
+ labels:
11
+ - "fix"
12
+ - "bugfix"
13
+ - "bug"
14
+ - title: "🧹 Maintenance"
15
+ labels:
16
+ - "maintenance"
17
+ - "dependencies"
18
+ - "refactoring"
19
+ - "cosmetic"
20
+ - "chore"
21
+ - title: "📝️ Documentation"
22
+ labels:
23
+ - "documentation"
24
+ - "docs"
25
+
26
+ change-template: "- $TITLE @$AUTHOR (#$NUMBER)"
27
+ change-title-escapes: '\<*_&' # You can add # and @ to disable mentions
28
+
29
+ version-resolver:
30
+ major:
31
+ labels:
32
+ - "major"
33
+ minor:
34
+ labels:
35
+ - "minor"
36
+ patch:
37
+ labels:
38
+ - "patch"
39
+ default: patch
40
+
41
+ template: |
42
+ ## Changes
43
+
44
+ $CHANGES
third_party/Matcha-TTS/.pylintrc ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [MASTER]
2
+
3
+ # A comma-separated list of package or module names from where C extensions may
4
+ # be loaded. Extensions are loading into the active Python interpreter and may
5
+ # run arbitrary code.
6
+ extension-pkg-whitelist=
7
+
8
+ # Add files or directories to the blacklist. They should be base names, not
9
+ # paths.
10
+ ignore=CVS
11
+
12
+ # Add files or directories matching the regex patterns to the blacklist. The
13
+ # regex matches against base names, not paths.
14
+ ignore-patterns=
15
+
16
+ # Python code to execute, usually for sys.path manipulation such as
17
+ # pygtk.require().
18
+ #init-hook=
19
+
20
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21
+ # number of processors available to use.
22
+ jobs=1
23
+
24
+ # Control the amount of potential inferred values when inferring a single
25
+ # object. This can help the performance when dealing with large functions or
26
+ # complex, nested conditions.
27
+ limit-inference-results=100
28
+
29
+ # List of plugins (as comma separated values of python modules names) to load,
30
+ # usually to register additional checkers.
31
+ load-plugins=
32
+
33
+ # Pickle collected data for later comparisons.
34
+ persistent=yes
35
+
36
+ # Specify a configuration file.
37
+ #rcfile=
38
+
39
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
40
+ # user-friendly hints instead of false-positive error messages.
41
+ suggestion-mode=yes
42
+
43
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
44
+ # active Python interpreter and may run arbitrary code.
45
+ unsafe-load-any-extension=no
46
+
47
+
48
+ [MESSAGES CONTROL]
49
+
50
+ # Only show warnings with the listed confidence levels. Leave empty to show
51
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
52
+ confidence=
53
+
54
+ # Disable the message, report, category or checker with the given id(s). You
55
+ # can either give multiple identifiers separated by comma (,) or put this
56
+ # option multiple times (only on the command line, not in the configuration
57
+ # file where it should appear only once). You can also use "--disable=all" to
58
+ # disable everything first and then reenable specific checks. For example, if
59
+ # you want to run only the similarities checker, you can use "--disable=all
60
+ # --enable=similarities". If you want to run only the classes checker, but have
61
+ # no Warning level messages displayed, use "--disable=all --enable=classes
62
+ # --disable=W".
63
+ disable=missing-docstring,
64
+ too-many-public-methods,
65
+ too-many-lines,
66
+ bare-except,
67
+ ## for avoiding weird p3.6 CI linter error
68
+ ## TODO: see later if we can remove this
69
+ assigning-non-slot,
70
+ unsupported-assignment-operation,
71
+ ## end
72
+ line-too-long,
73
+ fixme,
74
+ wrong-import-order,
75
+ ungrouped-imports,
76
+ wrong-import-position,
77
+ import-error,
78
+ invalid-name,
79
+ too-many-instance-attributes,
80
+ arguments-differ,
81
+ arguments-renamed,
82
+ no-name-in-module,
83
+ no-member,
84
+ unsubscriptable-object,
85
+ raw-checker-failed,
86
+ bad-inline-option,
87
+ locally-disabled,
88
+ file-ignored,
89
+ suppressed-message,
90
+ useless-suppression,
91
+ deprecated-pragma,
92
+ use-symbolic-message-instead,
93
+ useless-object-inheritance,
94
+ too-few-public-methods,
95
+ too-many-branches,
96
+ too-many-arguments,
97
+ too-many-locals,
98
+ too-many-statements,
99
+ duplicate-code,
100
+ not-callable,
101
+ import-outside-toplevel,
102
+ logging-fstring-interpolation,
103
+ logging-not-lazy,
104
+ unused-argument,
105
+ no-else-return,
106
+ chained-comparison,
107
+ redefined-outer-name
108
+
109
+ # Enable the message, report, category or checker with the given id(s). You can
110
+ # either give multiple identifier separated by comma (,) or put this option
111
+ # multiple time (only on the command line, not in the configuration file where
112
+ # it should appear only once). See also the "--disable" option for examples.
113
+ enable=c-extension-no-member
114
+
115
+
116
+ [REPORTS]
117
+
118
+ # Python expression which should return a note less than 10 (10 is the highest
119
+ # note). You have access to the variables errors warning, statement which
120
+ # respectively contain the number of errors / warnings messages and the total
121
+ # number of statements analyzed. This is used by the global evaluation report
122
+ # (RP0004).
123
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
124
+
125
+ # Template used to display messages. This is a python new-style format string
126
+ # used to format the message information. See doc for all details.
127
+ #msg-template=
128
+
129
+ # Set the output format. Available formats are text, parseable, colorized, json
130
+ # and msvs (visual studio). You can also give a reporter class, e.g.
131
+ # mypackage.mymodule.MyReporterClass.
132
+ output-format=text
133
+
134
+ # Tells whether to display a full report or only the messages.
135
+ reports=no
136
+
137
+ # Activate the evaluation score.
138
+ score=yes
139
+
140
+
141
+ [REFACTORING]
142
+
143
+ # Maximum number of nested blocks for function / method body
144
+ max-nested-blocks=5
145
+
146
+ # Complete name of functions that never returns. When checking for
147
+ # inconsistent-return-statements if a never returning function is called then
148
+ # it will be considered as an explicit return statement and no message will be
149
+ # printed.
150
+ never-returning-functions=sys.exit
151
+
152
+
153
+ [LOGGING]
154
+
155
+ # Format style used to check logging format string. `old` means using %
156
+ # formatting, while `new` is for `{}` formatting.
157
+ logging-format-style=old
158
+
159
+ # Logging modules to check that the string format arguments are in logging
160
+ # function parameter format.
161
+ logging-modules=logging
162
+
163
+
164
+ [SPELLING]
165
+
166
+ # Limits count of emitted suggestions for spelling mistakes.
167
+ max-spelling-suggestions=4
168
+
169
+ # Spelling dictionary name. Available dictionaries: none. To make it working
170
+ # install python-enchant package..
171
+ spelling-dict=
172
+
173
+ # List of comma separated words that should not be checked.
174
+ spelling-ignore-words=
175
+
176
+ # A path to a file that contains private dictionary; one word per line.
177
+ spelling-private-dict-file=
178
+
179
+ # Tells whether to store unknown words to indicated private dictionary in
180
+ # --spelling-private-dict-file option instead of raising a message.
181
+ spelling-store-unknown-words=no
182
+
183
+
184
+ [MISCELLANEOUS]
185
+
186
+ # List of note tags to take in consideration, separated by a comma.
187
+ notes=FIXME,
188
+ XXX,
189
+ TODO
190
+
191
+
192
+ [TYPECHECK]
193
+
194
+ # List of decorators that produce context managers, such as
195
+ # contextlib.contextmanager. Add to this list to register other decorators that
196
+ # produce valid context managers.
197
+ contextmanager-decorators=contextlib.contextmanager
198
+
199
+ # List of members which are set dynamically and missed by pylint inference
200
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
201
+ # expressions are accepted.
202
+ generated-members=numpy.*,torch.*
203
+
204
+ # Tells whether missing members accessed in mixin class should be ignored. A
205
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
206
+ ignore-mixin-members=yes
207
+
208
+ # Tells whether to warn about missing members when the owner of the attribute
209
+ # is inferred to be None.
210
+ ignore-none=yes
211
+
212
+ # This flag controls whether pylint should warn about no-member and similar
213
+ # checks whenever an opaque object is returned when inferring. The inference
214
+ # can return multiple potential results while evaluating a Python object, but
215
+ # some branches might not be evaluated, which results in partial inference. In
216
+ # that case, it might be useful to still emit no-member and other checks for
217
+ # the rest of the inferred objects.
218
+ ignore-on-opaque-inference=yes
219
+
220
+ # List of class names for which member attributes should not be checked (useful
221
+ # for classes with dynamically set attributes). This supports the use of
222
+ # qualified names.
223
+ ignored-classes=optparse.Values,thread._local,_thread._local
224
+
225
+ # List of module names for which member attributes should not be checked
226
+ # (useful for modules/projects where namespaces are manipulated during runtime
227
+ # and thus existing member attributes cannot be deduced by static analysis. It
228
+ # supports qualified module names, as well as Unix pattern matching.
229
+ ignored-modules=
230
+
231
+ # Show a hint with possible names when a member name was not found. The aspect
232
+ # of finding the hint is based on edit distance.
233
+ missing-member-hint=yes
234
+
235
+ # The minimum edit distance a name should have in order to be considered a
236
+ # similar match for a missing member name.
237
+ missing-member-hint-distance=1
238
+
239
+ # The total number of similar names that should be taken in consideration when
240
+ # showing a hint for a missing member.
241
+ missing-member-max-choices=1
242
+
243
+
244
+ [VARIABLES]
245
+
246
+ # List of additional names supposed to be defined in builtins. Remember that
247
+ # you should avoid defining new builtins when possible.
248
+ additional-builtins=
249
+
250
+ # Tells whether unused global variables should be treated as a violation.
251
+ allow-global-unused-variables=yes
252
+
253
+ # List of strings which can identify a callback function by name. A callback
254
+ # name must start or end with one of those strings.
255
+ callbacks=cb_,
256
+ _cb
257
+
258
+ # A regular expression matching the name of dummy variables (i.e. expected to
259
+ # not be used).
260
+ dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
261
+
262
+ # Argument names that match this expression will be ignored. Default to name
263
+ # with leading underscore.
264
+ ignored-argument-names=_.*|^ignored_|^unused_
265
+
266
+ # Tells whether we should check for unused import in __init__ files.
267
+ init-import=no
268
+
269
+ # List of qualified module names which can have objects that can redefine
270
+ # builtins.
271
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
272
+
273
+
274
+ [FORMAT]
275
+
276
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
277
+ expected-line-ending-format=
278
+
279
+ # Regexp for a line that is allowed to be longer than the limit.
280
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
281
+
282
+ # Number of spaces of indent required inside a hanging or continued line.
283
+ indent-after-paren=4
284
+
285
+ # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
286
+ # tab).
287
+ indent-string=' '
288
+
289
+ # Maximum number of characters on a single line.
290
+ max-line-length=120
291
+
292
+ # Maximum number of lines in a module.
293
+ max-module-lines=1000
294
+
295
+ # Allow the body of a class to be on the same line as the declaration if body
296
+ # contains single statement.
297
+ single-line-class-stmt=no
298
+
299
+ # Allow the body of an if to be on the same line as the test if there is no
300
+ # else.
301
+ single-line-if-stmt=no
302
+
303
+
304
+ [SIMILARITIES]
305
+
306
+ # Ignore comments when computing similarities.
307
+ ignore-comments=yes
308
+
309
+ # Ignore docstrings when computing similarities.
310
+ ignore-docstrings=yes
311
+
312
+ # Ignore imports when computing similarities.
313
+ ignore-imports=no
314
+
315
+ # Minimum lines number of a similarity.
316
+ min-similarity-lines=4
317
+
318
+
319
+ [BASIC]
320
+
321
+ # Naming style matching correct argument names.
322
+ argument-naming-style=snake_case
323
+
324
+ # Regular expression matching correct argument names. Overrides argument-
325
+ # naming-style.
326
+ argument-rgx=[a-z_][a-z0-9_]{0,30}$
327
+
328
+ # Naming style matching correct attribute names.
329
+ attr-naming-style=snake_case
330
+
331
+ # Regular expression matching correct attribute names. Overrides attr-naming-
332
+ # style.
333
+ #attr-rgx=
334
+
335
+ # Bad variable names which should always be refused, separated by a comma.
336
+ bad-names=
337
+
338
+ # Naming style matching correct class attribute names.
339
+ class-attribute-naming-style=any
340
+
341
+ # Regular expression matching correct class attribute names. Overrides class-
342
+ # attribute-naming-style.
343
+ #class-attribute-rgx=
344
+
345
+ # Naming style matching correct class names.
346
+ class-naming-style=PascalCase
347
+
348
+ # Regular expression matching correct class names. Overrides class-naming-
349
+ # style.
350
+ #class-rgx=
351
+
352
+ # Naming style matching correct constant names.
353
+ const-naming-style=UPPER_CASE
354
+
355
+ # Regular expression matching correct constant names. Overrides const-naming-
356
+ # style.
357
+ #const-rgx=
358
+
359
+ # Minimum line length for functions/classes that require docstrings, shorter
360
+ # ones are exempt.
361
+ docstring-min-length=-1
362
+
363
+ # Naming style matching correct function names.
364
+ function-naming-style=snake_case
365
+
366
+ # Regular expression matching correct function names. Overrides function-
367
+ # naming-style.
368
+ #function-rgx=
369
+
370
+ # Good variable names which should always be accepted, separated by a comma.
371
+ good-names=i,
372
+ j,
373
+ k,
374
+ x,
375
+ ex,
376
+ Run,
377
+ _
378
+
379
+ # Include a hint for the correct naming format with invalid-name.
380
+ include-naming-hint=no
381
+
382
+ # Naming style matching correct inline iteration names.
383
+ inlinevar-naming-style=any
384
+
385
+ # Regular expression matching correct inline iteration names. Overrides
386
+ # inlinevar-naming-style.
387
+ #inlinevar-rgx=
388
+
389
+ # Naming style matching correct method names.
390
+ method-naming-style=snake_case
391
+
392
+ # Regular expression matching correct method names. Overrides method-naming-
393
+ # style.
394
+ #method-rgx=
395
+
396
+ # Naming style matching correct module names.
397
+ module-naming-style=snake_case
398
+
399
+ # Regular expression matching correct module names. Overrides module-naming-
400
+ # style.
401
+ #module-rgx=
402
+
403
+ # Colon-delimited sets of names that determine each other's naming style when
404
+ # the name regexes allow several styles.
405
+ name-group=
406
+
407
+ # Regular expression which should only match function or class names that do
408
+ # not require a docstring.
409
+ no-docstring-rgx=^_
410
+
411
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
412
+ # to this list to register other decorators that produce valid properties.
413
+ # These decorators are taken in consideration only for invalid-name.
414
+ property-classes=abc.abstractproperty
415
+
416
+ # Naming style matching correct variable names.
417
+ variable-naming-style=snake_case
418
+
419
+ # Regular expression matching correct variable names. Overrides variable-
420
+ # naming-style.
421
+ variable-rgx=[a-z_][a-z0-9_]{0,30}$
422
+
423
+
424
+ [STRING]
425
+
426
+ # This flag controls whether the implicit-str-concat-in-sequence should
427
+ # generate a warning on implicit string concatenation in sequences defined over
428
+ # several lines.
429
+ check-str-concat-over-line-jumps=no
430
+
431
+
432
+ [IMPORTS]
433
+
434
+ # Allow wildcard imports from modules that define __all__.
435
+ allow-wildcard-with-all=no
436
+
437
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
438
+ # 3 compatible code, which means that the block might have code that exists
439
+ # only in one or another interpreter, leading to false positives when analysed.
440
+ analyse-fallback-blocks=no
441
+
442
+ # Deprecated modules which should not be used, separated by a comma.
443
+ deprecated-modules=optparse,tkinter.tix
444
+
445
+ # Create a graph of external dependencies in the given file (report RP0402 must
446
+ # not be disabled).
447
+ ext-import-graph=
448
+
449
+ # Create a graph of every (i.e. internal and external) dependencies in the
450
+ # given file (report RP0402 must not be disabled).
451
+ import-graph=
452
+
453
+ # Create a graph of internal dependencies in the given file (report RP0402 must
454
+ # not be disabled).
455
+ int-import-graph=
456
+
457
+ # Force import order to recognize a module as part of the standard
458
+ # compatibility libraries.
459
+ known-standard-library=
460
+
461
+ # Force import order to recognize a module as part of a third party library.
462
+ known-third-party=enchant
463
+
464
+
465
+ [CLASSES]
466
+
467
+ # List of method names used to declare (i.e. assign) instance attributes.
468
+ defining-attr-methods=__init__,
469
+ __new__,
470
+ setUp
471
+
472
+ # List of member names, which should be excluded from the protected access
473
+ # warning.
474
+ exclude-protected=_asdict,
475
+ _fields,
476
+ _replace,
477
+ _source,
478
+ _make
479
+
480
+ # List of valid names for the first argument in a class method.
481
+ valid-classmethod-first-arg=cls
482
+
483
+ # List of valid names for the first argument in a metaclass class method.
484
+ valid-metaclass-classmethod-first-arg=cls
485
+
486
+
487
+ [DESIGN]
488
+
489
+ # Maximum number of arguments for function / method.
490
+ max-args=5
491
+
492
+ # Maximum number of attributes for a class (see R0902).
493
+ max-attributes=7
494
+
495
+ # Maximum number of boolean expressions in an if statement.
496
+ max-bool-expr=5
497
+
498
+ # Maximum number of branch for function / method body.
499
+ max-branches=12
500
+
501
+ # Maximum number of locals for function / method body.
502
+ max-locals=15
503
+
504
+ # Maximum number of parents for a class (see R0901).
505
+ max-parents=15
506
+
507
+ # Maximum number of public methods for a class (see R0904).
508
+ max-public-methods=20
509
+
510
+ # Maximum number of return / yield for function / method body.
511
+ max-returns=6
512
+
513
+ # Maximum number of statements in function / method body.
514
+ max-statements=50
515
+
516
+ # Minimum number of public methods for a class (see R0903).
517
+ min-public-methods=2
518
+
519
+
520
+ [EXCEPTIONS]
521
+
522
+ # Exceptions that will emit a warning when being caught. Defaults to
523
+ # "BaseException, Exception".
524
+ overgeneral-exceptions=builtins.BaseException,
525
+ builtins.Exception
third_party/Matcha-TTS/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Shivam Mehta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
third_party/Matcha-TTS/MANIFEST.in ADDED
@@ -0,0 +1,14 @@
1
+ include README.md
2
+ include LICENSE.txt
3
+ include requirements.*.txt
4
+ include *.cff
5
+ include requirements.txt
6
+ include matcha/VERSION
7
+ recursive-include matcha *.json
8
+ recursive-include matcha *.html
9
+ recursive-include matcha *.png
10
+ recursive-include matcha *.md
11
+ recursive-include matcha *.py
12
+ recursive-include matcha *.pyx
13
+ recursive-exclude tests *
14
+ prune tests*
third_party/Matcha-TTS/README.md ADDED
@@ -0,0 +1,278 @@
1
+ <div align="center">
2
+
3
+ # 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching
4
+
5
+ ### [Shivam Mehta](https://www.kth.se/profile/smehta), [Ruibo Tu](https://www.kth.se/profile/ruibo), [Jonas Beskow](https://www.kth.se/profile/beskow), [Éva Székely](https://www.kth.se/profile/szekely), and [Gustav Eje Henter](https://people.kth.se/~ghe/)
6
+
7
+ [![python](https://img.shields.io/badge/-Python_3.10-blue?logo=python&logoColor=white)](https://www.python.org/downloads/release/python-3100/)
8
+ [![pytorch](https://img.shields.io/badge/PyTorch_2.0+-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/get-started/locally/)
9
+ [![lightning](https://img.shields.io/badge/-Lightning_2.0+-792ee5?logo=pytorchlightning&logoColor=white)](https://pytorchlightning.ai/)
10
+ [![hydra](https://img.shields.io/badge/Config-Hydra_1.3-89b8cd)](https://hydra.cc/)
11
+ [![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)](https://black.readthedocs.io/en/stable/)
12
+ [![isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
13
+
14
+ <p style="text-align: center;">
15
+ <img src="https://shivammehta25.github.io/Matcha-TTS/images/logo.png" height="128"/>
16
+ </p>
17
+
18
+ </div>
19
+
20
+ > This is the official code implementation of 🍵 Matcha-TTS [ICASSP 2024].
21
+
22
+ We propose 🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS that uses [conditional flow matching](https://arxiv.org/abs/2210.02747) (similar to [rectified flows](https://arxiv.org/abs/2209.03003)) to speed up ODE-based speech synthesis. Our method:
23
+
24
+ - Is probabilistic
25
+ - Has compact memory footprint
26
+ - Sounds highly natural
27
+ - Is very fast to synthesise from
28
+
29
+ Check out our [demo page](https://shivammehta25.github.io/Matcha-TTS) and read [our ICASSP 2024 paper](https://arxiv.org/abs/2309.03199) for more details.
30
+
31
+ [Pre-trained models](https://drive.google.com/drive/folders/17C_gYgEHOxI5ZypcfE_k1piKCtyR0isJ?usp=sharing) will be automatically downloaded with the CLI or gradio interface.
32
+
33
+ You can also [try 🍵 Matcha-TTS in your browser on HuggingFace 🤗 spaces](https://huggingface.co/spaces/shivammehta25/Matcha-TTS).
34
+
35
+ ## Teaser video
36
+
37
+ [![Watch the video](https://img.youtube.com/vi/xmvJkz3bqw0/hqdefault.jpg)](https://youtu.be/xmvJkz3bqw0)
38
+
39
+ ## Installation
40
+
41
+ 1. Create an environment (suggested but optional)
42
+
43
+ ```bash
44
+ conda create -n matcha-tts python=3.10 -y
45
+ conda activate matcha-tts
46
+ ```
47
+
48
+ 2. Install Matcha TTS using pip or from source
49
+
50
+ ```bash
51
+ pip install matcha-tts
52
+ ```
53
+
54
+ from source
55
+
56
+ ```bash
57
+ pip install git+https://github.com/shivammehta25/Matcha-TTS.git
58
+ cd Matcha-TTS
59
+ pip install -e .
60
+ ```
61
+
62
+ 3. Run CLI / gradio app / jupyter notebook
63
+
64
+ ```bash
65
+ # This will download the required models
66
+ matcha-tts --text "<INPUT TEXT>"
67
+ ```
68
+
69
+ or
70
+
71
+ ```bash
72
+ matcha-tts-app
73
+ ```
74
+
75
+ or open `synthesis.ipynb` on jupyter notebook
76
+
77
+ ### CLI Arguments
78
+
79
+ - To synthesise from given text, run:
80
+
81
+ ```bash
82
+ matcha-tts --text "<INPUT TEXT>"
83
+ ```
84
+
85
+ - To synthesise from a file, run:
86
+
87
+ ```bash
88
+ matcha-tts --file <PATH TO FILE>
89
+ ```
90
+
91
+ - To batch synthesise from a file, run:
92
+
93
+ ```bash
94
+ matcha-tts --file <PATH TO FILE> --batched
95
+ ```
96
+
97
+ Additional arguments
98
+
99
+ - Speaking rate
100
+
101
+ ```bash
102
+ matcha-tts --text "<INPUT TEXT>" --speaking_rate 1.0
103
+ ```
104
+
105
+ - Sampling temperature
106
+
107
+ ```bash
108
+ matcha-tts --text "<INPUT TEXT>" --temperature 0.667
109
+ ```
110
+
111
+ - Euler ODE solver steps
112
+
113
+ ```bash
114
+ matcha-tts --text "<INPUT TEXT>" --steps 10
115
+ ```
116
+
117
+ ## Train with your own dataset
118
+
119
+ Let's assume we are training with LJ Speech
120
+
121
+ 1. Download the dataset from [here](https://keithito.com/LJ-Speech-Dataset/), extract it to `data/LJSpeech-1.1`, and prepare the file lists to point to the extracted data like for [item 5 in the setup of the NVIDIA Tacotron 2 repo](https://github.com/NVIDIA/tacotron2#setup).
122
+
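+ A sketch of that download-and-extract step (the archive URL is the one advertised on the LJ Speech page and may change, so verify it there first):
+
+ ```bash
+ mkdir -p data
+ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+ tar -xjf LJSpeech-1.1.tar.bz2 -C data/
+ ```
+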
123
+ 2. Clone and enter the Matcha-TTS repository
124
+
125
+ ```bash
126
+ git clone https://github.com/shivammehta25/Matcha-TTS.git
127
+ cd Matcha-TTS
128
+ ```
129
+
130
+ 3. Install the package from source
131
+
132
+ ```bash
133
+ pip install -e .
134
+ ```
135
+
136
+ 4. Go to `configs/data/ljspeech.yaml` and change the following entries to the paths of your train and validation filelists:
137
+
138
+ ```yaml
139
+ train_filelist_path: data/filelists/ljs_audio_text_train_filelist.txt
140
+ valid_filelist_path: data/filelists/ljs_audio_text_val_filelist.txt
141
+ ```
142
+
143
+ 5. Generate normalisation statistics using the dataset configuration yaml file
144
+
145
+ ```bash
146
+ matcha-data-stats -i ljspeech.yaml
147
+ # Output:
148
+ #{'mel_mean': -5.53662231756592, 'mel_std': 2.1161014277038574}
149
+ ```
150
+
151
+ Update these values in `configs/data/ljspeech.yaml` under `data_statistics` key.
152
+
153
+ ```yaml
154
+ data_statistics: # Computed for ljspeech dataset
155
+ mel_mean: -5.536622
156
+ mel_std: 2.116101
157
+ ```
158
+
161
+ 6. Run the training script
162
+
163
+ ```bash
164
+ make train-ljspeech
165
+ ```
166
+
167
+ or
168
+
169
+ ```bash
170
+ python matcha/train.py experiment=ljspeech
171
+ ```
172
+
173
+ - for a minimum memory run
174
+
175
+ ```bash
176
+ python matcha/train.py experiment=ljspeech_min_memory
177
+ ```
178
+
179
+ - for multi-gpu training, run
180
+
181
+ ```bash
182
+ python matcha/train.py experiment=ljspeech trainer.devices=[0,1]
183
+ ```
184
+
185
+ 7. Synthesise from the custom trained model
186
+
187
+ ```bash
188
+ matcha-tts --text "<INPUT TEXT>" --checkpoint_path <PATH TO CHECKPOINT>
189
+ ```
190
+
191
+ ## ONNX support
192
+
193
+ > Special thanks to [@mush42](https://github.com/mush42) for implementing ONNX export and inference support.
194
+
195
+ It is possible to export Matcha checkpoints to [ONNX](https://onnx.ai/), and run inference on the exported ONNX graph.
196
+
197
+ ### ONNX export
198
+
199
+ To export a checkpoint to ONNX, first install ONNX with
200
+
201
+ ```bash
202
+ pip install onnx
203
+ ```
204
+
205
+ then run the following:
206
+
207
+ ```bash
208
+ python3 -m matcha.onnx.export matcha.ckpt model.onnx --n-timesteps 5
209
+ ```
210
+
211
+ Optionally, the ONNX exporter accepts **vocoder-name** and **vocoder-checkpoint** arguments. This enables you to embed the vocoder in the exported graph and generate waveforms in a single run (similar to end-to-end TTS systems).
212
+
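+ For example, a sketch of a vocoder-embedded export for the multi-speaker checkpoint (assuming the flags are spelled `--vocoder-name` and `--vocoder-checkpoint` as named above, and that `matcha_vctk.ckpt` and `g_02500000` are your local checkpoint files; confirm the exact spellings with `python3 -m matcha.onnx.export --help`):
+
+ ```bash
+ python3 -m matcha.onnx.export matcha_vctk.ckpt model_with_vocoder.onnx --n-timesteps 5 --vocoder-name hifigan_univ_v1 --vocoder-checkpoint g_02500000
+ ```
+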
213
+ **Note** that `n_timesteps` is treated as a hyper-parameter rather than a model input. This means you should specify it during export (not during inference). If not specified, `n_timesteps` is set to **5**.
214
+
215
+ **Important**: for now, torch>=2.1.0 is needed for export since the `scaled_dot_product_attention` operator is not exportable in older versions. Until the final version is released, those who want to export their models must install torch>=2.1.0 manually as a pre-release.
216
+
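+ A minimal sketch of such a pre-release install (the nightly index URL is an assumption and depends on your CUDA version; `cu118` is only an example):
+
+ ```bash
+ pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118
+ ```
+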
217
+ ### ONNX Inference
218
+
219
+ To run inference on the exported model, first install `onnxruntime` using
220
+
221
+ ```bash
222
+ pip install onnxruntime
223
+ pip install onnxruntime-gpu # for GPU inference
224
+ ```
225
+
226
+ then use the following:
227
+
228
+ ```bash
229
+ python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs
230
+ ```
231
+
232
+ You can also control synthesis parameters:
233
+
234
+ ```bash
235
+ python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --temperature 0.4 --speaking_rate 0.9 --spk 0
236
+ ```
237
+
238
+ To run inference on **GPU**, make sure to install **onnxruntime-gpu** package, and then pass `--gpu` to the inference command:
239
+
240
+ ```bash
241
+ python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --gpu
242
+ ```
243
+
244
+ If you exported only Matcha to ONNX, this will write the mel-spectrograms as plots and `numpy` arrays to the output directory.
245
+ If you embedded the vocoder in the exported graph, this will write `.wav` audio files to the output directory.
246
+
247
+ If you exported only Matcha to ONNX, and you want to run a full TTS pipeline, you can pass a path to a vocoder model in `ONNX` format:
248
+
249
+ ```bash
250
+ python3 -m matcha.onnx.infer model.onnx --text "hey" --output-dir ./outputs --vocoder hifigan.small.onnx
251
+ ```
252
+
253
+ This will write `.wav` audio files to the output directory.
254
+
255
+ ## Citation information
256
+
257
+ If you use our code or otherwise find this work useful, please cite our paper:
258
+
259
+ ```text
260
+ @inproceedings{mehta2024matcha,
261
+ title={Matcha-{TTS}: A fast {TTS} architecture with conditional flow matching},
262
+ author={Mehta, Shivam and Tu, Ruibo and Beskow, Jonas and Sz{\'e}kely, {\'E}va and Henter, Gustav Eje},
263
+ booktitle={Proc. ICASSP},
264
+ year={2024}
265
+ }
266
+ ```
267
+
268
+ ## Acknowledgements
269
+
270
+ Since this code uses [Lightning-Hydra-Template](https://github.com/ashleve/lightning-hydra-template), you have all the powers that come with it.
271
+
272
+ Other source code we would like to acknowledge:
273
+
274
+ - [Coqui-TTS](https://github.com/coqui-ai/TTS/tree/dev): For helping me figure out how to make cython binaries pip installable and encouragement
275
+ - [Hugging Face Diffusers](https://huggingface.co/): For their awesome diffusers library and its components
276
+ - [Grad-TTS](https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS): For the monotonic alignment search source code
277
+ - [torchdyn](https://github.com/DiffEqML/torchdyn): Useful for trying other ODE solvers during research and development
278
+ - [labml.ai](https://nn.labml.ai/transformers/rope/index.html): For the RoPE implementation
third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml ADDED
@@ -0,0 +1,17 @@
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
2
+
3
+ model_checkpoint:
4
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
5
+ dirpath: ${paths.output_dir}/checkpoints # directory to save the model file
6
+ filename: checkpoint_{epoch:03d} # checkpoint filename
7
+ monitor: epoch # name of the logged metric which determines when model is improving
8
+ verbose: False # verbosity mode
9
+ save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt
10
+ save_top_k: 10 # save k best models (determined by above metric)
11
+ mode: "max" # "max" means higher metric value is better, can be also "min"
12
+ auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
13
+ save_weights_only: False # if True, then only the model’s weights will be saved
14
+ every_n_train_steps: null # number of training steps between checkpoints
15
+ train_time_interval: null # checkpoints are monitored at the specified time interval
16
+ every_n_epochs: 100 # number of epochs between checkpoints
17
+ save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
third_party/Matcha-TTS/configs/callbacks/none.yaml ADDED
File without changes
third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml ADDED
@@ -0,0 +1,4 @@
1
+ # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html
2
+
3
+ rich_progress_bar:
4
+ _target_: lightning.pytorch.callbacks.RichProgressBar
third_party/Matcha-TTS/configs/debug/default.yaml ADDED
@@ -0,0 +1,35 @@
1
+ # @package _global_
2
+
3
+ # default debugging setup, runs 1 full epoch
4
+ # other debugging configs can inherit from this one
5
+
6
+ # overwrite task name so debugging logs are stored in separate folder
7
+ task_name: "debug"
8
+
9
+ # disable callbacks and loggers during debugging
10
+ # callbacks: null
11
+ # logger: null
12
+
13
+ extras:
14
+ ignore_warnings: False
15
+ enforce_tags: False
16
+
17
+ # sets level of all command line loggers to 'DEBUG'
18
+ # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
19
+ hydra:
20
+ job_logging:
21
+ root:
22
+ level: DEBUG
23
+
24
+ # use this to also set hydra loggers to 'DEBUG'
25
+ # verbose: True
26
+
27
+ trainer:
28
+ max_epochs: 1
29
+ accelerator: cpu # debuggers don't like gpus
30
+ devices: 1 # debuggers don't like multiprocessing
31
+ detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor
32
+
33
+ data:
34
+ num_workers: 0 # debuggers don't like multiprocessing
35
+ pin_memory: False # disable gpu memory pin
third_party/Matcha-TTS/configs/debug/overfit.yaml ADDED
@@ -0,0 +1,13 @@
1
+ # @package _global_
2
+
3
+ # overfits to 3 batches
4
+
5
+ defaults:
6
+ - default
7
+
8
+ trainer:
9
+ max_epochs: 20
10
+ overfit_batches: 3
11
+
12
+ # model ckpt and early stopping need to be disabled during overfitting
13
+ callbacks: null
third_party/Matcha-TTS/configs/debug/profiler.yaml ADDED
@@ -0,0 +1,15 @@
1
+ # @package _global_
2
+
3
+ # runs with execution time profiling
4
+
5
+ defaults:
6
+ - default
7
+
8
+ trainer:
9
+ max_epochs: 1
10
+ # profiler: "simple"
11
+ profiler: "advanced"
12
+ # profiler: "pytorch"
13
+ accelerator: gpu
14
+
15
+ limit_train_batches: 0.02
third_party/Matcha-TTS/configs/eval.yaml ADDED
@@ -0,0 +1,18 @@
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - _self_
5
+ - data: mnist # choose datamodule with `test_dataloader()` for evaluation
6
+ - model: mnist
7
+ - logger: null
8
+ - trainer: default
9
+ - paths: default
10
+ - extras: default
11
+ - hydra: default
12
+
13
+ task_name: "eval"
14
+
15
+ tags: ["dev"]
16
+
17
+ # passing checkpoint path is necessary for evaluation
18
+ ckpt_path: ???
third_party/Matcha-TTS/configs/experiment/ljspeech.yaml ADDED
@@ -0,0 +1,14 @@
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=multispeaker
5
+
6
+ defaults:
7
+ - override /data: ljspeech.yaml
8
+
9
+ # all parameters below will be merged with parameters from default configurations set above
10
+ # this allows you to overwrite only specified parameters
11
+
12
+ tags: ["ljspeech"]
13
+
14
+ run_name: ljspeech
third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml ADDED
@@ -0,0 +1,18 @@
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=multispeaker
5
+
6
+ defaults:
7
+ - override /data: ljspeech.yaml
8
+
9
+ # all parameters below will be merged with parameters from default configurations set above
10
+ # this allows you to overwrite only specified parameters
11
+
12
+ tags: ["ljspeech"]
13
+
14
+ run_name: ljspeech_min
15
+
16
+
17
+ model:
18
+ out_size: 172
third_party/Matcha-TTS/configs/experiment/multispeaker.yaml ADDED
@@ -0,0 +1,14 @@
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=multispeaker
5
+
6
+ defaults:
7
+ - override /data: vctk.yaml
8
+
9
+ # all parameters below will be merged with parameters from default configurations set above
10
+ # this allows you to overwrite only specified parameters
11
+
12
+ tags: ["multispeaker"]
13
+
14
+ run_name: multispeaker
third_party/Matcha-TTS/configs/extras/default.yaml ADDED
@@ -0,0 +1,8 @@
1
+ # disable python warnings if they annoy you
2
+ ignore_warnings: False
3
+
4
+ # ask user for tags if none are provided in the config
5
+ enforce_tags: True
6
+
7
+ # pretty print config tree at the start of the run using Rich library
8
+ print_config: True
third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml ADDED
@@ -0,0 +1,52 @@
1
+ # @package _global_
2
+
3
+ # example hyperparameter optimization of some experiment with Optuna:
4
+ # python train.py -m hparams_search=mnist_optuna experiment=example
5
+
6
+ defaults:
7
+ - override /hydra/sweeper: optuna
8
+
9
+ # choose metric which will be optimized by Optuna
10
+ # make sure this is the correct name of some metric logged in lightning module!
11
+ optimized_metric: "val/acc_best"
12
+
13
+ # here we define Optuna hyperparameter search
14
+ # it optimizes for value returned from function with @hydra.main decorator
15
+ # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper
16
+ hydra:
17
+ mode: "MULTIRUN" # set hydra to multirun by default if this config is attached
18
+
19
+ sweeper:
20
+ _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
21
+
22
+ # storage URL to persist optimization results
23
+ # for example, you can use SQLite if you set 'sqlite:///example.db'
24
+ storage: null
25
+
26
+ # name of the study to persist optimization results
27
+ study_name: null
28
+
29
+ # number of parallel workers
30
+ n_jobs: 1
31
+
32
+ # 'minimize' or 'maximize' the objective
33
+ direction: maximize
34
+
35
+ # total number of runs that will be executed
36
+ n_trials: 20
37
+
38
+ # choose Optuna hyperparameter sampler
39
+ # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others
40
+ # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html
41
+ sampler:
42
+ _target_: optuna.samplers.TPESampler
43
+ seed: 1234
44
+ n_startup_trials: 10 # number of random sampling runs before optimization starts
45
+
46
+ # define hyperparameter search space
47
+ params:
48
+ model.optimizer.lr: interval(0.0001, 0.1)
49
+ data.batch_size: choice(32, 64, 128, 256)
50
+ model.net.lin1_size: choice(64, 128, 256)
51
+ model.net.lin2_size: choice(64, 128, 256)
52
+ model.net.lin3_size: choice(32, 64, 128, 256)
third_party/Matcha-TTS/configs/local/.gitkeep ADDED
File without changes
third_party/Matcha-TTS/configs/logger/aim.yaml ADDED
@@ -0,0 +1,28 @@
1
+ # https://aimstack.io/
2
+
3
+ # example usage in lightning module:
4
+ # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py
5
+
6
+ # open the Aim UI with the following command (run in the folder containing the `.aim` folder):
7
+ # `aim up`
8
+
9
+ aim:
10
+ _target_: aim.pytorch_lightning.AimLogger
11
+ repo: ${paths.root_dir} # .aim folder will be created here
12
+ # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html#
13
+
14
+ # aim allows to group runs under experiment name
15
+ experiment: null # any string, set to "default" if not specified
16
+
17
+ train_metric_prefix: "train/"
18
+ val_metric_prefix: "val/"
19
+ test_metric_prefix: "test/"
20
+
21
+ # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.)
22
+ system_tracking_interval: 10 # set to null to disable system metrics tracking
23
+
24
+ # enable/disable logging of system params such as installed packages, git info, env vars, etc.
25
+ log_system_params: true
26
+
27
+ # enable/disable tracking console logs (default value is true)
28
+ capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550
third_party/Matcha-TTS/configs/logger/tensorboard.yaml ADDED
@@ -0,0 +1,10 @@
1
+ # https://www.tensorflow.org/tensorboard/
2
+
3
+ tensorboard:
4
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
5
+ save_dir: "${paths.output_dir}/tensorboard/"
6
+ name: null
7
+ log_graph: False
8
+ default_hp_metric: True
9
+ prefix: ""
10
+ # version: ""
third_party/Matcha-TTS/configs/logger/wandb.yaml ADDED
@@ -0,0 +1,16 @@
1
+ # https://wandb.ai
2
+
3
+ wandb:
4
+ _target_: lightning.pytorch.loggers.wandb.WandbLogger
5
+ # name: "" # name of the run (normally generated by wandb)
6
+ save_dir: "${paths.output_dir}"
7
+ offline: False
8
+ id: null # pass correct id to resume experiment!
9
+ anonymous: null # enable anonymous logging
10
+ project: "lightning-hydra-template"
11
+ log_model: False # upload lightning ckpts
12
+ prefix: "" # a string to put at the beginning of metric keys
13
+ # entity: "" # set to name of your wandb team
14
+ group: ""
15
+ tags: []
16
+ job_type: ""
third_party/Matcha-TTS/configs/model/cfm/default.yaml ADDED
@@ -0,0 +1,3 @@
1
+ name: CFM
2
+ solver: euler
3
+ sigma_min: 1e-4
third_party/Matcha-TTS/configs/model/decoder/default.yaml ADDED
@@ -0,0 +1,7 @@
1
+ channels: [256, 256]
2
+ dropout: 0.05
3
+ attention_head_dim: 64
4
+ n_blocks: 1
5
+ num_mid_blocks: 2
6
+ num_heads: 2
7
+ act_fn: snakebeta
third_party/Matcha-TTS/configs/model/encoder/default.yaml ADDED
@@ -0,0 +1,18 @@
1
+ encoder_type: RoPE Encoder
2
+ encoder_params:
3
+ n_feats: ${model.n_feats}
4
+ n_channels: 192
5
+ filter_channels: 768
6
+ filter_channels_dp: 256
7
+ n_heads: 2
8
+ n_layers: 6
9
+ kernel_size: 3
10
+ p_dropout: 0.1
11
+ spk_emb_dim: 64
12
+ n_spks: 1
13
+ prenet: true
14
+
15
+ duration_predictor_params:
16
+ filter_channels_dp: ${model.encoder.encoder_params.filter_channels_dp}
17
+ kernel_size: 3
18
+ p_dropout: ${model.encoder.encoder_params.p_dropout}
third_party/Matcha-TTS/configs/model/matcha.yaml ADDED
@@ -0,0 +1,15 @@
1
+ defaults:
2
+ - _self_
3
+ - encoder: default.yaml
4
+ - decoder: default.yaml
5
+ - cfm: default.yaml
6
+ - optimizer: adam.yaml
7
+
8
+ _target_: matcha.models.matcha_tts.MatchaTTS
9
+ n_vocab: 178
10
+ n_spks: ${data.n_spks}
11
+ spk_emb_dim: 64
12
+ n_feats: 80
13
+ data_statistics: ${data.data_statistics}
14
+ out_size: null # Must be divisible by 4
15
+ prior_loss: true
third_party/Matcha-TTS/configs/model/optimizer/adam.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _target_: torch.optim.Adam
2
+ _partial_: true
3
+ lr: 1e-4
4
+ weight_decay: 0.0
third_party/Matcha-TTS/configs/paths/default.yaml ADDED
@@ -0,0 +1,18 @@
1
+ # path to root directory
2
+ # this requires PROJECT_ROOT environment variable to exist
3
+ # you can replace it with "." if you want the root to be the current working directory
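+ # e.g. (illustrative path): export PROJECT_ROOT=/path/to/Matcha-TTS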
4
+ root_dir: ${oc.env:PROJECT_ROOT}
5
+
6
+ # path to data directory
7
+ data_dir: ${paths.root_dir}/data/
8
+
9
+ # path to logging directory
10
+ log_dir: ${paths.root_dir}/logs/
11
+
12
+ # path to output directory, created dynamically by hydra
13
+ # path generation pattern is specified in `configs/hydra/default.yaml`
14
+ # use it to store all files generated during the run, like ckpts and metrics
15
+ output_dir: ${hydra:runtime.output_dir}
16
+
17
+ # path to working directory
18
+ work_dir: ${hydra:runtime.cwd}
third_party/Matcha-TTS/configs/train.yaml ADDED
@@ -0,0 +1,51 @@
1
+ # @package _global_
2
+
3
+ # specify here default configuration
4
+ # order of defaults determines the order in which configs override each other
5
+ defaults:
6
+ - _self_
7
+ - data: ljspeech
8
+ - model: matcha
9
+ - callbacks: default
10
+ - logger: tensorboard # set logger here or use command line (e.g. `python train.py logger=tensorboard`)
11
+ - trainer: default
12
+ - paths: default
13
+ - extras: default
14
+ - hydra: default
15
+
16
+ # experiment configs allow for version control of specific hyperparameters
17
+ # e.g. best hyperparameters for given model and datamodule
18
+ - experiment: null
19
+
20
+ # config for hyperparameter optimization
21
+ - hparams_search: null
22
+
23
+ # optional local config for machine/user specific settings
24
+ # it's optional since it doesn't need to exist and is excluded from version control
25
+ - optional local: default
26
+
27
+ # debugging config (enable through command line, e.g. `python train.py debug=default`)
28
+ - debug: null
29
+
30
+ # task name, determines output directory path
31
+ task_name: "train"
32
+
33
+ run_name: ???
34
+
35
+ # tags to help you identify your experiments
36
+ # you can overwrite this in experiment configs
37
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
38
+ tags: ["dev"]
39
+
40
+ # set False to skip model training
41
+ train: True
42
+
43
+ # evaluate on test set, using best model weights achieved during training
44
+ # lightning chooses best weights based on the metric specified in checkpoint callback
45
+ test: True
46
+
47
+ # simply provide checkpoint path to resume training
48
+ ckpt_path: null
49
+
50
+ # seed for random number generators in pytorch, numpy and python.random
51
+ seed: 1234
third_party/Matcha-TTS/configs/trainer/ddp.yaml ADDED
@@ -0,0 +1,9 @@
1
+ defaults:
2
+ - default
3
+
4
+ strategy: ddp
5
+
6
+ accelerator: gpu
7
+ devices: [0,1]
8
+ num_nodes: 1
9
+ sync_batchnorm: True
third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml ADDED
@@ -0,0 +1,7 @@
1
+ defaults:
2
+ - default
3
+
4
+ # simulate DDP on CPU, useful for debugging
5
+ accelerator: cpu
6
+ devices: 2
7
+ strategy: ddp_spawn
third_party/Matcha-TTS/configs/trainer/gpu.yaml ADDED
@@ -0,0 +1,5 @@
1
+ defaults:
2
+ - default
3
+
4
+ accelerator: gpu
5
+ devices: 1
third_party/Matcha-TTS/configs/trainer/mps.yaml ADDED
@@ -0,0 +1,5 @@
1
+ defaults:
2
+ - default
3
+
4
+ accelerator: mps
5
+ devices: 1
third_party/Matcha-TTS/matcha/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.5.1
third_party/Matcha-TTS/matcha/__init__.py ADDED
File without changes
third_party/Matcha-TTS/matcha/cli.py ADDED
@@ -0,0 +1,418 @@
1
+ import argparse
2
+ import datetime as dt
3
+ import os
4
+ import warnings
5
+ from pathlib import Path
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+
12
+ from matcha.hifigan.config import v1
13
+ from matcha.hifigan.denoiser import Denoiser
14
+ from matcha.hifigan.env import AttrDict
15
+ from matcha.hifigan.models import Generator as HiFiGAN
16
+ from matcha.models.matcha_tts import MatchaTTS
17
+ from matcha.text import sequence_to_text, text_to_sequence
18
+ from matcha.utils.utils import assert_model_downloaded, get_user_data_dir, intersperse
19
+
20
+ MATCHA_URLS = {
21
+ "matcha_ljspeech": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_ljspeech.ckpt",
22
+ "matcha_vctk": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/matcha_vctk.ckpt",
23
+ }
24
+
25
+ VOCODER_URLS = {
26
+ "hifigan_T2_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/generator_v1", # Old url: https://drive.google.com/file/d/14NENd4equCBLyyCSke114Mv6YR_j_uFs/view?usp=drive_link
27
+ "hifigan_univ_v1": "https://github.com/shivammehta25/Matcha-TTS-checkpoints/releases/download/v1.0/g_02500000", # Old url: https://drive.google.com/file/d/1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW/view?usp=drive_link
28
+ }
29
+
30
+ MULTISPEAKER_MODEL = {
31
+ "matcha_vctk": {"vocoder": "hifigan_univ_v1", "speaking_rate": 0.85, "spk": 0, "spk_range": (0, 107)}
32
+ }
33
+
34
+ SINGLESPEAKER_MODEL = {"matcha_ljspeech": {"vocoder": "hifigan_T2_v1", "speaking_rate": 0.95, "spk": None}}
35
+
36
+
37
+ def plot_spectrogram_to_numpy(spectrogram, filename):
38
+ fig, ax = plt.subplots(figsize=(12, 3))
39
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
40
+ plt.colorbar(im, ax=ax)
41
+ plt.xlabel("Frames")
42
+ plt.ylabel("Channels")
43
+ plt.title("Synthesised Mel-Spectrogram")
44
+ fig.canvas.draw()
45
+ plt.savefig(filename)
46
+
47
+
48
+ def process_text(i: int, text: str, device: torch.device):
49
+ print(f"[{i}] - Input text: {text}")
50
+ x = torch.tensor(
51
+ intersperse(text_to_sequence(text, ["english_cleaners2"]), 0),
52
+ dtype=torch.long,
53
+ device=device,
54
+ )[None]
55
+ x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
56
+ x_phones = sequence_to_text(x.squeeze(0).tolist())
57
+ print(f"[{i}] - Phonetised text: {x_phones[1::2]}")
58
+
59
+ return {"x_orig": text, "x": x, "x_lengths": x_lengths, "x_phones": x_phones}
60
+
61
+
62
+ def get_texts(args):
63
+ if args.text:
64
+ texts = [args.text]
65
+ else:
66
+ with open(args.file, encoding="utf-8") as f:
67
+ texts = f.readlines()
68
+ return texts
69
+
70
+
71
+ def assert_required_models_available(args):
72
+ save_dir = get_user_data_dir()
73
+ if getattr(args, "checkpoint_path", None) is not None:  # a custom checkpoint was supplied, use it directly
74
+ model_path = args.checkpoint_path
75
+ else:
76
+ model_path = save_dir / f"{args.model}.ckpt"
77
+ assert_model_downloaded(model_path, MATCHA_URLS[args.model])
78
+
79
+ vocoder_path = save_dir / f"{args.vocoder}"
80
+ assert_model_downloaded(vocoder_path, VOCODER_URLS[args.vocoder])
81
+ return {"matcha": model_path, "vocoder": vocoder_path}
82
+
83
+
84
+ def load_hifigan(checkpoint_path, device):
85
+ h = AttrDict(v1)
86
+ hifigan = HiFiGAN(h).to(device)
87
+ hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)["generator"])
88
+ _ = hifigan.eval()
89
+ hifigan.remove_weight_norm()
90
+ return hifigan
91
+
92
+
93
+ def load_vocoder(vocoder_name, checkpoint_path, device):
94
+ print(f"[!] Loading {vocoder_name}!")
95
+ vocoder = None
96
+ if vocoder_name in ("hifigan_T2_v1", "hifigan_univ_v1"):
97
+ vocoder = load_hifigan(checkpoint_path, device)
98
+ else:
99
+ raise NotImplementedError(
100
+ f"Vocoder {vocoder_name} not implemented! define a load_<<vocoder_name>> method for it"
101
+ )
102
+
103
+ denoiser = Denoiser(vocoder, mode="zeros")
104
+ print(f"[+] {vocoder_name} loaded!")
105
+ return vocoder, denoiser
106
+
107
+
108
+ def load_matcha(model_name, checkpoint_path, device):
109
+ print(f"[!] Loading {model_name}!")
110
+ model = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
111
+ _ = model.eval()
112
+
113
+ print(f"[+] {model_name} loaded!")
114
+ return model
115
+
116
+
117
+ def to_waveform(mel, vocoder, denoiser=None):
118
+ audio = vocoder(mel).clamp(-1, 1)
119
+ if denoiser is not None:
120
+ audio = denoiser(audio.squeeze(), strength=0.00025).cpu().squeeze()
121
+
122
+ return audio.cpu().squeeze()
123
+
124
+
125
+ def save_to_folder(filename: str, output: dict, folder: str):
126
+ folder = Path(folder)
127
+ folder.mkdir(exist_ok=True, parents=True)
128
+ plot_spectrogram_to_numpy(np.array(output["mel"].squeeze().float().cpu()), f"{filename}.png")
129
+ np.save(folder / f"{filename}", output["mel"].cpu().numpy())
130
+ sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
131
+ return folder.resolve() / f"{filename}.wav"
132
+
133
+
134
+ def validate_args(args):
135
+ assert (
136
+ args.text or args.file
137
+ ), "Either text or file must be provided Matcha-T(ea)TTS need sometext to whisk the waveforms."
138
+ assert args.temperature >= 0, "Sampling temperature cannot be negative"
139
+ assert args.steps > 0, "Number of ODE steps must be greater than 0"
140
+
141
+ if args.checkpoint_path is None:
142
+ # When using pretrained models
143
+ if args.model in SINGLESPEAKER_MODEL:
144
+ args = validate_args_for_single_speaker_model(args)
145
+
146
+ if args.model in MULTISPEAKER_MODEL:
147
+ args = validate_args_for_multispeaker_model(args)
148
+ else:
149
+ # When using a custom model
150
+ if args.vocoder != "hifigan_univ_v1":
151
+ warn_ = "[-] Using custom model checkpoint! I would suggest passing --vocoder hifigan_univ_v1, unless the custom model is trained on LJ Speech."
152
+ warnings.warn(warn_, UserWarning)
153
+ if args.speaking_rate is None:
154
+ args.speaking_rate = 1.0
155
+
156
+ if args.batched:
157
+ assert args.batch_size > 0, "Batch size must be greater than 0"
158
+ assert args.speaking_rate > 0, "Speaking rate must be greater than 0"
159
+
160
+ return args
161
+
162
+
163
+ def validate_args_for_multispeaker_model(args):
164
+ if args.vocoder is not None:
165
+ if args.vocoder != MULTISPEAKER_MODEL[args.model]["vocoder"]:
166
+ warn_ = f"[-] Using {args.model} model! I would suggest passing --vocoder {MULTISPEAKER_MODEL[args.model]['vocoder']}"
167
+ warnings.warn(warn_, UserWarning)
168
+ else:
169
+ args.vocoder = MULTISPEAKER_MODEL[args.model]["vocoder"]
170
+
171
+ if args.speaking_rate is None:
172
+ args.speaking_rate = MULTISPEAKER_MODEL[args.model]["speaking_rate"]
173
+
174
+ spk_range = MULTISPEAKER_MODEL[args.model]["spk_range"]
175
+ if args.spk is not None:
176
+ assert (
177
+ args.spk >= spk_range[0] and args.spk <= spk_range[-1]
178
+ ), f"Speaker ID must be between {spk_range} for this model."
179
+ else:
180
+ available_spk_id = MULTISPEAKER_MODEL[args.model]["spk"]
181
+ warn_ = f"[!] Speaker ID not provided! Using speaker ID {available_spk_id}"
182
+ warnings.warn(warn_, UserWarning)
183
+ args.spk = available_spk_id
184
+
185
+ return args
186
+
187
+
188
+ def validate_args_for_single_speaker_model(args):
189
+ if args.vocoder is not None:
190
+ if args.vocoder != SINGLESPEAKER_MODEL[args.model]["vocoder"]:
191
+ warn_ = f"[-] Using {args.model} model! I would suggest passing --vocoder {SINGLESPEAKER_MODEL[args.model]['vocoder']}"
192
+ warnings.warn(warn_, UserWarning)
193
+ else:
194
+ args.vocoder = SINGLESPEAKER_MODEL[args.model]["vocoder"]
195
+
196
+ if args.speaking_rate is None:
197
+ args.speaking_rate = SINGLESPEAKER_MODEL[args.model]["speaking_rate"]
198
+
199
+ if args.spk != SINGLESPEAKER_MODEL[args.model]["spk"]:
200
+ warn_ = f"[-] Ignoring speaker id {args.spk} for {args.model}"
201
+ warnings.warn(warn_, UserWarning)
202
+ args.spk = SINGLESPEAKER_MODEL[args.model]["spk"]
203
+
204
+ return args
205
+
206
+
207
+ @torch.inference_mode()
208
+ def cli():
209
+ parser = argparse.ArgumentParser(
210
+ description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching"
211
+ )
212
+ parser.add_argument(
213
+ "--model",
214
+ type=str,
215
+ default="matcha_ljspeech",
216
+ help="Model to use",
217
+ choices=MATCHA_URLS.keys(),
218
+ )
219
+
220
+ parser.add_argument(
221
+ "--checkpoint_path",
222
+ type=str,
223
+ default=None,
224
+ help="Path to the custom model checkpoint",
225
+ )
226
+
227
+ parser.add_argument(
228
+ "--vocoder",
229
+ type=str,
230
+ default=None,
231
+ help="Vocoder to use (default: will use the one suggested with the pretrained model))",
232
+ choices=VOCODER_URLS.keys(),
233
+ )
234
+ parser.add_argument("--text", type=str, default=None, help="Text to synthesize")
235
+ parser.add_argument("--file", type=str, default=None, help="Text file to synthesize")
236
+ parser.add_argument("--spk", type=int, default=None, help="Speaker ID")
237
+ parser.add_argument(
238
+ "--temperature",
239
+ type=float,
240
+ default=0.667,
241
+ help="Variance of the x0 noise (default: 0.667)",
242
+ )
243
+ parser.add_argument(
244
+ "--speaking_rate",
245
+ type=float,
246
+ default=None,
247
+ help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)",
248
+ )
249
+ parser.add_argument("--steps", type=int, default=10, help="Number of ODE steps (default: 10)")
250
+ parser.add_argument("--cpu", action="store_true", help="Use CPU for inference (default: use GPU if available)")
251
+ parser.add_argument(
252
+ "--denoiser_strength",
253
+ type=float,
254
+ default=0.00025,
255
+ help="Strength of the vocoder bias denoiser (default: 0.00025)",
256
+ )
257
+ parser.add_argument(
258
+ "--output_folder",
259
+ type=str,
260
+ default=os.getcwd(),
261
+ help="Output folder to save results (default: current dir)",
262
+ )
263
+ parser.add_argument("--batched", action="store_true", help="Batched inference (default: False)")
264
+ parser.add_argument(
265
+ "--batch_size", type=int, default=32, help="Batch size only useful when --batched (default: 32)"
266
+ )
267
+
268
+ args = parser.parse_args()
269
+
270
+ args = validate_args(args)
271
+ device = get_device(args)
272
+ print_config(args)
273
+ paths = assert_required_models_available(args)
274
+
275
+ if args.checkpoint_path is not None:
276
+ print(f"[🍵] Loading custom model from {args.checkpoint_path}")
277
+ paths["matcha"] = args.checkpoint_path
278
+ args.model = "custom_model"
279
+
280
+ model = load_matcha(args.model, paths["matcha"], device)
281
+ vocoder, denoiser = load_vocoder(args.vocoder, paths["vocoder"], device)
282
+
283
+ texts = get_texts(args)
284
+
285
+ spk = torch.tensor([args.spk], device=device, dtype=torch.long) if args.spk is not None else None
286
+ if len(texts) == 1 or not args.batched:
287
+ unbatched_synthesis(args, device, model, vocoder, denoiser, texts, spk)
288
+ else:
289
+ batched_synthesis(args, device, model, vocoder, denoiser, texts, spk)
290
+
291
+
292
+ class BatchedSynthesisDataset(torch.utils.data.Dataset):
293
+ def __init__(self, processed_texts):
294
+ self.processed_texts = processed_texts
295
+
296
+ def __len__(self):
297
+ return len(self.processed_texts)
298
+
299
+ def __getitem__(self, idx):
300
+ return self.processed_texts[idx]
301
+
302
+
303
+ def batched_collate_fn(batch):
304
+ x = []
305
+ x_lengths = []
306
+
307
+ for b in batch:
308
+ x.append(b["x"].squeeze(0))
309
+ x_lengths.append(b["x_lengths"])
310
+
311
+ x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
312
+ x_lengths = torch.concat(x_lengths, dim=0)
313
+ return {"x": x, "x_lengths": x_lengths}
314
+
315
+
316
+ def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
317
+ total_rtf = []
318
+ total_rtf_w = []
319
+ processed_text = [process_text(i, text, "cpu") for i, text in enumerate(texts)]
320
+ dataloader = torch.utils.data.DataLoader(
321
+ BatchedSynthesisDataset(processed_text),
322
+ batch_size=args.batch_size,
323
+ collate_fn=batched_collate_fn,
324
+ num_workers=8,
325
+ )
326
+ for i, batch in enumerate(dataloader):
327
+ i = i + 1
328
+ start_t = dt.datetime.now()
329
+ output = model.synthesise(
330
+ batch["x"].to(device),
331
+ batch["x_lengths"].to(device),
332
+ n_timesteps=args.steps,
333
+ temperature=args.temperature,
334
+ spks=spk,
335
+ length_scale=args.speaking_rate,
336
+ )
337
+
338
+ output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
339
+ t = (dt.datetime.now() - start_t).total_seconds()
340
+ rtf_w = t * 22050 / (output["waveform"].shape[-1])
341
+ print(f"[🍵-Batch: {i}] Matcha-TTS RTF: {output['rtf']:.4f}")
342
+ print(f"[🍵-Batch: {i}] Matcha-TTS + VOCODER RTF: {rtf_w:.4f}")
343
+ total_rtf.append(output["rtf"])
344
+ total_rtf_w.append(rtf_w)
345
+ for j in range(output["mel"].shape[0]):
346
+ base_name = f"utterance_{j:03d}_speaker_{args.spk:03d}" if args.spk is not None else f"utterance_{j:03d}"
347
+ length = output["mel_lengths"][j]
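+ # length * 256 assumes the default 22.05 kHz, hop-size-256 mel configuration of the bundled vocoders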
348
+ new_dict = {"mel": output["mel"][j][:, :length], "waveform": output["waveform"][j][: length * 256]}
349
+ location = save_to_folder(base_name, new_dict, args.output_folder)
350
+ print(f"[🍵-{j}] Waveform saved: {location}")
351
+
352
+ print("".join(["="] * 100))
353
+ print(f"[🍵] Average Matcha-TTS RTF: {np.mean(total_rtf):.4f} ± {np.std(total_rtf)}")
354
+ print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(total_rtf_w):.4f} ± {np.std(total_rtf_w)}")
355
+ print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!")
356
+
357
+
358
+ def unbatched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
359
+ total_rtf = []
360
+ total_rtf_w = []
361
+ for i, text in enumerate(texts):
362
+ i = i + 1
363
+ base_name = f"utterance_{i:03d}_speaker_{args.spk:03d}" if args.spk is not None else f"utterance_{i:03d}"
364
+
365
+ print("".join(["="] * 100))
366
+ text = text.strip()
367
+ text_processed = process_text(i, text, device)
368
+
369
+ print(f"[🍵] Whisking Matcha-T(ea)TS for: {i}")
370
+ start_t = dt.datetime.now()
371
+ output = model.synthesise(
372
+ text_processed["x"],
373
+ text_processed["x_lengths"],
374
+ n_timesteps=args.steps,
375
+ temperature=args.temperature,
376
+ spks=spk,
377
+ length_scale=args.speaking_rate,
378
+ )
379
+ output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
380
+ # RTF with HiFiGAN
381
+ t = (dt.datetime.now() - start_t).total_seconds()
382
+ rtf_w = t * 22050 / (output["waveform"].shape[-1])
383
+ print(f"[🍵-{i}] Matcha-TTS RTF: {output['rtf']:.4f}")
384
+ print(f"[🍵-{i}] Matcha-TTS + VOCODER RTF: {rtf_w:.4f}")
385
+ total_rtf.append(output["rtf"])
386
+ total_rtf_w.append(rtf_w)
387
+
388
+ location = save_to_folder(base_name, output, args.output_folder)
389
+ print(f"[+] Waveform saved: {location}")
390
+
391
+ print("".join(["="] * 100))
392
+ print(f"[🍵] Average Matcha-TTS RTF: {np.mean(total_rtf):.4f} ± {np.std(total_rtf)}")
393
+ print(f"[🍵] Average Matcha-TTS + VOCODER RTF: {np.mean(total_rtf_w):.4f} ± {np.std(total_rtf_w)}")
394
+ print("[🍵] Enjoy the freshly whisked 🍵 Matcha-TTS!")
395
+
396
+
397
+ def print_config(args):
398
+ print("[!] Configurations: ")
399
+ print(f"\t- Model: {args.model}")
400
+ print(f"\t- Vocoder: {args.vocoder}")
401
+ print(f"\t- Temperature: {args.temperature}")
402
+ print(f"\t- Speaking rate: {args.speaking_rate}")
403
+ print(f"\t- Number of ODE steps: {args.steps}")
404
+ print(f"\t- Speaker: {args.spk}")
405
+
406
+
407
+ def get_device(args):
408
+ if torch.cuda.is_available() and not args.cpu:
409
+ print("[+] GPU Available! Using GPU")
410
+ device = torch.device("cuda")
411
+ else:
412
+ print("[-] GPU not available or forced CPU run! Using CPU")
413
+ device = torch.device("cpu")
414
+ return device
415
+
416
+
417
+ if __name__ == "__main__":
418
+ cli()
third_party/Matcha-TTS/matcha/hifigan/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Jungil Kong
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
third_party/Matcha-TTS/matcha/hifigan/meldataset.py ADDED
@@ -0,0 +1,217 @@
1
+ """ from https://github.com/jik876/hifi-gan """
2
+
3
+ import math
4
+ import os
5
+ import random
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data
10
+ from librosa.filters import mel as librosa_mel_fn
11
+ from librosa.util import normalize
12
+ from scipy.io.wavfile import read
13
+
14
+ MAX_WAV_VALUE = 32768.0
15
+
16
+
17
+ def load_wav(full_path):
18
+ sampling_rate, data = read(full_path)
19
+ return data, sampling_rate
20
+
21
+
22
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
23
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
24
+
25
+
26
+ def dynamic_range_decompression(x, C=1):
27
+ return np.exp(x) / C
28
+
29
+
30
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
31
+ return torch.log(torch.clamp(x, min=clip_val) * C)
32
+
33
+
34
+ def dynamic_range_decompression_torch(x, C=1):
35
+ return torch.exp(x) / C
36
+
37
+
38
+ def spectral_normalize_torch(magnitudes):
39
+ output = dynamic_range_compression_torch(magnitudes)
40
+ return output
41
+
42
+
43
+ def spectral_de_normalize_torch(magnitudes):
44
+ output = dynamic_range_decompression_torch(magnitudes)
45
+ return output
46
+
47
+
48
+ mel_basis = {}
49
+ hann_window = {}
50
+
51
+
52
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
53
+ if torch.min(y) < -1.0:
54
+ print("min value is ", torch.min(y))
55
+ if torch.max(y) > 1.0:
56
+ print("max value is ", torch.max(y))
57
+
58
+ global mel_basis, hann_window # pylint: disable=global-statement
59
+ if str(fmax) + "_" + str(y.device) not in mel_basis:  # the cache key includes the device, matching how it is stored and looked up below
60
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)  # keyword arguments keep this call compatible with newer librosa
61
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
62
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
63
+
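+ # Reflect-pad both ends by (n_fft - hop_size) / 2 so the STFT frame count stays aligned with len(y) / hop_size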
64
+ y = torch.nn.functional.pad(
65
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
66
+ )
67
+ y = y.squeeze(1)
68
+
69
+ spec = torch.view_as_real(
70
+ torch.stft(
71
+ y,
72
+ n_fft,
73
+ hop_length=hop_size,
74
+ win_length=win_size,
75
+ window=hann_window[str(y.device)],
76
+ center=center,
77
+ pad_mode="reflect",
78
+ normalized=False,
79
+ onesided=True,
80
+ return_complex=True,
81
+ )
82
+ )
83
+
84
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
85
+
86
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
87
+ spec = spectral_normalize_torch(spec)
88
+
89
+ return spec
+
+
+ def get_dataset_filelist(a):
+     with open(a.input_training_file, encoding="utf-8") as fi:
+         training_files = [
+             os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
+         ]
+
+     with open(a.input_validation_file, encoding="utf-8") as fi:
+         validation_files = [
+             os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav") for x in fi.read().split("\n") if len(x) > 0
+         ]
+     return training_files, validation_files
+
+
+ class MelDataset(torch.utils.data.Dataset):
+     def __init__(
+         self,
+         training_files,
+         segment_size,
+         n_fft,
+         num_mels,
+         hop_size,
+         win_size,
+         sampling_rate,
+         fmin,
+         fmax,
+         split=True,
+         shuffle=True,
+         n_cache_reuse=1,
+         device=None,
+         fmax_loss=None,
+         fine_tuning=False,
+         base_mels_path=None,
+     ):
+         self.audio_files = training_files
+         random.seed(1234)
+         if shuffle:
+             random.shuffle(self.audio_files)
+         self.segment_size = segment_size
+         self.sampling_rate = sampling_rate
+         self.split = split
+         self.n_fft = n_fft
+         self.num_mels = num_mels
+         self.hop_size = hop_size
+         self.win_size = win_size
+         self.fmin = fmin
+         self.fmax = fmax
+         self.fmax_loss = fmax_loss
+         self.cached_wav = None
+         self.n_cache_reuse = n_cache_reuse
+         self._cache_ref_count = 0
+         self.device = device
+         self.fine_tuning = fine_tuning
+         self.base_mels_path = base_mels_path
+
+     def __getitem__(self, index):
+         filename = self.audio_files[index]
+         if self._cache_ref_count == 0:
+             audio, sampling_rate = load_wav(filename)
+             audio = audio / MAX_WAV_VALUE
+             if not self.fine_tuning:
+                 audio = normalize(audio) * 0.95
+             self.cached_wav = audio
+             if sampling_rate != self.sampling_rate:
+                 raise ValueError(f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR")
+             self._cache_ref_count = self.n_cache_reuse
+         else:
+             audio = self.cached_wav
+             self._cache_ref_count -= 1
+
+         audio = torch.FloatTensor(audio)
+         audio = audio.unsqueeze(0)
+
+         if not self.fine_tuning:
+             if self.split:
+                 if audio.size(1) >= self.segment_size:
+                     max_audio_start = audio.size(1) - self.segment_size
+                     audio_start = random.randint(0, max_audio_start)
+                     audio = audio[:, audio_start : audio_start + self.segment_size]
+                 else:
+                     audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
+
+             mel = mel_spectrogram(
+                 audio,
+                 self.n_fft,
+                 self.num_mels,
+                 self.sampling_rate,
+                 self.hop_size,
+                 self.win_size,
+                 self.fmin,
+                 self.fmax,
+                 center=False,
+             )
+         else:
+             mel = np.load(
+                 os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + ".npy")
+             )
+             mel = torch.from_numpy(mel)
+
+             if len(mel.shape) < 3:
+                 mel = mel.unsqueeze(0)
+
+             if self.split:
+                 frames_per_seg = math.ceil(self.segment_size / self.hop_size)
+
+                 if audio.size(1) >= self.segment_size:
+                     mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
+                     mel = mel[:, :, mel_start : mel_start + frames_per_seg]
+                     audio = audio[:, mel_start * self.hop_size : (mel_start + frames_per_seg) * self.hop_size]
+                 else:
+                     mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), "constant")
+                     audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), "constant")
+
+         mel_loss = mel_spectrogram(
+             audio,
+             self.n_fft,
+             self.num_mels,
+             self.sampling_rate,
+             self.hop_size,
+             self.win_size,
+             self.fmin,
+             self.fmax_loss,
+             center=False,
+         )
+
+         return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
+
+     def __len__(self):
+         return len(self.audio_files)
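# Editorial usage sketch (not part of the vendored file): wiring MelDataset into a DataLoader.
# The file paths and the 22.05 kHz segment/FFT settings are assumptions for illustration,
# not values taken from this repository's configs; real .wav files are needed to iterate.
from torch.utils.data import DataLoader
from matcha.hifigan.meldataset import MelDataset

train_files = ["/path/to/wavs/utt_0001.wav", "/path/to/wavs/utt_0002.wav"]  # hypothetical paths
dataset = MelDataset(
    train_files, segment_size=8192, n_fft=1024, num_mels=80,
    hop_size=256, win_size=1024, sampling_rate=22050, fmin=0, fmax=8000,
    fmax_loss=None, fine_tuning=False,
)
loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4)
for mel, audio, filename, mel_loss in loader:
    ...  # mel: (B, 80, segment_size // hop_size), audio: (B, segment_size)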
third_party/Matcha-TTS/matcha/onnx/infer.py ADDED
@@ -0,0 +1,168 @@
+ import argparse
+ import os
+ import warnings
+ from pathlib import Path
+ from time import perf_counter
+
+ import numpy as np
+ import onnxruntime as ort
+ import soundfile as sf
+ import torch
+
+ from matcha.cli import plot_spectrogram_to_numpy, process_text
+
+
+ def validate_args(args):
+     assert (
+         args.text or args.file
+     ), "Either --text or --file must be provided; Matcha-T(ea)TTS needs some text to whisk the waveforms."
+     assert args.temperature >= 0, "Sampling temperature cannot be negative"
+     assert args.speaking_rate > 0, "Speaking rate must be greater than 0"
+     return args
+
+
+ def write_wavs(model, inputs, output_dir, external_vocoder=None):
+     if external_vocoder is None:
+         print("The provided model has the vocoder embedded in the graph.\nGenerating waveform directly")
+         t0 = perf_counter()
+         wavs, wav_lengths = model.run(None, inputs)
+         infer_secs = perf_counter() - t0
+         mel_infer_secs = vocoder_infer_secs = None
+     else:
+         print("[🍵] Generating mel using Matcha")
+         mel_t0 = perf_counter()
+         mels, mel_lengths = model.run(None, inputs)
+         mel_infer_secs = perf_counter() - mel_t0
+         print("Generating waveform from mel using external vocoder")
+         vocoder_inputs = {external_vocoder.get_inputs()[0].name: mels}
+         vocoder_t0 = perf_counter()
+         wavs = external_vocoder.run(None, vocoder_inputs)[0]
+         vocoder_infer_secs = perf_counter() - vocoder_t0
+         wavs = wavs.squeeze(1)
+         wav_lengths = mel_lengths * 256
+         infer_secs = mel_infer_secs + vocoder_infer_secs
+
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     for i, (wav, wav_length) in enumerate(zip(wavs, wav_lengths)):
+         output_filename = output_dir.joinpath(f"output_{i + 1}.wav")
+         audio = wav[:wav_length]
+         print(f"Writing audio to {output_filename}")
+         sf.write(output_filename, audio, 22050, "PCM_24")
+
+     wav_secs = wav_lengths.sum() / 22050
+     print(f"Inference seconds: {infer_secs}")
+     print(f"Generated wav seconds: {wav_secs}")
+     rtf = infer_secs / wav_secs
+     if mel_infer_secs is not None:
+         mel_rtf = mel_infer_secs / wav_secs
+         print(f"Matcha RTF: {mel_rtf}")
+     if vocoder_infer_secs is not None:
+         vocoder_rtf = vocoder_infer_secs / wav_secs
+         print(f"Vocoder RTF: {vocoder_rtf}")
+     print(f"Overall RTF: {rtf}")
+
+
+ def write_mels(model, inputs, output_dir):
+     t0 = perf_counter()
+     mels, mel_lengths = model.run(None, inputs)
+     infer_secs = perf_counter() - t0
+
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     for i, mel in enumerate(mels):
+         output_stem = output_dir.joinpath(f"output_{i + 1}")
+         plot_spectrogram_to_numpy(mel.squeeze(), output_stem.with_suffix(".png"))
+         np.save(output_stem.with_suffix(".npy"), mel)
+
+     wav_secs = (mel_lengths * 256).sum() / 22050
+     print(f"Inference seconds: {infer_secs}")
+     print(f"Generated wav seconds: {wav_secs}")
+     rtf = infer_secs / wav_secs
+     print(f"RTF: {rtf}")
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description=" 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching"
+     )
+     parser.add_argument(
+         "model",
+         type=str,
+         help="ONNX model to use",
+     )
+     parser.add_argument("--vocoder", type=str, default=None, help="Vocoder to use (defaults to None)")
+     parser.add_argument("--text", type=str, default=None, help="Text to synthesize")
+     parser.add_argument("--file", type=str, default=None, help="Text file to synthesize")
+     parser.add_argument("--spk", type=int, default=None, help="Speaker ID")
+     parser.add_argument(
+         "--temperature",
+         type=float,
+         default=0.667,
+         help="Variance of the x0 noise (default: 0.667)",
+     )
+     parser.add_argument(
+         "--speaking-rate",
+         type=float,
+         default=1.0,
+         help="change the speaking rate, a higher value means slower speaking rate (default: 1.0)",
+     )
+     parser.add_argument("--gpu", action="store_true", help="Use GPU for inference (default: CPU)")
+     parser.add_argument(
+         "--output-dir",
+         type=str,
+         default=os.getcwd(),
+         help="Output folder to save results (default: current dir)",
+     )
+
+     args = parser.parse_args()
+     args = validate_args(args)
+
+     if args.gpu:
+         providers = ["CUDAExecutionProvider"]
+     else:
+         providers = ["CPUExecutionProvider"]
+     model = ort.InferenceSession(args.model, providers=providers)
+
+     model_inputs = model.get_inputs()
+     model_outputs = list(model.get_outputs())
+
+     if args.text:
+         text_lines = args.text.splitlines()
+     else:
+         with open(args.file, encoding="utf-8") as file:
+             text_lines = file.read().splitlines()
+
+     processed_lines = [process_text(0, line, "cpu") for line in text_lines]
+     x = [line["x"].squeeze() for line in processed_lines]
+     # Pad to a common length so the batch can be stacked into a single array
+     x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
+     x = x.detach().cpu().numpy()
+     x_lengths = np.array([line["x_lengths"].item() for line in processed_lines], dtype=np.int64)
+     inputs = {
+         "x": x,
+         "x_lengths": x_lengths,
+         "scales": np.array([args.temperature, args.speaking_rate], dtype=np.float32),
+     }
+     is_multi_speaker = len(model_inputs) == 4
+     if is_multi_speaker:
+         if args.spk is None:
+             args.spk = 0
+             warn = "[!] Speaker ID not provided! Using speaker ID 0"
+             warnings.warn(warn, UserWarning)
+         inputs["spks"] = np.repeat(args.spk, x.shape[0]).astype(np.int64)
+
+     has_vocoder_embedded = model_outputs[0].name == "wav"
+     if has_vocoder_embedded:
+         write_wavs(model, inputs, args.output_dir)
+     elif args.vocoder:
+         external_vocoder = ort.InferenceSession(args.vocoder, providers=providers)
+         write_wavs(model, inputs, args.output_dir, external_vocoder=external_vocoder)
+     else:
+         warn = (
+             "[!] No vocoder is embedded in the graph and no external vocoder was provided; "
+             "the mel outputs will be written as numpy arrays to `*.npy` files in the output directory."
+         )
+         warnings.warn(warn, UserWarning)
+         write_mels(model, inputs, args.output_dir)
+
+
+ if __name__ == "__main__":
+     main()
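# Editorial usage sketch (not part of the vendored file): the same inputs that main() assembles
# can be fed to an exported Matcha ONNX graph directly. The model filename and the example
# sentence are assumptions; a single-speaker export (inputs x, x_lengths, scales) is assumed,
# and process_text is the helper imported from matcha.cli above.
import numpy as np
import onnxruntime as ort

from matcha.cli import process_text

session = ort.InferenceSession("matcha.onnx", providers=["CPUExecutionProvider"])  # hypothetical export
line = process_text(0, "Hello there, this is a test.", "cpu")
x = line["x"].detach().cpu().numpy()                        # (1, n_phonemes) int64 phoneme IDs
x_lengths = np.array([line["x_lengths"].item()], dtype=np.int64)
scales = np.array([0.667, 1.0], dtype=np.float32)           # [temperature, speaking_rate]
outputs = session.run(None, {"x": x, "x_lengths": x_lengths, "scales": scales})
print([o.shape for o in outputs])                           # mel (and wav, if the vocoder is embedded)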
third_party/Matcha-TTS/matcha/text/__init__.py ADDED
@@ -0,0 +1,53 @@
+ """ from https://github.com/keithito/tacotron """
+ from matcha.text import cleaners
+ from matcha.text.symbols import symbols
+
+ # Mappings from symbol to numeric ID and vice versa:
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}  # pylint: disable=unnecessary-comprehension
+
+
+ def text_to_sequence(text, cleaner_names):
+     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         text: string to convert to a sequence
+         cleaner_names: names of the cleaner functions to run the text through
+     Returns:
+         List of integers corresponding to the symbols in the text
+     """
+     sequence = []
+
+     clean_text = _clean_text(text, cleaner_names)
+     for symbol in clean_text:
+         symbol_id = _symbol_to_id[symbol]
+         sequence += [symbol_id]
+     return sequence
+
+
+ def cleaned_text_to_sequence(cleaned_text):
+     """Converts a string of already-cleaned text to a sequence of IDs corresponding to the symbols in the text.
+     Args:
+         cleaned_text: cleaned string to convert to a sequence
+     Returns:
+         List of integers corresponding to the symbols in the text
+     """
+     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
+     return sequence
+
+
+ def sequence_to_text(sequence):
+     """Converts a sequence of IDs back to a string"""
+     result = ""
+     for symbol_id in sequence:
+         s = _id_to_symbol[symbol_id]
+         result += s
+     return result
+
+
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name)
+         if not cleaner:
+             raise Exception("Unknown cleaner: %s" % name)
+         text = cleaner(text)
+     return text
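# Editorial usage sketch (not part of the vendored file): round-tripping text through the
# symbol table. The cleaner name "english_cleaners2" is an assumption about what is available
# in matcha.text.cleaners; substitute whichever cleaner the model was trained with.
from matcha.text import text_to_sequence, sequence_to_text

ids = text_to_sequence("Hello world.", ["english_cleaners2"])
print(ids)                    # list of integer symbol IDs (cleaned/phonemized, not raw characters)
print(sequence_to_text(ids))  # the cleaned string reconstructed from those IDs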