Merge branch 'FunAudioLLM:main' into fastapi
Files changed:
- cosyvoice/cli/frontend.py +4 -1
- cosyvoice/cli/model.py +1 -0
- cosyvoice/dataset/processor.py +1 -1
- cosyvoice/flow/flow.py +1 -1
- cosyvoice/llm/llm.py +1 -1
- cosyvoice/utils/executor.py +4 -0
- cosyvoice/utils/scheduler.py +22 -0
- cosyvoice/utils/train_utils.py +4 -1
- examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml +1 -0
- examples/libritts/cosyvoice/conf/cosyvoice.yaml +3 -2
- tools/extract_embedding.py +2 -0
cosyvoice/cli/frontend.py
CHANGED
@@ -114,7 +114,10 @@ class CosyVoiceFrontEnd:
                                                 token_min_n=60, merge_len=20,
                                                 comma_split=False)]
         else:
-            text = self.en_tn_model.normalize(text)
+            if self.use_ttsfrd:
+                text = self.frd.get_frd_extra_info(text, 'input')
+            else:
+                text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
             texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                 token_min_n=60, merge_len=20,
cosyvoice/cli/model.py
CHANGED
@@ -56,4 +56,5 @@ class CosyVoiceModel:
                                      prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                      embedding=flow_embedding.to(self.device))
         tts_speech = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
         return {'tts_speech': tts_speech}
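
The added torch.cuda.empty_cache() call returns PyTorch's cached, unused GPU blocks to the driver after each synthesis. A minimal sketch of the same pattern outside this repo (the model and variable names here are hypothetical, not from CosyVoice):

import torch

def synthesize(model, inputs):
    # run inference without building an autograd graph
    with torch.no_grad():
        mel = model(inputs)        # hypothetical model call
    speech = mel.cpu()             # move the result off the GPU
    # release cached CUDA memory so other processes can use it;
    # tensors that are still referenced are unaffected
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return speech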
cosyvoice/dataset/processor.py
CHANGED
@@ -167,7 +167,7 @@ def parse_embedding(data, normalize, mode='train'):
     """
     for sample in data:
         sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
-        sample['spk_embedding'] = torch.
+        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
         if normalize:
             sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
             sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
cosyvoice/flow/flow.py
CHANGED
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['
+        embedding = batch['embedding'].to(device)

         # xvec projection
         embedding = F.normalize(embedding, dim=1)
cosyvoice/llm/llm.py
CHANGED
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['
+        embedding = batch['embedding'].to(device)

         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
cosyvoice/utils/executor.py
CHANGED
@@ -52,6 +52,10 @@ class Executor:
            info_dict["batch_idx"] = batch_idx
            if cosyvoice_join(group_join, info_dict):
                break
+           if info_dict["use_spk_embedding"] is True:
+               batch_dict["embedding"] = batch_dict["spk_embedding"]
+           else:
+               batch_dict["embedding"] = batch_dict["utt_embedding"]

            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
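
The executor now decides which embedding the flow and LLM modules see in batch['embedding'], driven by the new use_spk_embedding flag in train_conf. A standalone sketch of that selection (the helper name and the 192-dim random x-vectors are made up for illustration):

import torch

def select_embedding(batch_dict, use_spk_embedding):
    # During sft, use_spk_embedding is True and the averaged per-speaker
    # embedding is used; otherwise each utterance keeps its own embedding.
    key = 'spk_embedding' if use_spk_embedding else 'utt_embedding'
    batch_dict['embedding'] = batch_dict[key]
    return batch_dict

# toy usage with random embeddings standing in for real x-vectors
batch = {'utt_embedding': torch.randn(4, 192), 'spk_embedding': torch.randn(4, 192)}
batch = select_embedding(batch, use_spk_embedding=False)
assert batch['embedding'].shape == (4, 192)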
cosyvoice/utils/scheduler.py
CHANGED
@@ -715,3 +715,25 @@ class NoamHoldAnnealing(WarmupHoldPolicy):

     def set_step(self, step: int):
         self.last_epoch = step
+
+
+class ConstantLR(_LRScheduler):
+    """The ConstantLR scheduler
+
+    This scheduler keeps a constant lr
+
+    """
+
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+    ):
+        # __init__() must be invoked before setting field
+        # because step() is also invoked in __init__()
+        super().__init__(optimizer)
+
+    def get_lr(self):
+        return self.base_lrs
+
+    def set_step(self, step: int):
+        self.last_epoch = step
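
The new ConstantLR simply returns the optimizer's base learning rates on every step, which is what the sft recipe below relies on. A quick sanity check, re-implementing the class from the hunk above so the snippet is self-contained, against a toy Adam optimizer:

import torch
from torch.optim.lr_scheduler import _LRScheduler

class ConstantLR(_LRScheduler):
    """Keeps the learning rate constant (mirrors the class added above)."""
    def __init__(self, optimizer: torch.optim.Optimizer):
        # __init__() must be invoked before setting fields because
        # step() is also invoked inside _LRScheduler.__init__()
        super().__init__(optimizer)

    def get_lr(self):
        return self.base_lrs

    def set_step(self, step: int):
        self.last_epoch = step

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1e-5)
scheduler = ConstantLR(optimizer)
for _ in range(100):
    optimizer.step()
    scheduler.step()
# unlike warmuplr, the learning rate never changes
assert optimizer.param_groups[0]['lr'] == 1e-5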
cosyvoice/utils/train_utils.py
CHANGED
@@ -34,7 +34,7 @@ from torch.nn.utils import clip_grad_norm_
 from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live

 from cosyvoice.dataset.dataset import Dataset
-from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing
+from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR


 def init_distributed(args):
@@ -122,6 +122,9 @@ def init_optimizer_and_scheduler(args, configs, model):
     elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
         scheduler_type = NoamHoldAnnealing
         scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+    elif configs['train_conf']['scheduler'] == 'constantlr':
+        scheduler_type = ConstantLR
+        scheduler = ConstantLR(optimizer)
     else:
         raise ValueError("unknown scheduler: " + configs['train_conf'])

examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr
     scheduler_conf:
         warmup_steps: 25000
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
examples/libritts/cosyvoice/conf/cosyvoice.yaml
CHANGED
@@ -186,10 +186,11 @@ data_pipeline: [
 train_conf:
     optim: adam
     optim_conf:
-        lr: 0.001
-    scheduler: warmuplr
+        lr: 0.001 # change to 1e-5 during sft
+    scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
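
Taken together with the train_utils.py change above, the config comments describe the intended sft setup: lr 1e-5, the constantlr scheduler, and speaker-level embeddings. A minimal sketch of how such a train_conf would drive scheduler selection, using a plain dict and a placeholder model (the repo itself reads these values from the YAML configs):

import torch
from cosyvoice.utils.scheduler import ConstantLR  # added in this PR

# what train_conf would look like during sft, after applying the comments above
train_conf = {
    'optim': 'adam',
    'optim_conf': {'lr': 1e-5},
    'scheduler': 'constantlr',
    'scheduler_conf': {'warmup_steps': 2500},
    'use_spk_embedding': True,
}

model = torch.nn.Linear(4, 4)  # placeholder model for illustration
optimizer = torch.optim.Adam(model.parameters(), **train_conf['optim_conf'])
if train_conf['scheduler'] == 'constantlr':
    # ConstantLR takes no scheduler_conf, matching init_optimizer_and_scheduler
    scheduler = ConstantLR(optimizer)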
tools/extract_embedding.py
CHANGED
@@ -53,6 +53,8 @@ def main(args):
         if spk not in spk2embedding:
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
+    for k, v in spk2embedding.items():
+        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()

     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))
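
With this change, spk2embedding.pt stores one averaged embedding per speaker instead of a list of per-utterance embeddings, which is what parse_embedding in processor.py now expects. A small self-contained illustration of the averaging step (random 192-dim vectors stand in for real x-vectors):

import torch

# per-speaker lists of utterance-level embeddings, as built in the loop above
spk2embedding = {
    'spk1': [torch.randn(192).tolist() for _ in range(3)],
    'spk2': [torch.randn(192).tolist() for _ in range(5)],
}

# collapse each list to a single mean embedding, as in the added lines
for k, v in spk2embedding.items():
    spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()

assert len(spk2embedding['spk1']) == 192  # one 192-dim vector per speaker now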