iflamed committed
Commit 3513376
Parents: 3bc37ed 0fd15bb

Merge branch 'FunAudioLLM:main' into fastapi

cosyvoice/cli/frontend.py CHANGED
@@ -114,7 +114,10 @@ class CosyVoiceFrontEnd:
                                                 token_min_n=60, merge_len=20,
                                                 comma_split=False)]
         else:
-            text = self.en_tn_model.normalize(text)
+            if self.use_ttsfrd:
+                text = self.frd.get_frd_extra_info(text, 'input')
+            else:
+                text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
             texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                 token_min_n=60, merge_len=20,
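
Note: when the frontend was initialized with ttsfrd support, English text normalization is now delegated to the ttsfrd resource instead of en_tn_model. A minimal sketch of the dispatch, with use_ttsfrd, frd, and en_tn_model as stand-ins for the frontend's attributes of the same names:

    def normalize_en(text, use_ttsfrd, frd, en_tn_model):
        # Prefer the ttsfrd resource when available; 'input' is the same
        # mode string the patch passes.
        if use_ttsfrd:
            return frd.get_frd_extra_info(text, 'input')
        # Otherwise fall back to the original English normalizer.
        return en_tn_model.normalize(text)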
cosyvoice/cli/model.py CHANGED
@@ -56,4 +56,5 @@ class CosyVoiceModel:
                                         prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                         embedding=flow_embedding.to(self.device))
         tts_speech = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
         return {'tts_speech': tts_speech}
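
Note: torch.cuda.empty_cache() returns cached allocator blocks to the GPU driver after each synthesis call; it does not free tensors that are still referenced, so the returned tts_speech (already moved to CPU) is unaffected. As written the call assumes a CUDA device; a guard like the one below (my suggestion, not part of the patch) keeps CPU-only runs clean:

    import torch

    # Hypothetical guard: skip the cache flush when CUDA is absent.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()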
cosyvoice/dataset/processor.py CHANGED
@@ -167,7 +167,7 @@ def parse_embedding(data, normalize, mode='train'):
     """
     for sample in data:
         sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
-        sample['spk_embedding'] = torch.stack([torch.tensor(i, dtype=torch.float32) for i in sample['spk_embedding']], dim=0).mean(dim=0)
+        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
         if normalize:
             sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
             sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
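
Note: this pairs with the tools/extract_embedding.py hunk at the end of this commit: the per-speaker mean is now computed once at extraction time, so parse_embedding only converts an already-averaged vector. A toy equivalence check (dimensions invented for illustration):

    import torch

    # Old behaviour: average a list of per-utterance embeddings on the fly.
    vecs = [[0.1, 0.2], [0.3, 0.4]]
    old = torch.stack([torch.tensor(v, dtype=torch.float32) for v in vecs], dim=0).mean(dim=0)

    # New behaviour: the stored value is already the mean, so one conversion suffices.
    new = torch.tensor([0.2, 0.3], dtype=torch.float32)
    assert torch.allclose(old, new)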
cosyvoice/flow/flow.py CHANGED
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # xvec projection
         embedding = F.normalize(embedding, dim=1)
cosyvoice/llm/llm.py CHANGED
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
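
Note: together with the identical rename in flow.py above, both forward passes now read a generic batch['embedding'] key instead of hard-coding batch['utt_embedding']; the executor hunk below decides what that key holds. A sketch of the resulting contract (the 192-dim size is illustrative, not taken from the patch):

    import torch
    import torch.nn.functional as F

    batch = {'embedding': torch.randn(4, 192)}   # filled in by the training loop
    embedding = batch['embedding']               # was batch['utt_embedding']
    embedding = F.normalize(embedding, dim=1)    # unchanged xvec normalization step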
cosyvoice/utils/executor.py CHANGED
@@ -52,6 +52,10 @@ class Executor:
                 info_dict["batch_idx"] = batch_idx
                 if cosyvoice_join(group_join, info_dict):
                     break
+                if info_dict["use_spk_embedding"] is True:
+                    batch_dict["embedding"] = batch_dict["spk_embedding"]
+                else:
+                    batch_dict["embedding"] = batch_dict["utt_embedding"]
 
                 # Disable gradient synchronizations across DDP processes.
                 # Within this context, gradients will be accumulated on module
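
Note: this is where the new key gets populated: use_spk_embedding, carried from train_conf into info_dict, switches the whole run between per-utterance and per-speaker x-vectors. A self-contained sketch; the .get() default is a hypothetical guard of mine for configs written before this commit, whereas the patch indexes the key directly:

    import torch

    info_dict = {'use_spk_embedding': True}                # propagated from train_conf
    batch_dict = {'utt_embedding': torch.zeros(2, 192),    # 192 is illustrative
                  'spk_embedding': torch.ones(2, 192)}

    key = 'spk_embedding' if info_dict.get('use_spk_embedding', False) else 'utt_embedding'
    batch_dict['embedding'] = batch_dict[key]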
cosyvoice/utils/scheduler.py CHANGED
@@ -715,3 +715,25 @@ class NoamHoldAnnealing(WarmupHoldPolicy):
715
 
716
  def set_step(self, step: int):
717
  self.last_epoch = step
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
 
716
  def set_step(self, step: int):
717
  self.last_epoch = step
718
+
719
+
720
+ class ConstantLR(_LRScheduler):
721
+ """The ConstantLR scheduler
722
+
723
+ This scheduler keeps a constant lr
724
+
725
+ """
726
+
727
+ def __init__(
728
+ self,
729
+ optimizer: torch.optim.Optimizer,
730
+ ):
731
+ # __init__() must be invoked before setting field
732
+ # because step() is also invoked in __init__()
733
+ super().__init__(optimizer)
734
+
735
+ def get_lr(self):
736
+ return self.base_lrs
737
+
738
+ def set_step(self, step: int):
739
+ self.last_epoch = step
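
Usage sketch for the new class: because get_lr() just returns base_lrs, the optimizer's initial learning rate is used verbatim for the whole run, which matches the constantlr-plus-1e-5 sft recipe suggested in the config comments below:

    import torch
    from cosyvoice.utils.scheduler import ConstantLR  # the class added above

    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    scheduler = ConstantLR(optimizer)
    for _ in range(3):
        optimizer.step()
        scheduler.step()
    assert optimizer.param_groups[0]['lr'] == 1e-5   # never decays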
cosyvoice/utils/train_utils.py CHANGED
@@ -34,7 +34,7 @@ from torch.nn.utils import clip_grad_norm_
 from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
 
 from cosyvoice.dataset.dataset import Dataset
-from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing
+from cosyvoice.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR
 
 
 def init_distributed(args):
@@ -122,6 +122,9 @@ def init_optimizer_and_scheduler(args, configs, model):
     elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
         scheduler_type = NoamHoldAnnealing
         scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
+    elif configs['train_conf']['scheduler'] == 'constantlr':
+        scheduler_type = ConstantLR
+        scheduler = ConstantLR(optimizer)
     else:
         raise ValueError("unknown scheduler: " + configs['train_conf'])
 
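Note: the new branch dispatches on the literal string 'constantlr' and, unlike the other two schedulers, passes no scheduler_conf, since ConstantLR takes only the optimizer. One pre-existing wart in the context lines: the else branch concatenates the whole configs['train_conf'] dict onto a string, which would itself raise TypeError; configs['train_conf']['scheduler'] is presumably what was meant. A minimal sketch of the dispatch:

    import torch
    from cosyvoice.utils.scheduler import ConstantLR

    configs = {'train_conf': {'scheduler': 'constantlr'}}
    optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=1e-5)
    if configs['train_conf']['scheduler'] == 'constantlr':
        scheduler = ConstantLR(optimizer)    # no **scheduler_conf needed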
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml CHANGED
@@ -190,6 +190,7 @@ train_conf:
   scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
+  use_spk_embedding: False # change to True during sft
   max_epoch: 200
   grad_clip: 5
   accum_grad: 2
examples/libritts/cosyvoice/conf/cosyvoice.yaml CHANGED
@@ -186,10 +186,11 @@ data_pipeline: [
 train_conf:
   optim: adam
   optim_conf:
-    lr: 0.001
-  scheduler: warmuplr
+    lr: 0.001 # change to 1e-5 during sft
+  scheduler: warmuplr # change to constantlr during sft
   scheduler_conf:
     warmup_steps: 2500
+  use_spk_embedding: False # change to True during sft
   max_epoch: 200
   grad_clip: 5
   accum_grad: 2
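
Note: the inline comments in both YAMLs describe a three-switch sft recipe. Sketched as the equivalent train_conf dict (illustrative only; at train time these files are parsed by the repo's YAML loader, not hand-built like this):

    # Pretraining -> sft: flip the three commented knobs.
    train_conf = {
        'optim': 'adam',
        'optim_conf': {'lr': 1e-5},                # was 0.001
        'scheduler': 'constantlr',                 # was warmuplr
        'scheduler_conf': {'warmup_steps': 2500},  # ignored by constantlr
        'use_spk_embedding': True,                 # was False
        'max_epoch': 200,
        'grad_clip': 5,
        'accum_grad': 2,
    }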
tools/extract_embedding.py CHANGED
@@ -53,6 +53,8 @@ def main(args):
         if spk not in spk2embedding:
             spk2embedding[spk] = []
         spk2embedding[spk].append(embedding)
+    for k, v in spk2embedding.items():
+        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()
 
     torch.save(utt2embedding, '{}/utt2embedding.pt'.format(args.dir))
     torch.save(spk2embedding, '{}/spk2embedding.pt'.format(args.dir))
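
Note: this is the other half of the processor.py change: each speaker's list of utterance embeddings is collapsed to its mean before spk2embedding.pt is written. A toy check of the step (values invented):

    import torch

    spk2embedding = {'spk1': [[0.0, 2.0], [2.0, 0.0]]}
    for k, v in spk2embedding.items():
        # Stack the per-utterance vectors and keep only their mean.
        spk2embedding[k] = torch.tensor(v).mean(dim=0).tolist()
    assert spk2embedding['spk1'] == [1.0, 1.0]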