CosyVoice committed on
Commit
06934c3
1 Parent(s): d52358f

update vc code

Browse files
README.md CHANGED
@@ -71,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
71
  # SDK模型下载
72
  from modelscope import snapshot_download
73
  snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
 
74
  snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
75
  snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
76
  snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -80,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
80
  # git模型下载,请确保已安装git lfs
81
  mkdir -p pretrained_models
82
  git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
 
83
  git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
84
  git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
85
  git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
@@ -118,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
118
  for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
119
  torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
120
 
121
- cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
122
  # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
123
  prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
124
  for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -127,18 +129,16 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
127
  prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
128
  for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
129
  torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
130
-
131
- cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
132
- # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
133
- for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
134
- torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
135
-
136
- cosyvoice = CosyVoice('pretrained_models/CosyVoice-VC')
137
  # vc usage
138
  prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
139
  source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
140
  for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
141
  torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
 
 
 
 
 
142
  ```
143
 
144
  **Start web demo**
 
71
  # SDK模型下载
72
  from modelscope import snapshot_download
73
  snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
74
+ snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
75
  snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
76
  snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
77
  snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
 
81
  # git模型下载,请确保已安装git lfs
82
  mkdir -p pretrained_models
83
  git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
84
+ git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
85
  git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
86
  git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
87
  git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
 
120
  for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
121
  torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
122
 
123
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
124
  # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
125
  prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
126
  for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
 
129
  prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
130
  for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
131
  torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
 
 
 
 
 
 
 
132
  # vc usage
133
  prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
134
  source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
135
  for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
136
  torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
137
+
138
+ cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
139
+ # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
140
+ for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
141
+ torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
142
  ```
143
 
144
  **Start web demo**
cosyvoice/cli/cosyvoice.py CHANGED
@@ -25,7 +25,6 @@ class CosyVoice:
25
 
26
  def __init__(self, model_dir, load_jit=True, load_onnx=False):
27
  instruct = True if '-Instruct' in model_dir else False
28
- vc = True if '-VC' in model_dir else False
29
  self.model_dir = model_dir
30
  if not os.path.exists(model_dir):
31
  model_dir = snapshot_download(model_dir)
@@ -37,7 +36,6 @@ class CosyVoice:
37
  '{}/speech_tokenizer_v1.onnx'.format(model_dir),
38
  '{}/spk2info.pt'.format(model_dir),
39
  instruct,
40
- vc,
41
  configs['allowed_special'])
42
  self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
43
  self.model.load('{}/llm.pt'.format(model_dir),
 
25
 
26
  def __init__(self, model_dir, load_jit=True, load_onnx=False):
27
  instruct = True if '-Instruct' in model_dir else False
 
28
  self.model_dir = model_dir
29
  if not os.path.exists(model_dir):
30
  model_dir = snapshot_download(model_dir)
 
36
  '{}/speech_tokenizer_v1.onnx'.format(model_dir),
37
  '{}/spk2info.pt'.format(model_dir),
38
  instruct,
 
39
  configs['allowed_special'])
40
  self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
41
  self.model.load('{}/llm.pt'.format(model_dir),
cosyvoice/cli/frontend.py CHANGED
@@ -42,7 +42,6 @@ class CosyVoiceFrontEnd:
42
  speech_tokenizer_model: str,
43
  spk2info: str = '',
44
  instruct: bool = False,
45
- vc: bool = False,
46
  allowed_special: str = 'all'):
47
  self.tokenizer = get_tokenizer()
48
  self.feat_extractor = feat_extractor
@@ -59,7 +58,6 @@ class CosyVoiceFrontEnd:
59
  else:
60
  self.spk2info = {}
61
  self.instruct = instruct
62
- self.vc = vc
63
  self.allowed_special = allowed_special
64
  self.inflect_parser = inflect.engine()
65
  self.use_ttsfrd = use_ttsfrd
 
42
  speech_tokenizer_model: str,
43
  spk2info: str = '',
44
  instruct: bool = False,
 
45
  allowed_special: str = 'all'):
46
  self.tokenizer = get_tokenizer()
47
  self.feat_extractor = feat_extractor
 
58
  else:
59
  self.spk2info = {}
60
  self.instruct = instruct
 
61
  self.allowed_special = allowed_special
62
  self.inflect_parser = inflect.engine()
63
  self.use_ttsfrd = use_ttsfrd
cosyvoice/cli/model.py CHANGED
@@ -54,10 +54,9 @@ class CosyVoiceModel:
54
  self.hift_cache_dict = {}
55
 
56
  def load(self, llm_model, flow_model, hift_model):
57
- if self.llm is not None:
58
- self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
59
- self.llm.to(self.device).eval()
60
- self.llm.half()
61
  self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
62
  self.flow.to(self.device).eval()
63
  self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
 
54
  self.hift_cache_dict = {}
55
 
56
  def load(self, llm_model, flow_model, hift_model):
57
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
58
+ self.llm.to(self.device).eval()
59
+ self.llm.half()
 
60
  self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
61
  self.flow.to(self.device).eval()
62
  self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))