Spaces:
Runtime error
Runtime error
wenmengzhou
committed on
Commit
•
83a5954
1
Parent(s):
98dc562
change model repo to hf (#1)
Browse files
- change model repo to hf (b75789cb2b05e8dae7a653352e8427c4a692d212)
- Update cosyvoice/cli/cosyvoice.py (589560d7d413aa9c393ddb1b8e1fe247234d4f5e)
- update model according to hysts advice (5297156ee58ebd4b7665c23e4a129b6628605bea)
- cosyvoice/cli/cosyvoice.py +2 -2
- cosyvoice/cli/model.py +4 -6
- css/utils.py +3 -3
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -14,7 +14,7 @@
|
|
14 |
import os
|
15 |
import torch
|
16 |
from hyperpyyaml import load_hyperpyyaml
|
17 |
-
from
|
18 |
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
|
19 |
from cosyvoice.cli.model import CosyVoiceModel
|
20 |
|
@@ -24,7 +24,7 @@ class CosyVoice:
|
|
24 |
instruct = True if '-Instruct' in model_dir else False
|
25 |
self.model_dir = model_dir
|
26 |
if not os.path.exists(model_dir):
|
27 |
-
model_dir = snapshot_download(model_dir)
|
28 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
29 |
configs = load_hyperpyyaml(f)
|
30 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
|
|
14 |
import os
|
15 |
import torch
|
16 |
from hyperpyyaml import load_hyperpyyaml
|
17 |
+
from huggingface_hub import snapshot_download
|
18 |
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
|
19 |
from cosyvoice.cli.model import CosyVoiceModel
|
20 |
|
|
|
24 |
instruct = True if '-Instruct' in model_dir else False
|
25 |
self.model_dir = model_dir
|
26 |
if not os.path.exists(model_dir):
|
27 |
+
model_dir = snapshot_download(model_dir, local_dir=model_dir)
|
28 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
29 |
configs = load_hyperpyyaml(f)
|
30 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
cosyvoice/cli/model.py
CHANGED
@@ -19,18 +19,17 @@ class CosyVoiceModel:
|
|
19 |
llm: torch.nn.Module,
|
20 |
flow: torch.nn.Module,
|
21 |
hift: torch.nn.Module):
|
22 |
-
|
23 |
-
self.device = 'cpu'
|
24 |
self.llm = llm
|
25 |
self.flow = flow
|
26 |
self.hift = hift
|
27 |
|
28 |
def load(self, llm_model, flow_model, hift_model):
|
29 |
-
self.llm.load_state_dict(torch.load(llm_model, map_location=
|
30 |
self.llm.to(self.device).eval()
|
31 |
-
self.flow.load_state_dict(torch.load(flow_model, map_location=
|
32 |
self.flow.to(self.device).eval()
|
33 |
-
self.hift.load_state_dict(torch.load(hift_model, map_location=
|
34 |
self.hift.to(self.device).eval()
|
35 |
|
36 |
def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
|
@@ -38,7 +37,6 @@ class CosyVoiceModel:
|
|
38 |
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
|
39 |
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
|
40 |
prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
|
41 |
-
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
42 |
tts_speech_token = self.llm.inference(text=text.to(self.device),
|
43 |
text_len=text_len.to(self.device),
|
44 |
prompt_text=prompt_text.to(self.device),
|
|
|
19 |
llm: torch.nn.Module,
|
20 |
flow: torch.nn.Module,
|
21 |
hift: torch.nn.Module):
|
22 |
+
self.device = torch.device('cuda')
|
|
|
23 |
self.llm = llm
|
24 |
self.flow = flow
|
25 |
self.hift = hift
|
26 |
|
27 |
def load(self, llm_model, flow_model, hift_model):
|
28 |
+
self.llm.load_state_dict(torch.load(llm_model, map_location='cpu'))
|
29 |
self.llm.to(self.device).eval()
|
30 |
+
self.flow.load_state_dict(torch.load(flow_model, map_location='cpu'))
|
31 |
self.flow.to(self.device).eval()
|
32 |
+
self.hift.load_state_dict(torch.load(hift_model, map_location='cpu'))
|
33 |
self.hift.to(self.device).eval()
|
34 |
|
35 |
def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
|
|
|
37 |
llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
|
38 |
flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
|
39 |
prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
|
|
|
40 |
tts_speech_token = self.llm.inference(text=text.to(self.device),
|
41 |
text_len=text_len.to(self.device),
|
42 |
prompt_text=prompt_text.to(self.device),
|
css/utils.py
CHANGED
@@ -13,9 +13,9 @@ from cosyvoice.utils.file_utils import load_wav
|
|
13 |
|
14 |
from cosyvoice.cli.cosyvoice import CosyVoice
|
15 |
|
16 |
-
cosyvoice= CosyVoice('
|
17 |
-
cosyvoice_sft= CosyVoice('
|
18 |
-
cosyvoice_instruct= CosyVoice('
|
19 |
|
20 |
example_tts_text = ["我们走的每一步,都是我们策略的一部分;你看到的所有一切,包括我此刻与你交谈,所做的一切,所说的每一句话,都有深远的含义。",
|
21 |
"那位喜剧演员真有才,[laughter]一开口就让全场观众爆笑。",
|
|
|
13 |
|
14 |
from cosyvoice.cli.cosyvoice import CosyVoice
|
15 |
|
16 |
+
cosyvoice= CosyVoice('FunAudioLLM/CosyVoice-300M')
|
17 |
+
cosyvoice_sft= CosyVoice('FunAudioLLM/CosyVoice-300M-SFT')
|
18 |
+
cosyvoice_instruct= CosyVoice('FunAudioLLM/CosyVoice-300M-Instruct')
|
19 |
|
20 |
example_tts_text = ["我们走的每一步,都是我们策略的一部分;你看到的所有一切,包括我此刻与你交谈,所做的一切,所说的每一句话,都有深远的含义。",
|
21 |
"那位喜剧演员真有才,[laughter]一开口就让全场观众爆笑。",
|