Commit: 开始部署 ("Start deployment")

Files changed:
- app.py (+23 -21)
- common_utils/utils4infer.py (+3 -3)
- tts/cosyvoice/cli/cosyvoice.py (+15 -26)
- tts/cosyvoice/cli/model.py (+6 -11)
app.py CHANGED

@@ -64,7 +64,7 @@ loaded_models = {
 }
 print("\n所有模型已加载完毕。")
 
-cosyvoice = CosyVoice(cosyvoice_model_path
+cosyvoice = CosyVoice(cosyvoice_model_path)
 
 # 将图片转换为 Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:
@@ -110,9 +110,9 @@ for item in prompt_audio_choices:
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
 
 
-
+@spaces.GPU
 def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -123,9 +123,9 @@ def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -135,9 +135,9 @@ def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     end_time = time.time()
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
-
+@spaces.GPU
 def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -148,9 +148,9 @@ def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     res_tensor = model.generate_tts(device=device, text=text_for_tts, )[0]
@@ -161,9 +161,9 @@ def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
     print(f"T2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2t(model, question_txt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     print(f'开始t2t推理, question_txt: {question_txt}')
@@ -173,9 +173,9 @@ def do_t2t(model, question_txt, profile=False): # 增加 model 参数
     print(f"T2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -186,8 +186,9 @@ def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
+@spaces.GPU
 def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -199,6 +200,14 @@ def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
 @spaces.GPU
+def get_wav_from_token_list(input_list, prompt_speech):
+    cosyvoice.eval().cuda()
+    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    wav_path = f"./tmp/{time_str}.wav"
+    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
+
+
+
 def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt): # 增加 model 和 tokenizer 参数
     print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
     if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
@@ -254,13 +263,6 @@ def save_to_jsonl(if_correct, wav, prompt, res):
 def download_audio(input_wav_path):
     return input_wav_path if input_wav_path else None
 
-
-def get_wav_from_token_list(input_list, prompt_speech):
-    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    wav_path = f"./tmp/{time_str}.wav"
-    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
-
-
 # --- Gradio 界面 ---
 with gr.Blocks() as demo:
     gr.Markdown(
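Taken together, the app.py changes move the demo onto Hugging Face ZeroGPU semantics: no CUDA device exists at import time, and a GPU is attached only while a function decorated with `@spaces.GPU` is executing. That is why each `do_*` entry point gains the decorator, and why `model.eval()` becomes `model.eval().cuda()` inside the call rather than at load time. A minimal sketch of the pattern, assuming a generic torch module in place of this repo's models:

```python
import spaces
import torch

# Built on CPU at import time; on ZeroGPU a CUDA device is attached
# only while a @spaces.GPU-decorated function is running.
model = torch.nn.Linear(16, 4)

@spaces.GPU
def predict(values: list[float]) -> list[float]:
    model.eval().cuda()  # move to the just-attached GPU, as app.py does
    x = torch.tensor(values).cuda().unsqueeze(0)
    with torch.no_grad():
        y = model(x)
    return y.squeeze(0).cpu().tolist()
```

The decorator is documented as a no-op on non-ZeroGPU hardware, so the same code also runs on an ordinary GPU Space.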
common_utils/utils4infer.py CHANGED

@@ -16,7 +16,7 @@ import torchaudio
 
 
 
-def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=t
+def load_model_and_tokenizer(checkpoint_path, config_path):
     """
     封装了加载模型和分词器的逻辑
    Args:
@@ -31,10 +31,10 @@ def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=t
     args = GxlNode({"checkpoint": checkpoint_path})
     configs = utils_file.load_dict_from_yaml(config_path)
     model, configs = init_model(args, configs)
-    model = model.to(device)
+    model = model.to(torch.bfloat16)
     model.eval()  # 设置为评估模式
     tokenizer = init_tokenizer(configs)
-    print(f"模型 {checkpoint_path}
+    print(f"模型 {checkpoint_path} ")
     return model, tokenizer
 
 def token_list2wav(token_list, prompt_speech, wav_path, cosyvoice):
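The functional change here is that the loader no longer binds the model to a fixed device; instead it casts the weights to bfloat16 on the CPU, leaving the device move to the `@spaces.GPU` functions above. A small illustration of what `model.to(torch.bfloat16)` does, on a generic module:

```python
import torch

model = torch.nn.Linear(1024, 1024)
print(next(model.parameters()).dtype)  # torch.float32

# .to(dtype) casts every parameter and buffer, halving memory versus
# float32 while keeping float32's exponent range.
model = model.to(torch.bfloat16)
print(next(model.parameters()).dtype)  # torch.bfloat16
```

The cast composes with the later `.cuda()` call, since both return the module itself.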
tts/cosyvoice/cli/cosyvoice.py CHANGED

@@ -18,15 +18,16 @@ from tqdm import tqdm
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
-from cosyvoice.cli.frontend import CosyVoiceFrontEnd
-from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
-from cosyvoice.utils.file_utils import logging
-from cosyvoice.utils.class_utils import get_model_type
+from tts.cosyvoice.cli.frontend import CosyVoiceFrontEnd
+from tts.cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
+from tts.cosyvoice.utils.file_utils import logging
+from tts.cosyvoice.utils.class_utils import get_model_type
 
 
-class CosyVoice:
+class CosyVoice(torch.nn.Module):
 
-    def __init__(self, model_dir, gpu_id
+    def __init__(self, model_dir, gpu_id=-1, load_jit=False, load_trt=False, fp16=False):
+        super(CosyVoice, self).__init__()
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -35,29 +36,17 @@ class CosyVoice:
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
         assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
-        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
-                                          configs['feat_extractor'],
-                                          '{}/campplus.onnx'.format(model_dir),
-                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
-                                          '{}/spk2info.pt'.format(model_dir),
-                                          configs['allowed_special'],
-                                          gpu_id=gpu_id)
+        # self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+        #                                   configs['feat_extractor'],
+        #                                   '{}/campplus.onnx'.format(model_dir),
+        #                                   '{}/speech_tokenizer_v1.onnx'.format(model_dir),
+        #                                   '{}/spk2info.pt'.format(model_dir),
+        #                                   configs['allowed_special'],
+        #                                   gpu_id=gpu_id)
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, gpu_id=gpu_id)
-        self.model.load('{}/llm.pt'.format(model_dir),
-                        '{}/flow.pt'.format(model_dir),
+        self.model.load('{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
-        if load_jit:
-            self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
-        if load_trt:
-            self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
-                                self.fp16)
         del configs
 
     def list_available_spks(self):
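Making `CosyVoice` an `nn.Module` subclass is what allows app.py's `get_wav_from_token_list` to call `cosyvoice.eval().cuda()`: `nn.Module` supplies both methods, and they recurse into attributes that are themselves `nn.Module` instances. A minimal sketch of that registration behavior, with toy submodules; note the recursion does not reach plain-object attributes:

```python
import torch

class Wrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.llm = torch.nn.Linear(8, 8)  # registered child: .eval()/.cuda() reach it
        self.extra = object()             # plain attribute: not registered

w = Wrapper()
w.eval()  # recursively sets every registered child to eval mode
print([name for name, _ in w.named_children()])  # ['llm']
```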
tts/cosyvoice/cli/model.py CHANGED

@@ -37,10 +37,6 @@ class CosyVoiceModel:
                  hift: torch.nn.Module,
                  fp16: bool,
                  gpu_id: int = 0):
-        if is_npu:
-            self.device = torch.device(f'npu:{gpu_id}')
-        else:
-            self.device = torch.device(f'cuda:{gpu_id}')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -75,15 +71,14 @@ class CosyVoiceModel:
         self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def load(self, llm_model, flow_model, hift_model):
-        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
-        self.llm.to(self.device).eval()
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
-        self.flow.to(self.device).eval()
-        # in case hift_model is a hifigan model
+    def load(self, flow_model, hift_model, llm_model=None):
+        if llm_model is not None:
+            self.llm.load_state_dict(torch.load(llm_model, map_location="cpu"), strict=True)
+        self.flow.load_state_dict(torch.load(flow_model, map_location="cpu"), strict=True)
+        self.flow.eval()
         hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
         self.hift.load_state_dict(hift_state_dict, strict=True)
-        self.hift.to(self.device).eval()
+        self.hift.eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)