xlgeng committed
Commit daa8d34 · 1 Parent(s): cc344d5

Start deployment

app.py CHANGED
@@ -64,7 +64,7 @@ loaded_models = {
 }
 print("\n所有模型已加载完毕。")
 
-cosyvoice = CosyVoice(cosyvoice_model_path, gpu_id=0)
+cosyvoice = CosyVoice(cosyvoice_model_path)
 
 # 将图片转换为 Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:
@@ -110,9 +110,9 @@ for item in prompt_audio_choices:
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
 
 
-
+@spaces.GPU
 def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -123,9 +123,9 @@ def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model
     print(f"S2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -135,9 +135,9 @@ def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 m
     end_time = time.time()
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
-
+@spaces.GPU
 def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -148,9 +148,9 @@ def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): #
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     res_tensor = model.generate_tts(device=device, text=text_for_tts, )[0]
@@ -161,9 +161,9 @@ def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model
     print(f"T2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2t(model, question_txt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     print(f'开始t2t推理, question_txt: {question_txt}')
@@ -173,9 +173,9 @@ def do_t2t(model, question_txt, profile=False): # 增加 model 参数
     print(f"T2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -186,8 +186,9 @@ def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model
     print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
+@spaces.GPU
 def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -199,6 +200,14 @@ def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
 @spaces.GPU
+def get_wav_from_token_list(input_list, prompt_speech):
+    cosyvoice.eval().cuda()
+    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    wav_path = f"./tmp/{time_str}.wav"
+    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
+
+
+
 def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt): # 增加 model 和 tokenizer 参数
     print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
     if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
@@ -254,13 +263,6 @@ def save_to_jsonl(if_correct, wav, prompt, res):
 def download_audio(input_wav_path):
     return input_wav_path if input_wav_path else None
 
-
-def get_wav_from_token_list(input_list, prompt_speech):
-    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    wav_path = f"./tmp/{time_str}.wav"
-    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
-
-
 # --- Gradio 界面 ---
 with gr.Blocks() as demo:
     gr.Markdown(
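
Note on the pattern above: the Space follows the Hugging Face ZeroGPU convention, where models are loaded on CPU at startup and every inference entry point is wrapped in @spaces.GPU, which attaches a GPU only for the duration of the call; each function then moves its model to CUDA itself via model.eval().cuda(). A minimal sketch of that convention, assuming the `spaces` package that ZeroGPU Spaces provide (the model and function names here are illustrative, not from this repo):

    import torch
    import spaces  # available on Hugging Face ZeroGPU Spaces

    # Loaded on CPU at import time; no GPU exists yet in a ZeroGPU Space.
    model = torch.nn.Linear(16, 4).eval()

    @spaces.GPU  # a GPU is attached only while this function runs
    def infer(x: torch.Tensor) -> torch.Tensor:
        model.cuda()                      # move weights now that a GPU is attached
        with torch.no_grad():
            return model(x.cuda()).cpu()  # hand results back on CPU for Gradio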
common_utils/utils4infer.py CHANGED
@@ -16,7 +16,7 @@ import torchaudio
 
 
 
-def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=torch.device('cuda')):
+def load_model_and_tokenizer(checkpoint_path, config_path):
     """
     封装了加载模型和分词器的逻辑
     Args:
@@ -31,10 +31,10 @@ def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=torch.device('cuda')):
     args = GxlNode({"checkpoint": checkpoint_path})
     configs = utils_file.load_dict_from_yaml(config_path)
     model, configs = init_model(args, configs)
-    model = model.to(device).to(torch.bfloat16)
+    model = model.to(torch.bfloat16)
     model.eval() # 设置为评估模式
     tokenizer = init_tokenizer(configs)
-    print(f"模型 {checkpoint_path} 加载完成并移动到 {device}")
+    print(f"模型 {checkpoint_path} ")
     return model, tokenizer
 
 def token_list2wav(token_list, prompt_speech, wav_path, cosyvoice):
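
The load helper above now deliberately stops at the bfloat16 cast and leaves the model on CPU; device placement is deferred to the @spaces.GPU call sites in app.py. A sketch of this load-on-CPU, move-later idea (the helper name and the Linear stand-in are illustrative):

    import torch

    def load_on_cpu(checkpoint_path: str) -> torch.nn.Module:
        model = torch.nn.Linear(16, 4)                           # stand-in for init_model(...)
        state = torch.load(checkpoint_path, map_location="cpu")  # never touch the GPU here
        model.load_state_dict(state)
        return model.to(torch.bfloat16).eval()                   # cast + eval, no .cuda()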
tts/cosyvoice/cli/cosyvoice.py CHANGED
@@ -18,15 +18,16 @@ from tqdm import tqdm
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
-from cosyvoice.cli.frontend import CosyVoiceFrontEnd
-from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
-from cosyvoice.utils.file_utils import logging
-from cosyvoice.utils.class_utils import get_model_type
+from tts.cosyvoice.cli.frontend import CosyVoiceFrontEnd
+from tts.cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
+from tts.cosyvoice.utils.file_utils import logging
+from tts.cosyvoice.utils.class_utils import get_model_type
 
 
-class CosyVoice:
+class CosyVoice(torch.nn.Module):
 
-    def __init__(self, model_dir, gpu_id=0, load_jit=False, load_trt=False, fp16=False):
+    def __init__(self, model_dir, gpu_id=-1, load_jit=False, load_trt=False, fp16=False):
+        super(CosyVoice, self).__init__()
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -35,29 +36,17 @@ class CosyVoice:
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
         assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
-        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
-                                          configs['feat_extractor'],
-                                          '{}/campplus.onnx'.format(model_dir),
-                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
-                                          '{}/spk2info.pt'.format(model_dir),
-                                          configs['allowed_special'],
-                                          gpu_id=gpu_id)
+        # self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+        #                                   configs['feat_extractor'],
+        #                                   '{}/campplus.onnx'.format(model_dir),
+        #                                   '{}/speech_tokenizer_v1.onnx'.format(model_dir),
+        #                                   '{}/spk2info.pt'.format(model_dir),
+        #                                   configs['allowed_special'],
+        #                                   gpu_id=gpu_id)
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, gpu_id=gpu_id)
-        self.model.load('{}/llm.pt'.format(model_dir),
-                        '{}/flow.pt'.format(model_dir),
+        self.model.load('{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
-        if load_jit:
-            self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
-        if load_trt:
-            self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
-                                self.fp16)
         del configs
 
     def list_available_spks(self):
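
Subclassing torch.nn.Module (with the super().__init__() call) is what allows app.py to write cosyvoice.eval().cuda(): any attribute that is itself an nn.Module is registered as a submodule, so .eval() and .cuda() cascade into it. A small sketch of the mechanism (names illustrative):

    import torch

    class Wrapper(torch.nn.Module):
        def __init__(self):
            super().__init__()                 # must run before assigning submodules
            self.flow = torch.nn.Linear(8, 8)  # auto-registered as a submodule
            self.hift = torch.nn.Linear(8, 8)

    w = Wrapper().eval()  # cascades to flow and hift
    if torch.cuda.is_available():
        w.cuda()          # moves both registered submodules

One caveat: the cascade only reaches attributes that are nn.Module instances; a plain Python object that merely holds modules (as CosyVoiceModel does with llm/flow/hift) is not moved automatically.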
tts/cosyvoice/cli/model.py CHANGED
@@ -37,10 +37,6 @@ class CosyVoiceModel:
                  hift: torch.nn.Module,
                  fp16: bool,
                  gpu_id: int = 0):
-        if is_npu:
-            self.device = torch.device(f'npu:{gpu_id}')
-        else:
-            self.device = torch.device(f'cuda:{gpu_id}')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -75,15 +71,14 @@ class CosyVoiceModel:
         self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def load(self, llm_model, flow_model, hift_model):
-        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
-        self.llm.to(self.device).eval()
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
-        self.flow.to(self.device).eval()
-        # in case hift_model is a hifigan model
-        hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
+    def load(self, flow_model, hift_model, llm_model=None):
+        if llm_model is not None:
+            self.llm.load_state_dict(torch.load(llm_model, map_location="cpu"), strict=True)
+        self.flow.load_state_dict(torch.load(flow_model, map_location="cpu"), strict=True)
+        self.flow.eval()
+        hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location="cpu").items()}
         self.hift.load_state_dict(hift_state_dict, strict=True)
+        self.hift.eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
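
The reworked load() drops eager device placement entirely: every state dict is mapped to CPU and the llm checkpoint becomes optional, matching the deferred-GPU flow in app.py. A minimal sketch of optional, CPU-mapped checkpoint loading (function name is illustrative):

    from typing import Optional

    import torch

    def load_weights(module: torch.nn.Module, ckpt_path: Optional[str]) -> None:
        """Load a checkpoint onto CPU when a path is given; otherwise keep init weights."""
        if ckpt_path is not None:
            state = torch.load(ckpt_path, map_location="cpu")  # no GPU memory allocated
            module.load_state_dict(state, strict=True)
        module.eval()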