Commit: 开始部署 ("Start deployment")

Files changed:
- app.py (+23 -21)
- common_utils/utils4infer.py (+3 -3)
- tts/cosyvoice/cli/cosyvoice.py (+15 -26)
- tts/cosyvoice/cli/model.py (+6 -11)
app.py CHANGED

@@ -64,7 +64,7 @@ loaded_models = {
 }
 print("\n所有模型已加载完毕。")
 
-cosyvoice = CosyVoice(cosyvoice_model_path
+cosyvoice = CosyVoice(cosyvoice_model_path)
 
 # 将图片转换为 Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:
@@ -110,9 +110,9 @@ for item in prompt_audio_choices:
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
 
 
-
+@spaces.GPU
 def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -123,9 +123,9 @@ def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -135,9 +135,9 @@ def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     end_time = time.time()
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
-
+@spaces.GPU
 def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -148,9 +148,9 @@ def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     res_tensor = model.generate_tts(device=device, text=text_for_tts, )[0]
@@ -161,9 +161,9 @@ def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
     print(f"T2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_t2t(model, question_txt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     if is_npu: torch_npu.npu.synchronize()
     start_time = time.time()
     print(f'开始t2t推理, question_txt: {question_txt}')
@@ -173,9 +173,9 @@ def do_t2t(model, question_txt, profile=False): # 增加 model 参数
     print(f"T2T 推理消耗时间: {end_time - start_time:.2f} 秒")
     return res_text
 
-
+@spaces.GPU
 def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -186,8 +186,9 @@ def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
+@spaces.GPU
 def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
-    model.eval()
+    model.eval().cuda()
     feat, feat_lens = get_feat_from_wav_path(input_wav_path)
     print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
     if is_npu: torch_npu.npu.synchronize()
@@ -199,6 +200,14 @@ def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
     return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
 
 @spaces.GPU
+def get_wav_from_token_list(input_list, prompt_speech):
+    cosyvoice.eval().cuda()
+    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    wav_path = f"./tmp/{time_str}.wav"
+    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
+
+
+
 def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt): # 增加 model 和 tokenizer 参数
     print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
     if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
@@ -254,13 +263,6 @@ def save_to_jsonl(if_correct, wav, prompt, res):
 def download_audio(input_wav_path):
     return input_wav_path if input_wav_path else None
 
-
-def get_wav_from_token_list(input_list, prompt_speech):
-    time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    wav_path = f"./tmp/{time_str}.wav"
-    return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
-
-
 # --- Gradio 界面 ---
 with gr.Blocks() as demo:
     gr.Markdown(
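Taken together, the app.py changes move the demo onto Hugging Face ZeroGPU semantics: no CUDA device exists at import time, and a GPU is attached only while a function decorated with `@spaces.GPU` is executing. That is why each `do_*` entry point gains the decorator, and why `model.eval()` becomes `model.eval().cuda()` inside the call rather than at load time. A minimal sketch of the pattern, assuming a generic torch module in place of this repo's models:

```python
import spaces
import torch

# Built on CPU at import time; on ZeroGPU a CUDA device is attached
# only while a @spaces.GPU-decorated function is running.
model = torch.nn.Linear(16, 4)

@spaces.GPU
def predict(values: list[float]) -> list[float]:
    model.eval().cuda()  # move to the just-attached GPU, as app.py does
    x = torch.tensor(values).cuda().unsqueeze(0)
    with torch.no_grad():
        y = model(x)
    return y.squeeze(0).cpu().tolist()
```

The decorator is documented as a no-op on non-ZeroGPU hardware, so the same code also runs on an ordinary GPU Space.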
common_utils/utils4infer.py CHANGED

@@ -16,7 +16,7 @@ import torchaudio
 
 
 
-def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=t
+def load_model_and_tokenizer(checkpoint_path, config_path):
     """
     封装了加载模型和分词器的逻辑
    Args:
@@ -31,10 +31,10 @@ def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=t
     args = GxlNode({"checkpoint": checkpoint_path})
     configs = utils_file.load_dict_from_yaml(config_path)
     model, configs = init_model(args, configs)
-    model = model.to(device)
+    model = model.to(torch.bfloat16)
     model.eval()  # 设置为评估模式
     tokenizer = init_tokenizer(configs)
-    print(f"模型 {checkpoint_path}
+    print(f"模型 {checkpoint_path} ")
     return model, tokenizer
 
 def token_list2wav(token_list, prompt_speech, wav_path, cosyvoice):
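The functional change here is that the loader no longer binds the model to a fixed device; instead it casts the weights to bfloat16 on the CPU, leaving the device move to the `@spaces.GPU` functions above. A small illustration of what `model.to(torch.bfloat16)` does, on a generic module:

```python
import torch

model = torch.nn.Linear(1024, 1024)
print(next(model.parameters()).dtype)  # torch.float32

# .to(dtype) casts every parameter and buffer, halving memory versus
# float32 while keeping float32's exponent range.
model = model.to(torch.bfloat16)
print(next(model.parameters()).dtype)  # torch.bfloat16
```

The cast composes with the later `.cuda()` call, since both return the module itself.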
tts/cosyvoice/cli/cosyvoice.py CHANGED

@@ -18,15 +18,16 @@ from tqdm import tqdm
 from hyperpyyaml import load_hyperpyyaml
 from modelscope import snapshot_download
 import torch
-from cosyvoice.cli.frontend import CosyVoiceFrontEnd
-from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
-from cosyvoice.utils.file_utils import logging
-from cosyvoice.utils.class_utils import get_model_type
+from tts.cosyvoice.cli.frontend import CosyVoiceFrontEnd
+from tts.cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
+from tts.cosyvoice.utils.file_utils import logging
+from tts.cosyvoice.utils.class_utils import get_model_type
 
 
-class CosyVoice:
+class CosyVoice(torch.nn.Module):
 
-    def __init__(self, model_dir, gpu_id
+    def __init__(self, model_dir, gpu_id=-1, load_jit=False, load_trt=False, fp16=False):
+        super(CosyVoice, self).__init__()
         self.instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         self.fp16 = fp16
@@ -35,29 +36,17 @@ class CosyVoice:
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
         assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
-        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
-                                          configs['feat_extractor'],
-                                          '{}/campplus.onnx'.format(model_dir),
-                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
-                                          '{}/spk2info.pt'.format(model_dir),
-                                          configs['allowed_special'],
-                                          gpu_id=gpu_id)
+        # self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+        #                                   configs['feat_extractor'],
+        #                                   '{}/campplus.onnx'.format(model_dir),
+        #                                   '{}/speech_tokenizer_v1.onnx'.format(model_dir),
+        #                                   '{}/spk2info.pt'.format(model_dir),
+        #                                   configs['allowed_special'],
+        #                                   gpu_id=gpu_id)
         self.sample_rate = configs['sample_rate']
-        if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
-            load_jit, load_trt, fp16 = False, False, False
-            logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, gpu_id=gpu_id)
-        self.model.load('{}/llm.pt'.format(model_dir),
-                        '{}/flow.pt'.format(model_dir),
+        self.model.load('{}/flow.pt'.format(model_dir),
                         '{}/hift.pt'.format(model_dir))
-        if load_jit:
-            self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
-        if load_trt:
-            self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
-                                '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
-                                self.fp16)
         del configs
 
     def list_available_spks(self):
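Making `CosyVoice` an `nn.Module` subclass is what allows app.py's `get_wav_from_token_list` to call `cosyvoice.eval().cuda()`: `nn.Module` supplies both methods, and they recurse into attributes that are themselves `nn.Module` instances. A minimal sketch of that registration behavior, with toy submodules; note the recursion does not reach plain-object attributes:

```python
import torch

class Wrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.llm = torch.nn.Linear(8, 8)  # registered child: .eval()/.cuda() reach it
        self.extra = object()             # plain attribute: not registered

w = Wrapper()
w.eval()  # recursively sets every registered child to eval mode
print([name for name, _ in w.named_children()])  # ['llm']
```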
tts/cosyvoice/cli/model.py CHANGED

@@ -37,10 +37,6 @@ class CosyVoiceModel:
                  hift: torch.nn.Module,
                  fp16: bool,
                  gpu_id: int = 0):
-        if is_npu:
-            self.device = torch.device(f'npu:{gpu_id}')
-        else:
-            self.device = torch.device(f'cuda:{gpu_id}')
         self.llm = llm
         self.flow = flow
         self.hift = hift
@@ -75,15 +71,14 @@ class CosyVoiceModel:
         self.flow_cache_dict = {}
         self.hift_cache_dict = {}
 
-    def load(self, llm_model, flow_model, hift_model):
-        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
-        self.llm.to(self.device).eval()
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
-        self.flow.to(self.device).eval()
-        # in case hift_model is a hifigan model
+    def load(self, flow_model, hift_model, llm_model=None):
+        if llm_model is not None:
+            self.llm.load_state_dict(torch.load(llm_model, map_location="cpu"), strict=True)
+        self.flow.load_state_dict(torch.load(flow_model, map_location="cpu"), strict=True)
+        self.flow.eval()
         hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
         self.hift.load_state_dict(hift_state_dict, strict=True)
-        self.hift.to(self.device).eval()
+        self.hift.eval()
 
     def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
         llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)