xlgeng committed on
Commit
841f290
·
1 Parent(s): 3efab1d

Begin deployment (开始部署)

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. app.py +414 -0
  2. common_utils/__init__.py +0 -0
  3. common_utils/convert_ckpt_dir_to_pt.py +27 -0
  4. common_utils/load_combine_type_yaml.py +59 -0
  5. common_utils/utils4infer.py +163 -0
  6. conf/ct_config.yaml +153 -0
  7. conf/ct_config_sft.yaml +152 -0
  8. conf/data_s2s.yaml +226 -0
  9. conf/data_s2t.yaml +402 -0
  10. conf/data_t2s.yaml +28 -0
  11. conf/data_t2t.yaml +159 -0
  12. conf/data_tmp.yaml +6 -0
  13. conf/ds_stage2.json +34 -0
  14. conf/empty.yaml +0 -0
  15. conf/prompt_config.yaml +0 -0
  16. conf/system_prompt.yaml +27 -0
  17. patches/cumstom_stop_criteria.py +85 -0
  18. patches/custom_speech_ngram_blocking.py +129 -0
  19. patches/custom_speech_repetition_penalty.py +22 -0
  20. patches/modelling_fm_infer_gpu.py +18 -0
  21. patches/modelling_qwen2_infer_gpu.py +416 -0
  22. patches/utils.py +4 -0
  23. requirements.txt +41 -0
  24. tts/__init__.py +0 -0
  25. tts/assert/实验室.png +0 -0
  26. tts/cosyvoice/__init__.py +0 -0
  27. tts/cosyvoice/bin/average_model.py +92 -0
  28. tts/cosyvoice/bin/export_jit.py +91 -0
  29. tts/cosyvoice/bin/export_onnx.py +116 -0
  30. tts/cosyvoice/bin/export_trt.sh +10 -0
  31. tts/cosyvoice/bin/inference.py +115 -0
  32. tts/cosyvoice/bin/train.py +170 -0
  33. tts/cosyvoice/cli/__init__.py +0 -0
  34. tts/cosyvoice/cli/cosyvoice.py +197 -0
  35. tts/cosyvoice/cli/frontend.py +240 -0
  36. tts/cosyvoice/cli/model.py +480 -0
  37. tts/cosyvoice/dataset/__init__.py +0 -0
  38. tts/cosyvoice/dataset/dataset.py +164 -0
  39. tts/cosyvoice/dataset/processor.py +435 -0
  40. tts/cosyvoice/flow/decoder.py +301 -0
  41. tts/cosyvoice/flow/flow.py +239 -0
  42. tts/cosyvoice/flow/flow_matching.py +217 -0
  43. tts/cosyvoice/flow/length_regulator.py +69 -0
  44. tts/cosyvoice/hifigan/discriminator.py +140 -0
  45. tts/cosyvoice/hifigan/f0_predictor.py +56 -0
  46. tts/cosyvoice/hifigan/generator.py +412 -0
  47. tts/cosyvoice/hifigan/hifigan.py +67 -0
  48. tts/cosyvoice/llm/llm.py +434 -0
  49. tts/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
  50. tts/cosyvoice/tokenizer/tokenizer.py +279 -0
app.py ADDED
@@ -0,0 +1,414 @@
1
+ import ast
2
+ import base64
3
+ import datetime
4
+ import json
5
+ import logging
6
+ import os
7
+ import spaces
8
+
9
+ import gradio as gr
10
+ import sys
11
+ import time
12
+ import traceback
13
+
14
+ import torch
15
+
16
+ from common_utils.utils4infer import get_feat_from_wav_path, load_model_and_tokenizer, token_list2wav
17
+
18
+ sys.path.insert(0, '.')
19
+ sys.path.insert(0, './tts')
20
+ sys.path.insert(0, './tts/third_party/Matcha-TTS')
21
+ from patches import modelling_qwen2_infer_gpu # 打patch
22
+ from tts.cosyvoice.cli.cosyvoice import CosyVoice
23
+ from tts.cosyvoice.utils.file_utils import load_wav
24
+
25
+ is_npu = False
26
+ try:
27
+ import torch_npu
28
+ except ImportError:
29
+ is_npu = False
30
+ print("torch_npu is not available. if you want to use npu, please install it.")
31
+
32
+
33
+
34
+
35
+ from huggingface_hub import hf_hub_download
36
+ # 从Hugging Face下载.pt文件
37
+ CHECKPOINT_PATH_A = hf_hub_download(repo_id="ASLP-lab/OSUM-EChat", filename="language_think_final.pt")
38
+ CHECKPOINT_PATH_B= hf_hub_download(repo_id="ASLP-lab/OSUM-EChat", filename="tag_think_final.pt")
39
+ CONFIG_PATH = "./conf/ct_config.yaml"
40
+ NAME_A="language_think"
41
+ NAME_B="tag_think"
42
+ cosyvoice_model_path = hf_hub_download(repo_id="ASLP-lab/OSUM-EChat", filename="CosyVoice-300M-25Hz.tar")
43
+ # 将tar包解压到当前目录
44
+ os.system(f"tar -xvf {cosyvoice_model_path}")
45
+ print("解压cosyvoice模型pt文件完成")
46
+ cosyvoice_model_path="./CosyVoice-300M-25Hz"
47
+
48
+
49
+
50
+ device = torch.device("cuda")
51
+ print("开始加载模型 A...")
52
+ model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
53
+
54
+ print("\n开始加载模型 B...")
55
+ if CHECKPOINT_PATH_B is not None:
56
+ model_b, tokenizer_b = load_model_and_tokenizer(CHECKPOINT_PATH_B, CONFIG_PATH)
57
+ else:
58
+ model_b, tokenizer_b = None, None
59
+ loaded_models = {
60
+ NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
61
+ NAME_B: {"model": model_b, "tokenizer": tokenizer_b},
62
+ } if model_b is not None else {
63
+ NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
64
+ }
65
+ print("\n所有模型已加载完毕。")
66
+
67
+ cosyvoice = CosyVoice(cosyvoice_model_path, gpu_id=0)
68
+
69
+ # 将图片转换为 Base64
70
+ with open("./tts/assert/实验室.png", "rb") as image_file:
71
+ encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
72
+
73
+ # 任务映射
74
+ TASK_PROMPT_MAPPING = {
75
+ "empathetic_s2s_dialogue with think": "THINK",
76
+ "empathetic_s2s_dialogue no think": "s2s_no_think",
77
+ "empathetic_s2t_dialogue with think": "s2t_think",
78
+ "empathetic_s2t_dialogue no think": "s2t_no_think",
79
+ "ASR (Automatic Speech Recognition)": "转录这段音频中的语音内容为文字。",
80
+ "SRWT (Speech Recognition with Timestamps)": "请识别音频内容,并对所有英文词和中文字进行时间对齐,标注格式为<>,时间精度0.1秒。",
81
+ "VED (Vocal Event Detection)(类别:laugh,cough,cry,screaming,sigh,throat clearing,sneeze,other)": "请将音频转化为文字,并在末尾添加相关音频事件标签,标签格式为<>。",
82
+ "SER (Speech Emotion Recognition)(类别:sad,anger,neutral,happy,surprise,fear,disgust,和other)": "请将音频内容转录成文字记录,并在记录末尾标注情感标签,以<>表示。",
83
+ "SSR (Speaking Style Recognition)(类别:新闻科普,恐怖故事,童话故事,客服,诗歌散文,有声书,日常口语,其他)": "请将音频中的讲话内容转化为文字,并在结尾处注明风格标签,用<>表示。",
84
+ "SGC (Speaker Gender Classification)(类别:female,male)": "请将音频转录为文字,并在文本末尾标注性别标签,标签格式为<>。",
85
+ "SAP (Speaker Age Prediction)(类别:child、adult和old)": "请将这段音频转录成文字,并在末尾加上年龄标签,格式为<>。",
86
+ "STTC (Speech to Text Chat)": "首先将语音转录为文字,然后对语音内容进行回复,转录和文字之间使用<开始回答>分割。",
87
+ "Only Age Prediction(类别:child、adult和old)": "请根据音频分析发言者的年龄并输出年龄标签,标签格式为<>。",
88
+ "Only Gender Classification(类别:female,male)": "根据下述音频内容判断说话者性别,返回性别标签,格式为<>.",
89
+ "Only Style Recognition(类别:新闻科普,恐怖故事,童话故事,客服,诗歌散文,有声书,日常口语,其他)": "对于以下音频,请直接判断风格并返回风格标签,标签格式为<>。",
90
+ "Only Emotion Recognition(类别:sad,anger,neutral,happy,surprise,fear,disgust,和other)": "请鉴别音频中的发言者情感并标出,标签格式为<>。",
91
+ "Only Event Detection(类别:laugh,cough,cry,screaming,sigh,throat clearing,sneeze,other)": "对音频进行标签化,返回音频事件标签,标签格式为<>。",
92
+ "ASR+AGE+GENDER": '请将这段音频进行转录,并在转录完成的文本末尾附加<年龄> <性别>标签。',
93
+ "AGE+GENDER": "请识别以下音频发言者的年龄和性别.",
94
+ "ASR+STYLE+AGE+GENDER": "请对以下音频内容进行转录,并在文本结尾分别���加<风格>、<年龄>、<性别>标签。",
95
+ "STYLE+AGE+GENDER": "请对以下音频进行分析,识别说话风格、说话者年龄和性别。",
96
+ "ASR with punctuations": "需对提供的语音文件执行文本转换,同时为转换结果补充必要的标点。",
97
+ "ASR EVENT AGE GENDER": "请将以下音频内容进行转录,并在转录完成的文本末尾分别附加<音频事件>、<年龄>、<性别>标签。",
98
+ "ASR EMOTION AGE GENDER": "请将下列音频内容进行转录,并在转录文本的末尾分别添加<情感>、<年龄>、<性别>标签。",
99
+ }
100
+ prompt_path = hf_hub_download(repo_id="ASLP-lab/OSUM-EChat", filename="prompt.wav")
101
+ prompt_audio_choices = [
102
+ {"name": "拟人",
103
+ "value": prompt_path},
104
+ ]
105
+
106
+ prompt_audio_cache = {}
107
+ for item in prompt_audio_choices:
108
+ prompt_audio_cache[item["value"]] = load_wav(item["value"], 22050)
109
+
110
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
111
+
112
+
113
+
114
+ def do_s2t(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
115
+ model.eval()
116
+ feat, feat_lens = get_feat_from_wav_path(input_wav_path)
117
+ print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
118
+ if is_npu: torch_npu.npu.synchronize()
119
+ start_time = time.time()
120
+ res_text = model.generate(wavs=feat, wavs_len=feat_lens, prompt=input_prompt, cache_implementation="static")[0]
121
+ if is_npu: torch_npu.npu.synchronize()
122
+ end_time = time.time()
123
+ print(f"S2T 推理消耗时间: {end_time - start_time:.2f} 秒")
124
+ return res_text
125
+
126
+
127
+ def do_s2t4chat(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
128
+ model.eval()
129
+ feat, feat_lens = get_feat_from_wav_path(input_wav_path)
130
+ print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
131
+ if is_npu: torch_npu.npu.synchronize()
132
+ start_time = time.time()
133
+ res_text = model.generate4chat(wavs=feat, wavs_len=feat_lens, cache_implementation="static")[0]
134
+ if is_npu: torch_npu.npu.synchronize()
135
+ end_time = time.time()
136
+ print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
137
+ return res_text
138
+
139
+ def do_s2t4chat_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
140
+ model.eval()
141
+ feat, feat_lens = get_feat_from_wav_path(input_wav_path)
142
+ print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
143
+ if is_npu: torch_npu.npu.synchronize()
144
+ start_time = time.time()
145
+ res_text = model.generate4chat_think(wavs=feat, wavs_len=feat_lens, cache_implementation="static")[0]
146
+ if is_npu: torch_npu.npu.synchronize()
147
+ end_time = time.time()
148
+ print(f"S2T4Chat 推理消耗时间: {end_time - start_time:.2f} 秒")
149
+ return res_text
150
+
151
+
152
+ def do_t2s(model, input_prompt, text_for_tts, profile=False): # 增加 model 参数
153
+ model.eval()
154
+ if is_npu: torch_npu.npu.synchronize()
155
+ start_time = time.time()
156
+ res_tensor = model.generate_tts(device=device, text=text_for_tts, )[0]
157
+ res_token_list = res_tensor.tolist()
158
+ res_text = res_token_list[:-1]
159
+ if is_npu: torch_npu.npu.synchronize()
160
+ end_time = time.time()
161
+ print(f"T2S 推理消耗时间: {end_time - start_time:.2f} 秒")
162
+ return res_text
163
+
164
+
165
+ def do_t2t(model, question_txt, profile=False): # 增加 model 参数
166
+ model.eval()
167
+ if is_npu: torch_npu.npu.synchronize()
168
+ start_time = time.time()
169
+ print(f'开始t2t推理, question_txt: {question_txt}')
170
+ res_text = model.generate_text2text(device=device, text=question_txt)[0]
171
+ if is_npu: torch_npu.npu.synchronize()
172
+ end_time = time.time()
173
+ print(f"T2T 推理消耗时间: {end_time - start_time:.2f} 秒")
174
+ return res_text
175
+
176
+
177
+ def do_s2s(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
178
+ model.eval()
179
+ feat, feat_lens = get_feat_from_wav_path(input_wav_path)
180
+ print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
181
+ if is_npu: torch_npu.npu.synchronize()
182
+ start_time = time.time()
183
+ output_text, text_res, speech_res = model.generate_s2s_no_stream_with_repetition_penalty(wavs=feat, wavs_len=feat_lens,)
184
+ if is_npu: torch_npu.npu.synchronize()
185
+ end_time = time.time()
186
+ print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
187
+ return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
188
+
189
+ def do_s2s_think(model, input_wav_path, input_prompt, profile=False): # 增加 model 参数
190
+ model.eval()
191
+ feat, feat_lens = get_feat_from_wav_path(input_wav_path)
192
+ print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
193
+ if is_npu: torch_npu.npu.synchronize()
194
+ start_time = time.time()
195
+ output_text, text_res, speech_res = model.generate_s2s_no_stream_think_with_repetition_penalty(wavs=feat, wavs_len=feat_lens,)
196
+ if is_npu: torch_npu.npu.synchronize()
197
+ end_time = time.time()
198
+ print(f"S2S 推理消耗时间: {end_time - start_time:.2f} 秒")
199
+ return f'{output_text[0]}|{str(speech_res[0].tolist()[1:])}'
200
+
201
+ @spaces.GPU
202
+ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt): # 增加 model 和 tokenizer 参数
203
+ print(f"wav_path: {input_wav_path}, prompt:{input_prompt}")
204
+ if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
205
+ print("音频信息未输入,且不是T2S或T2T任务")
206
+ return "错误:需要音频输入"
207
+
208
+ if input_prompt.endswith("_TTS"):
209
+ text_for_tts = input_prompt.replace("_TTS", "")
210
+ prompt = "恳请将如下文本转换为其对应的语音token,力求生成最为流畅、自然的语音。"
211
+ res_text = do_t2s(model, prompt, text_for_tts)
212
+ elif input_prompt.endswith("_self_prompt"):
213
+ prompt = input_prompt.replace("_self_prompt", "")
214
+ res_text = do_s2t(model, input_wav_path, prompt)
215
+ elif input_prompt.endswith("_T2T"):
216
+ question_txt = input_prompt.replace("_T2T", "")
217
+ res_text = do_t2t(model, question_txt)
218
+ elif input_prompt in ["识别语音内容,并以文字方式作出回答。",
219
+ "请推断对这段语音回答时的情感,标注情感类型,撰写流畅自然的聊天回复,并生成情感语音token。",
220
+ "s2s_no_think"]:
221
+ res_text = do_s2s(model, input_wav_path, input_prompt)
222
+ elif input_prompt == "THINK":
223
+ res_text = do_s2s_think(model, input_wav_path, input_prompt)
224
+ elif input_prompt == "s2t_no_think":
225
+ res_text = do_s2t4chat(model, input_wav_path, input_prompt)
226
+ elif input_prompt == "s2t_think":
227
+ res_text = do_s2t4chat_think(model, input_wav_path, input_prompt)
228
+ else:
229
+ res_text = do_s2t(model, input_wav_path, input_prompt)
230
+ res_text = res_text.replace("<youth>", "<adult>").replace("<middle_age>", "<adult>").replace("<middle>",
231
+ "<adult>")
232
+
233
+ print("识别结果为:", res_text)
234
+ return res_text
235
+
236
+
237
+ def do_decode(model, tokenizer, input_wav_path, input_prompt): # 增加 model 和 tokenizer 参数
238
+ print(f'使用模型进行推理: input_wav_path={input_wav_path}, input_prompt={input_prompt}')
239
+ output_res = true_decode_fuc(model, tokenizer, input_wav_path, input_prompt)
240
+ return output_res
241
+
242
+
243
+ def save_to_jsonl(if_correct, wav, prompt, res):
244
+ data = {
245
+ "if_correct": if_correct,
246
+ "wav": wav,
247
+ "task": prompt,
248
+ "res": res
249
+ }
250
+ with open("results.jsonl", "a", encoding="utf-8") as f:
251
+ f.write(json.dumps(data, ensure_ascii=False) + "\n")
252
+
253
+
254
+ def download_audio(input_wav_path):
255
+ return input_wav_path if input_wav_path else None
256
+
257
+
258
+ def get_wav_from_token_list(input_list, prompt_speech):
259
+ time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
260
+ wav_path = f"./tmp/{time_str}.wav"
261
+ return token_list2wav(input_list, prompt_speech, wav_path, cosyvoice)
262
+
263
+
264
+ # --- Gradio 界面 ---
265
+ with gr.Blocks() as demo:
266
+ gr.Markdown(
267
+ f"""
268
+ <div style="display: flex; align-items: center; justify-content: center; text-align: center;">
269
+ <h1 style="font-family: 'Arial', sans-serif; color: #014377; font-size: 32px; margin-bottom: 0; display: inline-block; vertical-align: middle;">
270
+ OSUM Speech Understanding Model Test
271
+ </h1>
272
+ </div>
273
+ """
274
+ )
275
+
276
+ # ### --- 关键修改:添加模型选择器 --- ###
277
+ with gr.Row():
278
+ model_selector = gr.Radio(
279
+ choices=list(loaded_models.keys()), # 从加载的模型字典中获取选项
280
+ value=NAME_A, # 默认值
281
+ label="选择推理模型",
282
+ interactive=True
283
+ )
284
+
285
+ with gr.Row():
286
+ with gr.Column(scale=1, min_width=300):
287
+ audio_input = gr.Audio(label="录音", sources=["microphone", "upload"], type="filepath", visible=True)
288
+ with gr.Column(scale=1, min_width=300):
289
+ output_text = gr.Textbox(label="输出结果", lines=6, placeholder="生成的结果将显示在这里...",
290
+ interactive=False)
291
+
292
+ with gr.Row():
293
+ task_dropdown = gr.Dropdown(label="任务",
294
+ choices=list(TASK_PROMPT_MAPPING.keys()) + ["自主输入文本", "TTS任务", "T2T任务"],
295
+ value="empathetic_s2s_dialogue with think")
296
+ prompt_speech_dropdown = gr.Dropdown(label="参考音频(prompt_speech)",
297
+ choices=[(item["name"], item["value"]) for item in prompt_audio_choices],
298
+ value=prompt_audio_choices[0]["value"], visible=True)
299
+ custom_prompt_input = gr.Textbox(label="自定义任务提示", placeholder="请输入自定义任务提示...", visible=False)
300
+ tts_input = gr.Textbox(label="TTS输入文本", placeholder="请输入TTS任务的文本...", visible=False)
301
+ t2t_input = gr.Textbox(label="T2T输入文本", placeholder="请输入T2T任务的文本...", visible=False)
302
+
303
+ audio_player = gr.Audio(label="播放音频", type="filepath", interactive=False)
304
+
305
+ with gr.Row():
306
+ download_button = gr.DownloadButton("下载音频", variant="secondary",
307
+ elem_classes=["button-height", "download-button"])
308
+ submit_button = gr.Button("开始处理", variant="primary", elem_classes=["button-height", "submit-button"])
309
+
310
+ with gr.Row(visible=False) as confirmation_row:
311
+ # ... (确认组件不变)
312
+ gr.Markdown("请判断结果是否正确:")
313
+ confirmation_buttons = gr.Radio(choices=["正确", "错误"], label="", interactive=True, container=False,
314
+ elem_classes="confirmation-buttons")
315
+ save_button = gr.Button("提交反馈", variant="secondary")
316
+
317
+ # ... (底部内容不变)
318
+ with gr.Row():
319
+ with gr.Column(scale=1, min_width=800):
320
+ gr.Markdown(f"""...""") # 省略底部HTML
321
+
322
+
323
+ def show_confirmation(output_res, input_wav_path, input_prompt):
324
+ return gr.update(visible=True), output_res, input_wav_path, input_prompt
325
+
326
+
327
+ def save_result(if_correct, wav, prompt, res):
328
+ save_to_jsonl(if_correct, wav, prompt, res)
329
+ return gr.update(visible=False)
330
+
331
+
332
+ # handle_submit 函数现在接收 `selected_model_name` 参数
333
+ def handle_submit(selected_model_name, input_wav_path, task_choice, custom_prompt, tts_text, t2t_text,
334
+ prompt_speech):
335
+ # 1. 根据选择的模型名称,从字典中获取对应的模型和分词器
336
+ print(f"用户选择了: {selected_model_name}")
337
+ model_info = loaded_models[selected_model_name]
338
+ model_to_use = model_info["model"]
339
+ tokenizer_to_use = model_info["tokenizer"]
340
+
341
+ # 2. 准备 prompt
342
+ prompt_speech_data = prompt_audio_cache.get(prompt_speech)
343
+ if task_choice == "自主输入文本":
344
+ input_prompt = custom_prompt + "_self_prompt"
345
+ elif task_choice == "TTS任务":
346
+ input_prompt = tts_text + "_TTS"
347
+ elif task_choice == "T2T任务":
348
+ input_prompt = t2t_text + "_T2T"
349
+ else:
350
+ input_prompt = TASK_PROMPT_MAPPING.get(task_choice, "未知任务类型")
351
+
352
+ # 3. 调用重构后的推理函数,传入选择的模型
353
+ output_res = do_decode(model_to_use, tokenizer_to_use, input_wav_path, input_prompt)
354
+
355
+ # 4. 处理输出 (逻辑不变)
356
+ wav_path_output = input_wav_path
357
+ if task_choice == "TTS任务" or "empathetic_s2s_dialogue" in task_choice:
358
+ if isinstance(output_res, list): # TTS case
359
+ wav_path_output = get_wav_from_token_list(output_res, prompt_speech_data)
360
+ output_res = "生成的token: " + str(output_res)
361
+ elif isinstance(output_res, str) and "|" in output_res: # S2S case
362
+ try:
363
+ text_res, token_list_str = output_res.split("|")
364
+ token_list = json.loads(token_list_str)
365
+ wav_path_output = get_wav_from_token_list(token_list, prompt_speech_data)
366
+ output_res = text_res
367
+ except (ValueError, json.JSONDecodeError) as e:
368
+ print(f"处理S2S输出时出错: {e}")
369
+ output_res = f"错误:无法解析模型输出 - {output_res}"
370
+
371
+ return output_res, wav_path_output
372
+
373
+
374
+ # --- 绑定事件 (下拉框逻辑不变) ---
375
+ task_dropdown.change(fn=lambda choice: gr.update(visible=choice == "自主输入文本"), inputs=task_dropdown,
376
+ outputs=custom_prompt_input)
377
+ task_dropdown.change(fn=lambda choice: gr.update(visible=choice == "TTS任务"), inputs=task_dropdown,
378
+ outputs=tts_input)
379
+ task_dropdown.change(fn=lambda choice: gr.update(visible=choice == "T2T任务"), inputs=task_dropdown,
380
+ outputs=t2t_input)
381
+
382
+ submit_button.click(
383
+ fn=handle_submit,
384
+ # 在 inputs 列表中添加模型选择器 `model_selector`
385
+ inputs=[model_selector, audio_input, task_dropdown, custom_prompt_input, tts_input, t2t_input,
386
+ prompt_speech_dropdown],
387
+ outputs=[output_text, audio_player]
388
+ ).then(
389
+ fn=show_confirmation,
390
+ inputs=[output_text, audio_input, task_dropdown],
391
+ outputs=[confirmation_row, output_text, audio_input, task_dropdown]
392
+ )
393
+
394
+ download_button.click(fn=download_audio, inputs=[audio_input], outputs=[download_button])
395
+ save_button.click(fn=save_result, inputs=[confirmation_buttons, audio_input, task_dropdown, output_text],
396
+ outputs=confirmation_row)
397
+
398
+ # --- 关键修改:为两个模型分别进行预热 ---
399
+ print("开始预热模型...")
400
+ warmup_wav_path = "./tts/assert/hq_1.wav"
401
+ warmup_prompt = "将这段音频的语音内容详细记录为文字稿。"
402
+
403
+ for model_name, model_info in loaded_models.items():
404
+ print(f"正在预热 {model_name}...")
405
+ try:
406
+ # 使用重构后的 do_s2t 函数进行预热,传入对应的模型
407
+ res_text = do_s2t(model_info["model"], warmup_wav_path, warmup_prompt, profile=False)
408
+ print(f'{model_name} 预热完成。ASR推理结果: {res_text}')
409
+ except Exception as e:
410
+ print(f"预热 {model_name} 时发生错误: {e}")
411
+
412
+ # 启动Gradio界面
413
+ print("\nGradio 界面启动中...")
414
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
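Note: the Gradio app above is a thin wrapper around the inference helpers. A minimal sketch of driving the same S2T path directly from Python, assuming the checkpoint and config are available locally and the patches imported at the top of app.py have been applied (file names below are placeholders):

    from common_utils.utils4infer import get_feat_from_wav_path, load_model_and_tokenizer

    # Load one OSUM-EChat checkpoint with the shipped config (placeholder paths).
    model, tokenizer = load_model_and_tokenizer("language_think_final.pt", "./conf/ct_config.yaml")
    model.eval()

    # Extract Whisper-style log-mel features and run the plain ASR prompt,
    # mirroring what do_s2t() does inside the app.
    feat, feat_lens = get_feat_from_wav_path("test.wav")
    text = model.generate(wavs=feat, wavs_len=feat_lens,
                          prompt="转录这段音频中的语音内容为文字。",
                          cache_implementation="static")[0]
    print(text)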
common_utils/__init__.py ADDED
File without changes
common_utils/convert_ckpt_dir_to_pt.py ADDED
@@ -0,0 +1,27 @@
1
+ from gxl_ai_utils.utils import utils_file
2
+ import torch
3
+ try:
4
+ import torch_npu
5
+ except:
6
+ pass
7
+ import os
8
+
9
+
10
+
11
+ def convert_ckpt_to_pt(pt_dir_path):
12
+ exp_dir = os.path.dirname(pt_dir_path)
13
+ pt_name = os.path.basename(pt_dir_path)
14
+ weight_dict = torch.load(f"{exp_dir}/{pt_name}/mp_rank_00_model_states.pt", map_location=torch.device('cpu'))[
15
+ 'module']
16
+ print(weight_dict.keys())
17
+ torch.save(weight_dict, f"{exp_dir}/{pt_name}.pt")
18
+
19
+ if __name__ == '__main__':
20
+ pt_dir_path, = utils_file.do_get_commandline_param(1, ["pt_dir_path"])
21
+ exp_dir = os.path.dirname(pt_dir_path)
22
+ pt_name = os.path.basename(pt_dir_path)
23
+ weight_dict = torch.load(f"{exp_dir}/{pt_name}/mp_rank_00_model_states.pt", map_location=torch.device('cpu'))[
24
+ 'module']
25
+ print(weight_dict.keys())
26
+ torch.save(weight_dict, f"{exp_dir}/{pt_name}.pt")
27
+ # weigth_dict = torch.load("/mnt/sfs/asr/code/wenet_undersdand_and_speech_xlgeng/examples/wenetspeech/whisper/exp/epoch24_cosyvoice1_new-set_token_1w_plus-multi_task_new/step_24999.pt")
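For reference, a minimal usage sketch of the helper above, assuming a DeepSpeed-style checkpoint directory that contains mp_rank_00_model_states.pt (the directory name is a placeholder):

    from common_utils.convert_ckpt_dir_to_pt import convert_ckpt_to_pt

    # exp/step_24999/mp_rank_00_model_states.pt  ->  exp/step_24999.pt
    convert_ckpt_to_pt("exp/step_24999")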
common_utils/load_combine_type_yaml.py ADDED
@@ -0,0 +1,59 @@
1
+ import os
2
+ import random
3
+ import time
4
+
5
+ from gxl_ai_utils.utils import utils_file
6
+
7
+ data_config_path, tmp_file_path = utils_file.do_get_commandline_param(2)
8
+ # random.seed(10086)# 老的
9
+ # 把当前时间戳作为随机种子
10
+ random.seed(int(time.time()))
11
+ # random.seed(7891)# 尝试一下新的顺序 #7890
12
+ data_info_dict = utils_file.load_dict_from_yaml(data_config_path)
13
+ if data_info_dict is None:
14
+ data_info_dict = {}
15
+ total_list = []
16
+ for data_info in data_info_dict.values():
17
+ if "path" not in data_info:
18
+ print(f"path not in data_info: {data_info}")
19
+ continue
20
+ if "weight" not in data_info:
21
+ data_weight = 1
22
+ else:
23
+ data_weight = int(float(data_info['weight']))
24
+ data_path_i = data_info['path']
25
+ utils_file.logging_info(f'path:{data_path_i} ')
26
+
27
+ if data_weight == 0:
28
+ data_weight = float(data_info['weight'])
29
+ if data_weight >= 0:
30
+ utils_file.logging_info(f'data {data_path_i} weight is {data_weight}, will be used as a list')
31
+ final_data_list_i_tmp = utils_file.load_list_file_clean(data_path_i)
32
+ true_num = int(len(final_data_list_i_tmp)*data_weight)
33
+ final_data_list_i = utils_file.do_get_random_sublist(final_data_list_i_tmp, true_num)
34
+ else:
35
+ final_data_list_i = utils_file.load_list_file_clean(data_path_i) * data_weight
36
+ # 判断数据类型
37
+ if "combines_list.txt" in data_path_i:
38
+ print(f'是 combine类型的数据')
39
+ tar_root_path = data_path_i.replace('combines_list.txt', 'combines_tar_root.txt')
40
+ if not os.path.exists(tar_root_path):
41
+ utils_file.logging_info(f'combine_list.txt:{data_path_i} 对应的 combines_tar_root.txt:{tar_root_path} 不存在')
42
+ continue
43
+ tar_root = utils_file.load_first_row_clean(tar_root_path)
44
+ if tar_root.endswith('/'):
45
+ tar_root = tar_root[:-1]
46
+ utils_file.logging_info(f' tar_root:{tar_root}')
47
+ new_final_data_list_i = []
48
+ for data_path_j in final_data_list_i:
49
+ # "combine_path|shard_path"
50
+ tmp_lines = f'{data_path_j}|{tar_root}/{utils_file.do_get_file_pure_name_from_path(data_path_j)}.tar'
51
+ new_final_data_list_i.append(tmp_lines)
52
+ else:
53
+ print(f'不是 combine类型的数据,是传统shard类型的数据')
54
+ new_final_data_list_i = [f'-|{data_path_j}' for data_path_j in final_data_list_i]
55
+
56
+ utils_file.logging_info(f'true load num is : {len(new_final_data_list_i)}')
57
+ total_list.extend(new_final_data_list_i)
58
+ random.shuffle(total_list)
59
+ utils_file.write_list_to_file(total_list, tmp_file_path)
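The data config consumed above is a flat YAML mapping of dataset entries, each holding a path to a combines_list.txt or shards_list.txt manifest and an optional weight (see conf/data_s2s.yaml below). A minimal sketch of writing such a config from Python, with placeholder names and paths:

    import yaml

    data_config = {
        "my_dataset": {                                    # placeholder entry name
            "path": "/data/my_dataset/combines_list.txt",  # placeholder manifest path
            "weight": 1,                                   # optional; scales how much of the list is used
        },
    }
    with open("my_data.yaml", "w", encoding="utf-8") as f:
        yaml.safe_dump(data_config, f, allow_unicode=True)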
common_utils/utils4infer.py ADDED
@@ -0,0 +1,163 @@
1
+ import copy
2
+ import os
3
+ import random
4
+ import re
5
+
6
+ import yaml
7
+ from cn2an import an2cn
8
+ from gxl_ai_utils.utils import utils_file
9
+ from wenet.utils.init_tokenizer import init_tokenizer
10
+ from gxl_ai_utils.config.gxl_config import GxlNode
11
+ from wenet.utils.init_model import init_model
12
+ import logging
13
+ import librosa
14
+ import torch
15
+ import torchaudio
16
+
17
+
18
+
19
+ def load_model_and_tokenizer(checkpoint_path, config_path, device:torch.device=torch.device('cuda')):
20
+ """
21
+ 封装了加载模型和分词器的逻辑
22
+ Args:
23
+ checkpoint_path (str): 模型权重文件路径
24
+ config_path (str): 模型配置文件路径
25
+ device (torch.device): 加载模型的设备
26
+ Returns:
27
+ model: 加载好的模型
28
+ tokenizer: 加载好的分词器
29
+ """
30
+ print(f"正在从以下路径加载模型: {checkpoint_path}")
31
+ args = GxlNode({"checkpoint": checkpoint_path})
32
+ configs = utils_file.load_dict_from_yaml(config_path)
33
+ model, configs = init_model(args, configs)
34
+ model = model.to(device).to(torch.bfloat16)
35
+ model.eval() # 设置为评估模式
36
+ tokenizer = init_tokenizer(configs)
37
+ print(f"模型 {checkpoint_path} 加载完成并移动到 {device}")
38
+ return model, tokenizer
39
+
40
+ def token_list2wav(token_list, prompt_speech, wav_path, cosyvoice):
41
+ token_list = [int(i) for i in token_list]
42
+ j = cosyvoice.inference_zero_shot_gz_22k(
43
+ '收到好友从远方寄来的生日礼物。',
44
+ '希望你以后能够做的比我还好呦。', prompt_speech, stream=False, token_list=token_list)
45
+ utils_file.makedir_for_file(wav_path)
46
+ torchaudio.save(wav_path, j['tts_speech'],cosyvoice.sample_rate)
47
+ print(f'语音合成完成,保存到 {wav_path}')
48
+ return wav_path
49
+
50
+ def do_resample(input_wav_path):
51
+ """..."""
52
+ waveform, sample_rate = torchaudio.load(input_wav_path)
53
+ if waveform.shape[0] > 1:
54
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
55
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
56
+ waveform = resampler(waveform)
57
+ return waveform, 16000
58
+
59
+
60
+ def get_feat_from_wav_path(input_wav_path, device:torch.device=torch.device('cuda')):
61
+ """..."""
62
+ waveform, sample_rate = do_resample(input_wav_path)
63
+ waveform = waveform.squeeze(0)
64
+ window = torch.hann_window(400)
65
+ stft = torch.stft(waveform, 400, 160, window=window, return_complex=True)
66
+ magnitudes = stft[..., :-1].abs() ** 2
67
+ filters = torch.from_numpy(librosa.filters.mel(sr=sample_rate, n_fft=400, n_mels=80))
68
+ mel_spec = filters @ magnitudes
69
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
70
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
71
+ log_spec = (log_spec + 4.0) / 4.0
72
+ feat = log_spec.transpose(0, 1)
73
+ feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).to(device)
74
+ feat = feat.unsqueeze(0).to(device)
75
+ feat = feat.to(torch.bfloat16)
76
+ return feat, feat_lens
77
+
78
+
79
+
80
+ def do_format_shard_manifest4one(input_shards_path, tmp_file_path=None):
81
+ if tmp_file_path is None:
82
+ tmp_file_path = f'~/.cache/.temp/{random.randint(10000, 99999)}.txt'
83
+ data_path_i = input_shards_path
84
+ utils_file.logging_info(f'path:{data_path_i} ')
85
+ final_data_list_i = utils_file.load_list_file_clean(data_path_i)
86
+ # 判断数据类型
87
+ if "combines_list.txt" in data_path_i:
88
+ print(f'是 combine类型的数据')
89
+ tar_root_path = data_path_i.replace('combines_list.txt', 'combines_tar_root.txt')
90
+ if not os.path.exists(tar_root_path):
91
+ utils_file.logging_error(
92
+ f'combine_list.txt:{data_path_i} 对应的 combines_tar_root.txt:{tar_root_path} 不存在')
93
+ return
94
+ tar_root = utils_file.load_first_row_clean(tar_root_path)
95
+ if tar_root.endswith('/'):
96
+ tar_root = tar_root[:-1]
97
+ utils_file.logging_info(f' tar_root:{tar_root}')
98
+ new_final_data_list_i = []
99
+ for data_path_j in final_data_list_i:
100
+ # "combine_path|shard_path"
101
+ tmp_lines = f'{data_path_j}|{tar_root}/{utils_file.do_get_file_pure_name_from_path(data_path_j)}.tar'
102
+ new_final_data_list_i.append(tmp_lines)
103
+ else:
104
+ print(f'不是 combine类型的数据,是传统shard类型的数据')
105
+ new_final_data_list_i = [f'-|{data_path_j}' for data_path_j in final_data_list_i]
106
+
107
+ utils_file.logging_info(f'true load num is : {len(new_final_data_list_i)}')
108
+ utils_file.write_list_to_file(new_final_data_list_i, tmp_file_path)
109
+ return tmp_file_path
110
+
111
+
112
+
113
+ def convert_numbers_in_string(s):
114
+ # 正则表达式匹配数字(支持整数、小数、负数)
115
+ pattern = r'-?\d+\.?\d*'
116
+
117
+ def replace_func(match):
118
+ num_str = match.group()
119
+ try:
120
+ # 尝试转换数字
121
+ return an2cn(num_str)
122
+ except ValueError:
123
+ # 若转换失败(如非有效数字),返回原内容
124
+ return num_str
125
+ # 替换字符串中所有匹配的数字
126
+ return re.sub(pattern, replace_func, s)
127
+
128
+ def get_test_conf(config_path):
129
+ with open(config_path, 'r', encoding='utf-8') as fin:
130
+ print(f"加载配置文件 {config_path}")
131
+ configs = yaml.load(fin, Loader=yaml.FullLoader)
132
+ configs['dataset_conf']['filter_conf']['filter_no_extra_info'] = False
133
+ test_conf = copy.deepcopy(configs['dataset_conf'])
134
+
135
+ # test_conf['filter_conf']['max_length'] = 3000 # whisper最长处理30s 102400
136
+ test_conf['filter_conf']['min_length'] = 10
137
+ test_conf['filter_conf']['token_max_length'] = 102400
138
+ test_conf['filter_conf']['token_min_length'] = 1
139
+ test_conf['filter_conf']['max_output_input_ratio'] = 102400
140
+ test_conf['filter_conf']['min_output_input_ratio'] = 0
141
+ test_conf['filter_conf']['filter_no_extra_info'] = False
142
+ test_conf['filter_conf']['max_seq_len'] = 102400
143
+ test_conf['speed_perturb'] = False
144
+ test_conf['spec_aug'] = False
145
+ test_conf['spec_sub'] = False
146
+ test_conf['spec_trim'] = False
147
+ test_conf['shuffle'] = False
148
+ test_conf['sort'] = False
149
+ test_conf['cycle'] = 1
150
+ test_conf['list_shuffle'] = True
151
+ if 'fbank_conf' in test_conf:
152
+ test_conf['fbank_conf']['dither'] = 0.0
153
+ elif 'mfcc_conf' in test_conf:
154
+ test_conf['mfcc_conf']['dither'] = 0.0
155
+ test_conf['batch_conf']['batch_type'] = "static"
156
+ test_conf['batch_conf']['batch_size'] = 1
157
+ test_conf['split_num'] = 1
158
+ test_conf['multi_num'] = 1
159
+ test_conf['other_filter_conf'] = {}
160
+ test_conf['data_recover'] = False
161
+ return configs, test_conf
162
+
163
+
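As a quick shape check on the feature extraction above, a minimal sketch (placeholder wav path; it assumes a CUDA device, since get_feat_from_wav_path defaults to cuda):

    from common_utils.utils4infer import get_feat_from_wav_path

    # With n_fft=400 and hop_length=160 at 16 kHz, each frame covers 10 ms,
    # so a 2-second clip yields roughly 200 frames of 80-dim log-mel features.
    feat, feat_lens = get_feat_from_wav_path("test.wav")
    print(feat.shape, feat.dtype)   # torch.Size([1, num_frames, 80]) torch.bfloat16
    print(feat_lens)                # e.g. tensor([num_frames], device='cuda:0')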
conf/ct_config.yaml ADDED
@@ -0,0 +1,153 @@
1
+ model: osum_echat
2
+
3
+ # llm_path
4
+ llm_path: &llm_path "Qwen/Qwen2.5-3B-Instruct"
5
+
6
+ #
7
+ # model config
8
+ downsample_rate: 4 # 1 2 4 8
9
+ adapter_type: osum_echat
10
+ if_instruct: true
11
+ input_dim: 80
12
+
13
+ # tokenizer ,gxl
14
+ tokenizer: huggingface
15
+ tokenizer_conf:
16
+ llm_path: *llm_path
17
+
18
+ # lora config
19
+ use_lora: false
20
+ lora_alpha: 32
21
+ lora_rank: 64 # 3B -> 85M
22
+ lora_dropout: 0.1
23
+
24
+ # speech generate config
25
+ speech_token_num: &token_num 4097 #4097
26
+
27
+
28
+ # Configuration of parameters for training
29
+ fire_module: link_and_encoder_and_lora # link encoder llm link_and_encoder link_and_encoder_and_lora, llm需要配合use_lora为true
30
+
31
+ # other config
32
+ grad_clip: 5
33
+ accum_grad: 8
34
+ log_interval: 10
35
+ save_interval: 1250 #1250 #2500
36
+ max_epoch: 1
37
+ init_step: true
38
+
39
+ # training config
40
+ optim: adamw
41
+ optim_conf:
42
+ betas:
43
+ - 0.9
44
+ - 0.99
45
+ eps: 1.0e-06
46
+ lr: 1.0e-06
47
+ weight_decay: 0.01
48
+ scheduler: warmuplr
49
+ scheduler_conf:
50
+ warmup_steps: 2000
51
+
52
+
53
+ dataset: asr
54
+ dataset_conf:
55
+ speech_token_num: *token_num
56
+ batch_conf:
57
+ batch_size: 26
58
+ batch_type: dynamic
59
+ max_frames_in_batch: 28000000 #3000 #9000 #3000 #3300 # 3900
60
+ max_seq_in_batch: 3700 #1500 #4000 #1100 #1600 # 1900
61
+ feats_type: log_mel_spectrogram
62
+ filter_conf:
63
+ max_length: 20000
64
+ min_length: 20
65
+ token_max_length: 1200
66
+ token_min_length: 1
67
+ filter_no_extra_info: true # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉
68
+ max_seq_len: 2000 #、1100 #1000
69
+ other_filter_conf:
70
+ only_s2s: false # 只针对与s2s dataloader的过滤
71
+ only_s2t: false # 只针对与s2t dataloader的过滤
72
+ only_t2t: false # 只针对与t2t dataloader的过滤
73
+ only_t2s: false # 只针对与t2s dataloader的过滤
74
+ language_conf:
75
+ limited_langs:
76
+ - zh
77
+ log_mel_spectrogram_conf:
78
+ hop_length: 160
79
+ n_fft: 400
80
+ num_mel_bins: 80
81
+ padding: 0
82
+ resample_conf:
83
+ resample_rate: 16000
84
+ shuffle: true
85
+ shuffle_conf:
86
+ shuffle_size: 1500
87
+ sort: true
88
+ sort_conf:
89
+ sort_size: 500
90
+ spec_aug: true
91
+ spec_aug_conf:
92
+ max_f: 10
93
+ max_t: 50
94
+ num_f_mask: 2
95
+ num_t_mask: 2
96
+ spec_sub: true
97
+ spec_sub_conf:
98
+ max_t: 30
99
+ num_t_sub: 3
100
+ spec_trim: false
101
+ speed_perturb: false
102
+ eod_id: 151645
103
+ split_num: 1
104
+ multi_num: 2
105
+ prompt_conf_path: conf/prompt_config.yaml
106
+ data_recover: false
107
+ data_recover_conf:
108
+ start_idx: 0 # 删除前面start_idx个item(tar包)
109
+ other_tokenze_conf: # 一些对数据额外操作的可控按钮,这些操作一般来说再test时都得为false
110
+ only_info:
111
+ only_s2s: false # 只针对与s2s dataloader的过滤
112
+ only_s2t: false # 只针对与s2t dataloader的过滤
113
+ only_t2t: false # 只针对与t2t dataloader的过滤
114
+ only_t2s: false # 只针对与t2s dataloader的过滤
115
+ use_50_per_change_if_only_X: true # 50%的句子随机替换为其only X
116
+ use_s2s_streaming_random:
117
+ enable: false
118
+ rate: 0.5 # 1.0 表示100%的句子随机替换为其only X
119
+ natural_language_convert:
120
+ enable: false
121
+ rate: 0.00 # 1.0 表示100%的转换成自然语言模式
122
+ use_s2s_convert_s2t:
123
+ enable: false # 单独为s2t dataloader 开启s2s convert
124
+ rate: 1.0 # 1.0 表示100%的句子随机替换为其only X
125
+ use_streaming_tts:
126
+ enable: false
127
+ rate: 0.5 # 1.0 表示100%的句子随机替换为其only X
128
+ use_think_mode:
129
+ enable: false # 开启think 模式, 即随机替换为think模式的句子
130
+ rate: 0.8
131
+ other_filter_conf:
132
+ fiter_txt_is_None: true # 过滤掉text is "<NONE>"的语音数据,适配由于gender数据部分含有<NONE>标签而设计。但仅train起作用
133
+
134
+ # model config for encoder
135
+ encoder: transformer
136
+ encoder_conf:
137
+ activation_type: gelu
138
+ attention_dropout_rate: 0.0
139
+ attention_heads: 16
140
+ dropout_rate: 0.1
141
+ gradient_checkpointing: true
142
+ input_layer: conv1d2
143
+ key_bias: false
144
+ linear_units: 4096
145
+ normalize_before: true
146
+ num_blocks: 24
147
+ output_size: 1024
148
+ pos_enc_layer_type: abs_pos_whisper
149
+ positional_dropout_rate: 0.1
150
+ static_chunk_size: -1
151
+ use_dynamic_chunk: false
152
+ use_dynamic_left_chunk: false
153
+
conf/ct_config_sft.yaml ADDED
@@ -0,0 +1,152 @@
1
+ model: llmasr
2
+
3
+ # llm_path
4
+ llm_path: &llm_path "/home/A02_tmpdata3/ckpt/Qwen2.5-3B-Instruct"
5
+ #
6
+ # model config
7
+ downsample_rate: 4 # 1 2 4 8
8
+ adapter_type: osum_echat
9
+ if_instruct: true
10
+ input_dim: 80
11
+
12
+ # tokenizer ,gxl
13
+ tokenizer: huggingface
14
+ tokenizer_conf:
15
+ llm_path: *llm_path
16
+
17
+ # lora config
18
+ use_lora: false
19
+ lora_alpha: 32
20
+ lora_rank: 64 # 3B -> 85M
21
+ lora_dropout: 0.1
22
+
23
+ # speech generate config
24
+ speech_token_num: &token_num 4097 #4097
25
+
26
+
27
+ # Configuration of parameters for training
28
+ fire_module: link_and_encoder_and_lora # link encoder llm link_and_encoder link_and_encoder_and_lora, llm需要配合use_lora为true
29
+
30
+ # other config
31
+ grad_clip: 5
32
+ accum_grad: 8
33
+ log_interval: 10
34
+ save_interval: 125 #1250 #2500
35
+ max_epoch: 1
36
+ init_step: true
37
+
38
+ # training config
39
+ optim: adamw
40
+ optim_conf:
41
+ betas:
42
+ - 0.9
43
+ - 0.99
44
+ eps: 1.0e-06
45
+ lr: 1.0e-06
46
+ weight_decay: 0.01
47
+ scheduler: warmuplr
48
+ scheduler_conf:
49
+ warmup_steps: 400
50
+
51
+
52
+ dataset: asr
53
+ dataset_conf:
54
+ speech_token_num: *token_num
55
+ batch_conf:
56
+ batch_size: 26
57
+ batch_type: dynamic
58
+ max_frames_in_batch: 28000000 #3000 #9000 #3000 #3300 # 3900
59
+ max_seq_in_batch: 3700 #1500 #4000 #1100 #1600 # 1900
60
+ feats_type: log_mel_spectrogram
61
+ filter_conf:
62
+ max_length: 20000
63
+ min_length: 20
64
+ token_max_length: 1200
65
+ token_min_length: 1
66
+ filter_no_extra_info: true # 如果没有task lang 等信息,直接过滤掉, 适用于通用多任务训练, 推理时应该关掉
67
+ max_seq_len: 2000 #、1100 #1000
68
+ other_filter_conf:
69
+ only_s2s: false # 只针对与s2s dataloader的过滤
70
+ only_s2t: false # 只针对与s2t dataloader的过滤
71
+ only_t2t: false # 只针对与t2t dataloader的过滤
72
+ only_t2s: false # 只针对与t2s dataloader的过滤
73
+ language_conf:
74
+ limited_langs:
75
+ - zh
76
+ log_mel_spectrogram_conf:
77
+ hop_length: 160
78
+ n_fft: 400
79
+ num_mel_bins: 80
80
+ padding: 0
81
+ resample_conf:
82
+ resample_rate: 16000
83
+ shuffle: true
84
+ shuffle_conf:
85
+ shuffle_size: 1500
86
+ sort: true
87
+ sort_conf:
88
+ sort_size: 500
89
+ spec_aug: true
90
+ spec_aug_conf:
91
+ max_f: 10
92
+ max_t: 50
93
+ num_f_mask: 2
94
+ num_t_mask: 2
95
+ spec_sub: true
96
+ spec_sub_conf:
97
+ max_t: 30
98
+ num_t_sub: 3
99
+ spec_trim: false
100
+ speed_perturb: false
101
+ eod_id: 151645
102
+ split_num: 1
103
+ multi_num: 2
104
+ prompt_conf_path: conf/prompt_config.yaml
105
+ data_recover: false
106
+ data_recover_conf:
107
+ start_idx: 0 # 删除前面start_idx个item(tar包)
108
+ other_tokenze_conf: # 一些对数据额外操作的可控按钮,这些操作一般来说再test时都得为false
109
+ only_info:
110
+ only_s2s: false # 只针对与s2s dataloader的过滤
111
+ only_s2t: false # 只针对与s2t dataloader的过滤
112
+ only_t2t: false # 只针对与t2t dataloader的过滤
113
+ only_t2s: false # 只针对与t2s dataloader的过滤
114
+ use_50_per_change_if_only_X: true # 50%的句子随机替换为其only X
115
+ use_s2s_streaming_random:
116
+ enable: false
117
+ rate: 0.5 # 1.0 表示100%的句子随机替换为其only X
118
+ natural_language_convert:
119
+ enable: false
120
+ rate: 0.00 # 1.0 表示100%的转换成自然语言模式
121
+ use_s2s_convert_s2t:
122
+ enable: false # 单独为s2t dataloader 开启s2s convert
123
+ rate: 1.0 # 1.0 表示100%的句子随机替换为其only X
124
+ use_streaming_tts:
125
+ enable: false
126
+ rate: 0.5 # 1.0 表示100%的句子随机替换为其only X
127
+ use_think_mode:
128
+ enable: false # 开启think 模式, 即随机替换为think模式的句子
129
+ rate: 0.8
130
+ other_filter_conf:
131
+ fiter_txt_is_None: true # 过滤掉text is "<NONE>"的语音数据,适配由于gender数据部分含有<NONE>标签而设计。但仅train起作用
132
+
133
+ # model config for encoder
134
+ encoder: transformer
135
+ encoder_conf:
136
+ activation_type: gelu
137
+ attention_dropout_rate: 0.0
138
+ attention_heads: 16
139
+ dropout_rate: 0.1
140
+ gradient_checkpointing: true
141
+ input_layer: conv1d2
142
+ key_bias: false
143
+ linear_units: 4096
144
+ normalize_before: true
145
+ num_blocks: 24
146
+ output_size: 1024
147
+ pos_enc_layer_type: abs_pos_whisper
148
+ positional_dropout_rate: 0.1
149
+ static_chunk_size: -1
150
+ use_dynamic_chunk: false
151
+ use_dynamic_left_chunk: false
152
+
conf/data_s2s.yaml ADDED
@@ -0,0 +1,226 @@
1
+ # ===========================副语言 s2s thinking ===================================
2
+
3
+ # age gender,
4
+ age_gender_common:
5
+ path: /home/A02_tmpdata3/osum_s2s/gender/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
6
+ tar_num: 1511
7
+ weight: 2
8
+
9
+ gender_xianshi:
10
+ path: /home/A02_tmpdata3/osum_s2s/sex_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
11
+ tar_num: 30
12
+ weight: 2
13
+
14
+
15
+ gender_yinshi_3k:
16
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_7_4_osum_by_cywang_added_by_20250708/raw_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
17
+ tar_num: 3
18
+ weight: 2
19
+
20
+ gender_yinshi_5k:
21
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_5000_6_13_data_by_gjli_added_by_20250622/raw_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
22
+ tar_num: 6
23
+ weight: 2
24
+
25
+ age_xianshi:
26
+ path: /home/A02_tmpdata3/osum_s2s/age_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
27
+ tar_num: 25
28
+ weight: 2
29
+
30
+
31
+ # caption
32
+ caption_common_7label:
33
+ path: /home/A02_tmpdata3/osum_s2s/caption/raw_data/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
34
+ tar_num: 162
35
+ weight: 2
36
+
37
+ caption_common_50_label:
38
+ path: /home/A02_tmpdata3/osum_s2s/caption_add_2025_1_6/raw_data/s2s_data_with_gender/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
39
+ tar_num: 395 # 实际是196k
40
+ weight: 2
41
+
42
+ caption_xianshi:
43
+ path: /home/A02_tmpdata3/osum_s2s/caption_s2s_xianshi_20250806/raw_data/s2s_data/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
44
+ tar_num: 6
45
+ weight: 10
46
+
47
+
48
+
49
+ # emotion
50
+ emotion_100K_sensevoice:
51
+ path: /home/A02_tmpdata3/osum_s2s/emotion_yinshi_zxzhao_with_q_emo_by_cywang_added_by_20250701/handle_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
52
+ tar_num: 107
53
+ weight: 10
54
+
55
+ emotion_30K_sensevoice:
56
+ path: /home/A02_tmpdata3/emotion/中英混多音色情感数据库/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
57
+ tar_num: 33
58
+ weight: 10
59
+
60
+ S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616_think:
61
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616/raw_data/s2s_handle/xlgeng_new_data/s2s_thinking/doubao/combines_list.txt
62
+ shard_num: 8
63
+ weight: 10
64
+
65
+ # ======================================s2s 副语言 thinking end=====================================
66
+
67
+
68
+
69
+
70
+ # ===========================副语言 s2s no thinking ===================================
71
+
72
+ # age gender,
73
+ age_gender_common_no_thinking:
74
+ path: /home/A02_tmpdata3/osum_s2s/gender/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
75
+ tar_num: 1511
76
+ weight: 2
77
+
78
+ gender_xianshi_no_thinking:
79
+ path: /home/A02_tmpdata3/osum_s2s/sex_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
80
+ tar_num: 30
81
+ weight: 2
82
+
83
+ gender_yinshi_3k_no_thinking:
84
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_7_4_osum_by_cywang_added_by_20250708/raw_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
85
+ tar_num: 3
86
+ weight: 2
87
+
88
+ gender_yinshi_5k_no_thinking:
89
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_5000_6_13_data_by_gjli_added_by_20250622/raw_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
90
+ tar_num: 6
91
+ weight: 2
92
+
93
+ age_xianshi_no_thinking:
94
+ path: /home/A02_tmpdata3/osum_s2s/age_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
95
+ tar_num: 25
96
+ weight: 2
97
+
98
+
99
+ # caption
100
+ caption_common_7label_no_thinking:
101
+ path: /home/A02_tmpdata3/osum_s2s/caption/raw_data/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
102
+ tar_num: 162
103
+ weight: 2
104
+
105
+ caption_common_50_label_no_thinking:
106
+ path: /home/A02_tmpdata3/osum_s2s/caption_add_2025_1_6/raw_data/s2s_data_with_gender/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
107
+ tar_num: 395 # 实际是196k
108
+ weight: 2
109
+
110
+ caption_xianshi_no_thinking:
111
+ path: /home/A02_tmpdata3/osum_s2s/caption_s2s_xianshi_20250806/raw_data/s2s_data/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
112
+ tar_num: 6
113
+ weight: 2
114
+
115
+
116
+ # emotion
117
+ emotion_100K_sensevoice_no_thinking:
118
+ path: /home/A02_tmpdata3/osum_s2s/emotion_yinshi_zxzhao_with_q_emo_by_cywang_added_by_20250701/handle_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
119
+ tar_num: 107
120
+ weight: 10
121
+
122
+ emotion_30K_sensevoice_no_thinking:
123
+ path: /home/A02_tmpdata3/emotion/中英混多音色情感数据库/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
124
+ tar_num: 33
125
+ weight: 10
126
+
127
+ S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616:
128
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616/raw_data/s2s_handle/xlgeng_new_data/s2s_no_thinking/doubao/combines_list.txt
129
+ shard_num: 8
130
+ weight: 10
131
+
132
+ # -------------------------------------------s2s 副语言 no thinking end-------------------------------------------
133
+
134
+ S2SChat_syndata_merged_by_300W_zhguo_added_by_20250616:
135
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_syndata_merged_by_300W_zhguo_added_by_20250616/combines_data_s2s/combines_list.txt
136
+ tar_num: 3000
137
+ weight: 1
138
+ S2SChat_osum_total_data_lst_check_final_100W_by_zhguo_added_by_20250616:
139
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_total_data_lst_check_final_100W_by_zhguo_added_by_20250616/combines_data_s2s/combines_list.txt
140
+ tar_num: 1000
141
+ weight: 1
142
+
143
+ gaozhiliang_gbma:
144
+ path: /home/A02_tmpdata3/osum_s2s/gaozhiliang_gbma/shards_list.txt
145
+ new_data_list: /home/node44_tmpdata3/netease/gbma/workspace/osum/data/process/0803/all_data_info.jsonl
146
+ new_lab_path: /home/work_nfs23/asr_data/data/osum_chat/s2s/gaozhiliang_gbma/shards_list.txt
147
+ shard_num: 24
148
+ weight: 1
149
+
150
+ # ======================================s2s no thinking end=====================================
151
+
152
+ # emotion explicit;
153
+ S2SChat_0628_E1_shard_by_kxxia_added_by_20250630:
154
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_0628_E1_shard_by_kxxia_added_by_20250630/shards_list.txt
155
+ new_data_list: /home/work_nfs16/cywang/workspace/OSUM/E1/0628_E1_shard.jsonl
156
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/0628_E1_shard/shards_list.txt
157
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_0628_E1_shard_by_kxxia_added_by_20250630/shards_list.txt
158
+ shard_num: 147
159
+ description: "E1 shard, 情感显示数据"
160
+ weight: 1
161
+ S2SChat_eng_e1_by_cywang_added_by_20250711:
162
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_eng_e1_by_cywang_added_by_20250711/shards_list.txt
163
+ new_data_list: /home/work_nfs16/kxxia/work/common/eng_e1.jsonl1752154262.3374825
164
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/eng_e1/shards_list.txt
165
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_eng_e1_by_cywang_added_by_20250711/shards_list.txt
166
+ shard_num: 50
167
+ weight: 2
168
+
169
+ # 下面一共才200多个
170
+ S2SChat_0630_trans_en2zh_by_cywang_added_by_20250704:
171
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_0630_trans_en2zh_by_cywang_added_by_20250704/shards_list.txt
172
+ new_data_list: /home/work_nfs16/cywang/workspace/OSUM/trans_emotion/0630_trans_en2zh.jsonl
173
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/0630_trans_en2zh/shards_list.txt
174
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_0630_trans_en2zh_by_cywang_added_by_20250704/shards_list.txt
175
+ shard_num: 128
176
+ weight: 0.5
177
+ S2SChat_0630_trans_zh2en_by_cywang_added_by_20250704:
178
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_0630_trans_zh2en_by_cywang_added_by_20250704/shards_list.txt
179
+ new_data_list: /home/work_nfs16/cywang/workspace/OSUM/trans_emotion/0630_trans_zh2en.jsonl
180
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/0630_trans_zh2en/shards_list.txt
181
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_0630_trans_zh2en_by_cywang_added_by_20250704/shards_list.txt
182
+ shard_num: 128
183
+ weight: 0.5
184
+ S2SChat_pachong_part1_filter_author_data_by_gjli_added_by_20250622:
185
+ shard_num: 28
186
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_pachong_part1_filter_author_data_by_gjli_added_by_20250622/shards_list.txt
187
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/6-16_shigepachong/pachong_part1_filter_content_data.list
188
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/pachong_part1_filter_author_data/shards_list.txt
189
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_pachong_part1_filter_author_data_by_gjli_added_by_20250622/shards_list.txt
190
+ weight: 1
191
+ S2SChat_pachong_part1_filter_content_data_by_gjli_added_by_20250622:
192
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_pachong_part1_filter_content_data_by_gjli_added_by_20250622/shards_list.txt
193
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/6-16_shigepachong/pachong_part1_filter_author_data.list
194
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/pachong_part1_filter_content_data/shards_list.txt
195
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_pachong_part1_filter_content_data_by_gjli_added_by_20250622/shards_list.txt
196
+ shard_num: 68
197
+ weight: 1
198
+ S2SChat_poem_1_2_6_3_author_data_150num_by_gjli_added_by_20250622:
199
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_poem_1_2_6_3_author_data_150num_by_gjli_added_by_20250622/shards_list.txt
200
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/6.3/poem_1_2_6-3_author_data.list
201
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/poem_1_2_6-3_author_data/shards_list.txt
202
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_poem_1_2_6_3_author_data_150num_by_gjli_added_by_20250622/shards_list.txt
203
+ shard_num: 2
204
+ weight: 1
205
+ S2SChat_poem_1_2_6_3_content_data_150num_by_gjli_added_by_20250622:
206
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_poem_1_2_6_3_content_data_150num_by_gjli_added_by_20250622/shards_list.txt
207
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/6.3/poem_1_2_6-3_content_data.list
208
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/poem_1_2_6-3_content_data/shards_list.txt
209
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_poem_1_2_6_3_content_data_150num_by_gjli_added_by_20250622/shards_list.txt
210
+ shard_num: 9
211
+ weight: 1
212
+ S2SChat_poem_500_author_data_new_by_gjli_added_by_20250622:
213
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_poem_500_author_data_new_by_gjli_added_by_20250622/shards_list.txt
214
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/poem_500_author_data_new.list
215
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/poem_500_author_data_new/shards_list.txt
216
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_poem_500_author_data_new_by_gjli_added_by_20250622/shards_list.txt
217
+ shard_num: 4
218
+ weight: 1
219
+ S2SChat_poem_500_content_data_new_by_gjli_added_by_20250622:
220
+ huawei_path: /mnt/sfs/asr/update_data/S2SChat_poem_500_content_data_new_by_gjli_added_by_20250622/shards_list.txt
221
+ new_data_list: /home/work_nfs16/gjli/workspaces/poem/poem_500_content_data_new.list
222
+ new_lab_path: /home/work_nfs11/cywang/data/shard/S2Chat/poem_500_content_data_new/shards_list.txt
223
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_poem_500_content_data_new_by_gjli_added_by_20250622/shards_list.txt
224
+ shard_num: 4
225
+ weight: 1
226
+
conf/data_s2t.yaml ADDED
@@ -0,0 +1,402 @@
1
+
2
+ # age gender,
3
+ age_gender_common:
4
+ path: /home/A02_tmpdata3/osum_s2s/gender/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
5
+ tar_num: 1511
6
+
7
+ gender_xianshi:
8
+ path: /home/A02_tmpdata3/osum_s2s/sex_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
9
+ tar_num: 30
10
+
11
+ gender_yinshi_3k:
12
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_7_4_osum_by_cywang_added_by_20250708/raw_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
13
+ tar_num: 3
14
+
15
+ gender_yinshi_5k:
16
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_5000_6_13_data_by_gjli_added_by_20250622/raw_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
17
+ tar_num: 6
18
+
19
+ age_xianshi:
20
+ path: /home/A02_tmpdata3/osum_s2s/age_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
21
+ tar_num: 25
22
+
23
+
24
+ # caption
25
+ caption_common_7label:
26
+ path: /home/A02_tmpdata3/osum_s2s/caption/raw_data/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
27
+ tar_num: 162
28
+
29
+ caption_common_50_label:
30
+ path: /home/A02_tmpdata3/osum_s2s/caption_add_2025_1_6/raw_data/s2s_data_with_gender/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
31
+ tar_num: 395 # 实际是196k
32
+
33
+ caption_xianshi:
34
+ path: /home/A02_tmpdata3/osum_s2s/caption_s2s_xianshi_20250806/raw_data/s2s_data/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
35
+ tar_num: 6
36
+ weight: 10
37
+
38
+
39
+ # emotion
40
+ emotion_100K_sensevoice:
41
+ path: /home/A02_tmpdata3/osum_s2s/emotion_yinshi_zxzhao_with_q_emo_by_cywang_added_by_20250701/handle_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
42
+ tar_num: 107
43
+ weight: 10
44
+
45
+ emotion_30K_sensevoice:
46
+ path: /home/A02_tmpdata3/emotion/中英混多音色情感数据库/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
47
+ tar_num: 33
48
+ weight: 10
49
+
50
+ S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616_think:
51
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616/raw_data/s2s_handle/xlgeng_new_data/s2t_thinking/doubao/combines_list.txt
52
+ shard_num: 8
53
+ weight: 10
54
+
55
+ # ======================================s2s 副语言 thinking end=====================================
56
+
57
+
58
+
59
+
60
+ # ===========================副语言 s2s no thinking ===================================
61
+
62
+ # age gender,
63
+ age_gender_common_no_thinking:
64
+ path: /home/A02_tmpdata3/osum_s2s/gender/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
65
+ tar_num: 1511
66
+
67
+ gender_xianshi_no_thinking:
68
+ path: /home/A02_tmpdata3/osum_s2s/sex_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
69
+ tar_num: 30
70
+
71
+ gender_yinshi_3k_no_thinking:
72
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_7_4_osum_by_cywang_added_by_20250708/raw_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
73
+ tar_num: 3
74
+
75
+ gender_yinshi_5k_no_thinking:
76
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_sex_yinshi_5000_6_13_data_by_gjli_added_by_20250622/raw_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
77
+ tar_num: 6
78
+
79
+ age_xianshi_no_thinking:
80
+ path: /home/A02_tmpdata3/osum_s2s/age_xianshi_cosyvoice2_by_cywang_added_by_20250625/raw_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
81
+ tar_num: 25
82
+
83
+
84
+ # caption
85
+ caption_common_7label_no_thinking:
86
+ path: /home/A02_tmpdata3/osum_s2s/caption/raw_data/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
87
+ tar_num: 162
88
+
89
+ caption_common_50_label_no_thinking:
90
+ path: /home/A02_tmpdata3/osum_s2s/caption_add_2025_1_6/raw_data/s2s_data_with_gender/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
91
+ tar_num: 395 # actually 196k
92
+
93
+ caption_xianshi_no_thinking:
94
+ path: /home/A02_tmpdata3/osum_s2s/caption_s2s_xianshi_20250806/raw_data/s2s_data/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
95
+ tar_num: 6
96
+
97
+
98
+ # emotion
99
+ emotion_100K_sensevoice_no_thinking:
100
+ path: /home/A02_tmpdata3/osum_s2s/emotion_yinshi_zxzhao_with_q_emo_by_cywang_added_by_20250701/handle_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
101
+ tar_num: 107
102
+ weight: 10
103
+
104
+ emotion_30K_sensevoice_no_thinking:
105
+ path: /home/A02_tmpdata3/emotion/中英混多音色情感数据库/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
106
+ tar_num: 33
107
+ weight: 10
108
+
109
+ S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616:
110
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_setting_qa_527_updated_by_cywang_added_by_20250616/raw_data/s2s_handle/xlgeng_new_data/s2t_no_thinking/doubao/combines_list.txt
111
+ shard_num: 8
112
+ weight: 10
113
+
114
+ # ------------------------------------------- s2s paralinguistic, no thinking end -------------------------------------------
115
+
116
+
117
+
118
+ # Knowledge QA
119
+ S2SChat_syndata_merged_by_300W_zhguo_added_by_20250616:
120
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_syndata_merged_by_300W_zhguo_added_by_20250616/combines_data_s2t/combines_list.txt
121
+ tar_num: 3000
122
+ S2SChat_osum_total_data_lst_check_final_100W_by_zhguo_added_by_20250616:
123
+ path: /home/A02_tmpdata3/osum_s2s/S2SChat_osum_total_data_lst_check_final_100W_by_zhguo_added_by_20250616/combines_data_s2t/combines_list.txt
124
+ tar_num: 1000
125
+ # ====================================== s2t paralinguistic, no thinking end ==========================
126
+
127
+
128
+
129
+ # Speech understanding ==========================================
130
+ asr:
131
+ huawei_path: "/mnt/sfs/asr/asr/shards_list.txt" # 2.4
132
+ lab_path: "/home/node54_tmpdata/xlgeng/asr_data_2w/shards_list.txt"
133
+ path: "/home/A03_tmpdata1/s2s/asr_data_2.4w/asr_data_2w/shards_list.txt"
134
+ shard_num: 15477
135
+ weight: 0.1 # ~10000h
136
+
137
+
138
+ # =========== Understanding tasks ==============================================
139
+ librispeech:
140
+ huawei_path: "/mnt/sfs/asr/update_data/LibriSpeech_shard_common/shards_list.txt" #1000h
141
+ lab_path: "/home/work_nfs15/asr_data/data/LibriSpeech/LibriSpeech_shard_common/shards_list.txt"
142
+ path: "/home/A03_tmpdata3/asr_data/librispeech/shards_list.txt"
143
+ shard_num: 282
144
+ weight: 1
145
+ mix_asru200_add_2025_2_14:
146
+ huawei_path: "/mnt/sfs/asr/update_data/mix_asru200_add_2025_2_14/shards_list.txt" # 200
147
+ path: "/home/A03_tmpdata1/s2s/asru700/train/shards_list.txt"
148
+ lab_path: "/home/work_nfs15/asr_data/data/ASRU700/train/shards_list.txt" # Chinese-English code-switched data; words are separated by spaces
149
+ shard_num: 187
150
+ weight: 1
151
+
152
+
153
+
154
+
155
+
156
+ caption:
157
+ path: "/home/A02_tmpdata3/osum_s2s/caption/shards_list.txt"
158
+ huawei_path: "/mnt/sfs/asr/update_data/caption/shards_list.txt" # 319h
159
+ lab_path: "/home/node54_tmpdata2/data4understand/update_data/caption/shards_list.txt" # a concatenation of the caption AudioSet data and AISHELL-2
160
+ shard_num: 319
161
+ weight: 0.5
162
+ caption_add_2025_1_6:
163
+ path: "/home/A02_tmpdata3/osum_s2s/caption_add_2025_1_6/shards_list.txt"
164
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0306/add_label/shards_list.txt"
165
+ huawei_path: "/mnt/sfs/asr/update_data/caption_2025_1_6_newadd/shards_list.txt" # 130h
166
+ shard_num: 392
167
+ weight: 0.5
168
+ caption_aslp_add_2025_1_15:
169
+ path: "/home/A02_tmpdata3/osum_s2s/caption_aslp_add_2025_1_15/shards_list.txt"
170
+ huawei_path: "/mnt/sfs/asr/update_data/caption_aslp_add_2025_1_15/shards_list.txt" # 5h
171
+ shard_num: 5
172
+ lab_path: "/home/work_nfs9/yacao/nfs7_copy/yacao/shard/0114_wjtian_simu2/aslp_caption_train/shards_list.txt"
173
+ weight: 5
174
+
175
+
176
+
177
+ # 50-class caption
178
+ s2t_caption_50label:
179
+ shard_num: 392
180
+ path: "/home/A02_tmpdata3/osum_s2s/s2t_caption_50label/shards_list.txt"
181
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0306/add_label/shards_list.txt"
182
+ huawei_path: "/mnt/sfs/asr/update_data/0106_twj_shard_caption_50label_add_by_2025_3_10/shards_list.txt" # 392tar
183
+ weight: 0.5 # 10
184
+
185
+
186
+
187
+
188
+ emotion: # incomplete, 312 tars
189
+ path: "/home/A02_tmpdata3/osum_s2s/emotion/shards_list.txt"
190
+ lab_path: "/home/xlgeng/sdb2/emotion/shards_list.txt"
191
+ huawei_path: "/mnt/sfs/asr/emotion/shards_list.txt"
192
+ shard_num: 370
193
+ weight: 0.5 # 538h
194
+ emotion_stage2_add:
195
+ path: "/home/A02_tmpdata3/osum_s2s/emotion_stage2_add/shards_list.txt"
196
+ lab_path: "/home/xlgeng/sdb2/emotion_stage2_add/shards_list.txt"
197
+ huawei_path: "/mnt/sfs/asr/emotion_stage2_add/shards_list.txt"
198
+ shard_num: 44
199
+ weight: 0.1 # 150h
200
+ emotion_stage3_add:
201
+ path: "/home/A02_tmpdata3/osum_s2s/emotion_stage3_add/shards_list.txt"
202
+ lab_path: "/home/xlgeng/sdb2/emotion_stage3_add/shards_list.txt"
203
+ huawei_path: "/mnt/sfs/asr/emotion_stage3_add/shards_list.txt"
204
+ shard_num: 53
205
+ weight: 0.1 # 138h
206
+ emotion_stage4_add:
207
+ path: "/home/A02_tmpdata3/osum_s2s/emotion_stage4_add/shards_list.txt"
208
+ lab_path: "/home/xlgeng/sdb2/emotion_stage4_add/shards_list.txt"
209
+ huawei_path: "/mnt/sfs/asr/emotion_stage4_add/shards_list.txt"
210
+ shard_num: 54
211
+ weight: 0.1 #100h
212
+ emotion_stage5_add:
213
+ path: "/home/A02_tmpdata3/osum_s2s/emotion_stage5_add/shards_list.txt"
214
+ lab_path: "/home/xlgeng/sdb2/emotion_stage5_add/shards_list.txt"
215
+ shard_num: 53
216
+ huawei_path: "/mnt/sfs/asr/emotion_stage5_add/shards_list.txt"
217
+ weight: 0.1
218
+
219
+ emotion_meld:
220
+ path: "/home/A02_tmpdata3/osum_s2s/emotion_meld/shards_list.txt"
221
+ lab_path: "/home/xlgeng/sdb2/emotion_meld/shards_list.txt"
222
+ huawei_path: "/mnt/sfs/asr/update_data/emotion_meld/shards_list.txt" # 8h
223
+ shard_num: 9
224
+ weight: 1
225
+ #emotion_dis_fear_add_2025_1_15:
226
+ # huawei_path: "/mnt/sfs/asr/update_data/emotion_dis_fear_add_2025_1_15/shards_list.txt"
227
+ # weight: 0
228
+
229
+ emotion_lucy_Q_added_2025_4_9:
230
+ path: "/home/A02_tmpdata3/osum_s2s/s2s_lucy_Q_emotion/shards_list.txt"
231
+ shard_num: 121
232
+ lab_path: "/home/work_nfs11/cywang/data/shard/emotion/QEmo_Q_train/shards_list.txt"
233
+ huawei_path: "/mnt/sfs/asr/update_data/emotion_lucy_Q_added_2025_4_9/shards_list.txt"
234
+ weight: 0.5
235
+
236
+
237
+
238
+ Age_with_noize_add_2025_2_4: # incomplete, only 245 so far
239
+ path: "/home/A02_tmpdata3/osum_s2s/age_3000_noize/shards_list.txt"
240
+ lab_path: "/home/work_nfs6/syliu/for_gxl/Age/simu_age/shards_list.txt"
241
+ shard_num: 2720
242
+ huawei_path: "/mnt/sfs/asr/update_data/Age_with_noize_add_2025_2_4/shards_list.txt"
243
+ weight: 0.1
244
+ age:
245
+ path: "/home/A02_tmpdata3/osum_s2s/age_3000/shards_list.txt"
246
+ lab_path: "/home/work_nfs3/syliu/for_gxl/Age/age/shards_list.txt"
247
+ huawei_path: "/mnt/sfs/asr/update_data/age/shards_list.txt"
248
+ shard_num: 2820
249
+ weight: 0.1 #1.5 # 3000h
250
+
251
+
252
+ gender: # incomplete, currently 310
253
+ shard_num: 1738
254
+ lab_path: "/home/xlgeng/sdb2/gender/shards_list.txt"
255
+ huawei_path: "/mnt/sfs/asr/update_data/sex/shards_list.txt" # 3000
256
+ path: "/home/A02_tmpdata3/osum_s2s/gender/shards_list.txt"
257
+ weight: 0.1 #1.5
258
+ gender_add_2025_1_6_kaggle: # complete
259
+ shard_num: 116
260
+ path: "/home/A02_tmpdata3/osum_s2s/gender_kaggle/shards_list.txt"
261
+ lab_path: "/home/work_nfs3/syliu/for_gxl/new_gender/Sex/sex/shards_list.txt"
262
+ huawei_path: "/mnt/sfs/asr/update_data/sex_2025_1_6_newadd/shards_list.txt" # 107h, kaggle
263
+ weight: 0.1 #3
264
+ gender_add_2025_2_4_fix: # 2100 tars # incomplete, 365
265
+ path: "/home/A02_tmpdata3/osum_s2s/gender_add_2025_2_4_fix/shards_list.txt"
266
+ shard_num: 2140
267
+ lab_path: "/home/work_nfs6/xlgeng/for_gxl/gender_add_2025_2_4_fix/shards_list.txt"
268
+ huawei_path: "/mnt/sfs/asr/update_data/gender_add_2025_2_4_fix/shards_list.txt"
269
+ weight: 0.1
270
+ gender_with_noize_add_2025_2_4: # 1500h, 780 tars # incomplete, 266
271
+ path: "/home/A02_tmpdata3/osum_s2s/gender_with_noize_add_2025_2_4/shards_list.txt"
272
+ lab_path: "/home/work_nfs6/xlgeng/for_gxl/gender_with_noize_add_2025_2_4/shards_list.txt"
273
+ huawei_path: "/mnt/sfs/asr/update_data/gender_with_noize_add_2025_2_4/shards_list.txt"
274
+ shard_num: 780
275
+ weight: 0.1
276
+
277
+
278
+
279
+
280
+ age_gender_stage2_add:
281
+ path: "/home/A02_tmpdata3/osum_s2s/age_gender_stage2_add/shards_list.txt"
282
+ lab_path: "/home/xlgeng/sdb2/age_gender_stage2_add/shards_list.txt"
283
+ huawei_path: "/mnt/sfs/asr/update_data/Speech_Age_Sex/shards_list.txt"
284
+ weight: 0.1 # 174h
285
+
286
+ age_gender_add_2025_1_13:
287
+ path: "/home/A02_tmpdata3/osum_s2s/age_gender_add_2025_1_13/shards_list.txt"
288
+ lab_path: "/home/work_nfs3/syliu/for_gxl/Age_Sex/age_sex/shards_list.txt"
289
+ huawei_path: "/mnt/sfs/asr/update_data/Speech_Age_Sex_add_2025_1_13/shards_list.txt"
290
+ weight: 0.1 #2571h
291
+
292
+ style_age_gender_stage3_add:
293
+ path: "/home/A02_tmpdata3/osum_s2s/style_age_gender_stage3_add/shards_list.txt"
294
+ lab_path: "/home/xlgeng/sdb2/style_age_gender_stage3_add/shards_list.txt"
295
+ huawei_path: "/mnt/sfs/asr/update_data/Speech_Style_Age_Sex/shards_list.txt"
296
+ weight: 0.1 # 85h
297
+
298
+
299
+ age_gender_pure_stage3_add:
300
+ path: "/home/A02_tmpdata3/osum_s2s/age_gender_pure_stage3_add/shards_list.txt"
301
+ lab_path: "/home/xlgeng/sdb2/age_gender_pure_stage3_add/shards_list.txt"
302
+ huawei_path: "/mnt/sfs/asr/update_data/Age_Sex/shards_list.txt"
303
+ weight: 0.1 # 174h
304
+
305
+
306
+ style_age_gender_pure_stage3_add:
307
+ path: "/home/A02_tmpdata3/osum_s2s/style_age_gender_pure_stage3_add/shards_list.txt"
308
+ lab_path: "/home/xlgeng/sdb2/style_age_gender_pure_stage3_add/shards_list.txt"
309
+ huawei_path: "/mnt/sfs/asr/update_data/Style_Age_Sex/shards_list.txt"
310
+ weight: 0.1 # 85h
311
+
312
+
313
+
314
+
315
+ # Multi-task, caption
316
+ merged_output_caption_age_gender_add_2025_2_26:
317
+ path: "/home/A02_tmpdata3/osum_s2s/merged_output_caption_age_gender_add_2025_2_26/shards_list.txt"
318
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0226/merged_output/shards_list.txt"
319
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/caption_new/merged_output/shards_list.txt"
320
+ weight: 0.1
321
+ nfs10_time1_output_caption_age_gender_add_2025_2_26:
322
+ path: "/home/A02_tmpdata3/osum_s2s/nfs10_time1_output_caption_age_gender_add_2025_2_26/shards_list.txt"
323
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0226/nfs10_time1/shards_list.txt"
324
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/caption_new/nfs10_time1/shards_list.txt"
325
+ weight: 0.1
326
+ other_20000_caption_age_gender_add_2025_2_26:
327
+ path: "/home/A02_tmpdata3/osum_s2s/other_20000_caption_age_gender_add_2025_2_26/shards_list.txt"
328
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0226/other_20000/shards_list.txt"
329
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/caption_new/other_20000/shards_list.txt"
330
+ weight: 0.1
331
+ simu9_1227_caption_age_gender_add_2025_2_26:
332
+ path: "/home/A02_tmpdata3/osum_s2s/simu9_1227_caption_age_gender_add_2025_2_26/shards_list.txt"
333
+ lab_path: "/home/work_nfs7/yacao/0106_twj_shard/shards_0226/simu9_1227/shards_list.txt"
334
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/caption_new/simu9_1227/shards_list.txt"
335
+ weight: 0.1
336
+
337
+
338
+ # Multi-task, emotion
339
+ merged_output_emotion_age_gender_add_2025_3_2:
340
+ path: "/home/A02_tmpdata3/osum_s2s/merged_output_emotion_age_gender_add_2025_3_2/shards_list.txt"
341
+ lab_path: "/home/work_nfs16/emotion_data/OSUM_age_gender/emotion_age_gender1/shards_list.txt"
342
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/emotion_age_gender1/shards_list.txt"
343
+ weight: 0.1
344
+
345
+ merged_output_emotion_age_gender_add_2025_3_2_di2pi:
346
+ path: "/home/A02_tmpdata3/osum_s2s/merged_output_emotion_age_gender_add_2025_3_2_di2pi/shards_list.txt"
347
+ shard_num: 181
348
+ lab_path: "/home/work_nfs16/emotion_data/OSUM_age_gender/emotion_age_gender2/shards_list.txt"
349
+ huawei_path: ""
350
+ weight: 0.1
351
+
352
+
353
+ # Multi-task, style
354
+ merged_output_style_age_gender_add_2025_3_2:
355
+ path: "/home/A02_tmpdata3/osum_s2s/merged_output_style_age_gender_add_2025_3_2/shards_list.txt"
356
+ lab_path: "/home/node54_tmpdata2/gjli/style_age_gender_data/style_labeling_100wto200w_part1_age_gender/shards_list.txt"
357
+ shard_num: 107
358
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/style_labeling_100wto200w_part1_age_gender/shards_list.txt"
359
+ weight: 0.1
360
+ merged_output_style_origin_tts_age_gender_add_2025_3_2:
361
+ path: "/home/A02_tmpdata3/osum_s2s/merged_output_style_origin_tts_age_gender_add_2025_3_2/shards_list.txt"
362
+ lab_path: "/home/node54_tmpdata2/gjli/style_age_gender_data/style_origin_tts_age_gender/shards_list.txt"
363
+ huawei_path: "/mnt/sfs/asr/update_data/multi_task/style_origin_tts_age_gender/shards_list.txt"
364
+ weight: 0.1
365
+ style_labeling_100wto200w_part1_age_gender_emotion_gjli:
366
+ path: "/home/A02_tmpdata3/osum_s2s/style_labeling_100wto200w_part1_age_gender_emotion_gjli/shards_list.txt"
367
+ lab_path: "/home/node54_tmpdata2/gjli/style_labeling_100wto200w_part1_age_gender_emotion/shards_list.txt" # 107
368
+ huawei_path: "/mnt/sfs/asr/update_data/style_labeling_100wto200w_part1_age_gender_emotion/shards_list.txt" #107tar
369
+ weight: 0.5
370
+ style_labeling_200wto300w_part1_age_gender_emotion_gjli:
371
+ path: "/home/A02_tmpdata3/osum_s2s/style_labeling_200wto300w_part1_age_gender_emotion_gjli/shards_list.txt"
372
+ lab_path: "/home/node54_tmpdata2/gjli/style_labeling_200wto300w_part2/shards_list.txt"
373
+ shard_num: 236
374
+ huawei_path: "_"
375
+
376
+ age_gender_style_emotion1_add_2025_3_29_zxzhao:
377
+ path: "/home/A02_tmpdata3/osum_s2s/age_gender_style_emotion1_add_2025_3_29_zxzhao/shards_list.txt"
378
+ lab_path: "/home/work_nfs16/emotion_data/OSUM_age_gender/age_gender_style_emotion1/shards_list.txt"
379
+ huawei_path: "/mnt/sfs/asr/update_data/age_gender_style_emotion1_add_2025_3_29_zxzhao/shards_list.txt" # 256tar
380
+ weight: 0.5
381
+
382
+
383
+ 5_label_caption_age_gender_style_emotion_added_2025_3_29_yacao:
384
+ path: "/home/A02_tmpdata3/osum_s2s/5_label_caption_age_gender_style_emotion_added_2025_3_29_yacao/shards_list.txt"
385
+ huawei_path: "/mnt/sfs/asr/update_data/5_label_caption_age_gender_style_emotion_added_2025_3_29_yacao/shards_list.txt" #270tar
386
+ lab_path: "/home/work_nfs7/yacao/0320_multilabel_2/shard/5_label/shards_list.txt"
387
+ weight: 0.5
388
+
389
+ # audio description data
390
+ audio_caption_by_wjtian_added_by_20250414: # actually 20250411; the date was mistyped because of auto-completion
391
+ path: "/home/A02_tmpdata3/osum_s2s/audio_caption_by_wjtian_added_by_20250414/shards_list.txt"
392
+ lab_path: "/home/work_nfs7/cywang/OSUM/OSUM_data/shard/audio_caption/audio_caption/shards_list.txt"
393
+ huawei_path: "/mnt/sfs/asr/update_data/audio_caption_by_wjtian_added_by_20250414/shards_list.txt" # 2155 tar
394
+ weight: 0.15 # start of the audio_caption (audio description) data prepared by wjtian
395
+
396
+
397
+ S2SChat_MMAU_training_all_by_wjtian_added_by_20250708:
398
+ path: "/home/A02_tmpdata3/osum_s2s/S2SChat_MMAU_training_all_by_wjtian_added_by_20250708/shards_list.txt"
399
+ lab_path: "/home/work_nfs11/cywang/data/shard/S2Chat/MMAU-training-all/shards_list.txt"
400
+ huawei_path: "/mnt/sfs/asr/update_data/S2SChat_MMAU_training_all_by_wjtian_added_by_20250708/shards_list.txt" # 1000 tar
401
+ shard_num: 22
402
+ weight: 5
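Each entry above follows the same schema: `path` points at a `shards_list.txt`/`combines_list.txt`, `tar_num`/`shard_num` records how many tar shards the list contains, and `weight` scales how often the set is sampled during training. Below is a minimal sketch of how such a YAML could be expanded into a weighted shard list; the helper name is illustrative only, and the repo's real loader (common_utils/load_combine_type_yaml.py) may work differently.

# Hypothetical sketch: expand a data YAML like the one above into a weighted shard list.
import random
import yaml

def build_weighted_shard_list(yaml_path: str, seed: int = 0) -> list:
    with open(yaml_path, "r", encoding="utf-8") as f:
        datasets = yaml.safe_load(f)
    rng = random.Random(seed)
    sampled = []
    for name, cfg in datasets.items():
        list_path = cfg.get("path")
        weight = float(cfg.get("weight", 1.0))
        if not list_path:
            continue
        with open(list_path, "r", encoding="utf-8") as f:
            shards = [line.strip() for line in f if line.strip()]
        # weight 0.1 keeps roughly 10% of the shards; weight 10 repeats every shard 10 times
        repeats, frac = divmod(weight, 1.0)
        kept = shards * int(repeats) + [s for s in shards if rng.random() < frac]
        sampled.extend((name, s) for s in kept)
    rng.shuffle(sampled)
    return sampled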
conf/data_t2s.yaml ADDED
@@ -0,0 +1,28 @@
1
+ TEXT2TOKEN_hq_add_2025_3_17:
2
+ lab_path: "/home/node48_tmpdata/hkxie/4O/speech_data_final/10wh_token_data/TEXT2TOKEN_hq/shards_list.txt"
3
+ path_huawei: "/mnt/sfs/asr/update_data/TEXT2TOKEN_hq_add_2025_3_17/shards_list.txt"
4
+ path: /home/A03_tmpdata2/s2s/2000_hq_S2Chat_by_hkxie_added_by_20250411/combine_tts/combines_list.txt
5
+ weight: 5
6
+
7
+ #english_text_token_add_2025_3_26:
8
+ # path_huawei: "/mnt/sfs/asr/update_data/english_speech_data_final_TEXT2TOKEN_part_1_added_2025_3_26/shards_list.txt" # 2000
9
+ # data_list_path: "/home/work_nfs14/code/hkxie/ASR/understanding_LLM_task/english/speech_data_final/data_libriheavy_part_1.list"
10
+ # lab_path: "?"
11
+ # weight: 0.5 #1 #1 #10
12
+ #english_TEXT2TOKEN_part_2_added_by_20250402:
13
+ # path_huawei: "/mnt/sfs/asr/update_data/english_TEXT2TOKEN_part_2_added_by_20250402/shards_list.txt" #8050
14
+ # data_list_path: "/home/work_nfs14/code/hkxie/ASR/understanding_LLM_task/english/speech_data_final/data_libriheavy_part_2.list"
15
+ # lab_path: "?"
16
+ # weight: 0.5 #1 #1 #10
17
+ #
18
+ #zh_en_mix_tts_added_by_20250402: # tts
19
+ # path_huawei: "/mnt/sfs/asr/update_data/zh_en_mix_s2s_added_by_20250402/shards_list.txt" # 7
20
+ # weight: 1
21
+ #poly_tts_added_by_20250402: # tts
22
+ # path: "/mnt/sfs/asr/update_data/poly_s2s_added_by_20250402/shards_list.txt" # 295
23
+ # weight: 0.5 #10
24
+ #
25
+ #text2token_itn_by_cywang_added_by_20250428: # owned by zyzhang, packaged by cywang
26
+ # lab_path: "/home/work_nfs7/cywang/OSUM/OSUM_data/shard/text2token/tn/shards_list.txt"
27
+ # path_huawei: "/mnt/sfs/asr/update_data/text2token_itn_by_cywang_added_by_20250428/shards_list.txt" # 1100
28
+ # weight: 0.5
conf/data_t2t.yaml ADDED
@@ -0,0 +1,159 @@
1
+ # Text-to-text
2
+ #text2text_added_2025_4_4:
3
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_added_by_20250404/shards_list.txt" # 1850
4
+ # weight: 1
5
+ #text2text_2_added_by_20250409:
6
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_2_added_by_20250409/shards_list.txt" # 2000
7
+ # weight: 1
8
+ #
9
+ #text2text_3_en_added_by_20250411:
10
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_3_en_added_by_20250411/shards_list.txt" # 185
11
+ # weight: 1
12
+ #
13
+ #text2text_4_en_added_by_20250416:
14
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_4_en_added_by_20250416/shards_list.txt"
15
+ # weight: 1
16
+
17
+ #text2text_5_lucy_audioQA_1M_by_cywang_added_by_20250426:
18
+ # shard_num: 10000
19
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_5_lucy_audioQA_1M_by_cywang_added_by_20250426/shards_list.txt"
20
+ # weight: 0.1
21
+
22
+ t2t_8772K_by_xlgeng_added_by_20250513:
23
+ path: "/home/A03_tmpdata1/text2text_data_xlgeng/t2t_8772K/shards_list.txt"
24
+ path_huawei: "/mnt/sfs/asr/update_data/t2t_8772K_by_xlgeng_added_by_20250513/shards_list.txt"
25
+ weight: 0.1
26
+
27
+ #t2t_math_poetry_self_by_xlgeng_added_by_20250513:
28
+ # path: "/home/A03_tmpdata1/text2text_data_xlgeng/t2t_math_poetry_self/shards_list.txt"
29
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_math_poetry_self_by_xlgeng_added_by_20250513/shards_list.txt" # 75
30
+ # weight: 1
31
+
32
+ Alpaca_CoT_3000W_by_wjt_added_by_20250605:
33
+ lab_path_huawei: ""
34
+ shard_num: 30000
35
+ path_huawei: "/mnt/sfs/asr/update_data/Alpaca_CoT_3000W_by_wjt_added_by_20250605/shards_list.txt"
36
+ path: "/home/A03_tmpdata1/text2text_data_xlgeng/Alpaca-CoT_3000W/shards_list.txt"
37
+ weight: 0.15
38
+
39
+
40
+ qwenomni_bench_data:
41
+ path: "/home/A02_tmpdata3/osum_t2t/qwenomni_bench_data/shards_list.txt"
42
+ weight: 3
43
+
44
+ three_kingdoms:
45
+ path: "/home/A02_tmpdata3/osum_t2t/three_kingdoms/shards_list.txt"
46
+ weight: 3
47
+
48
+ voicebench_data:
49
+ path: "/home/A02_tmpdata3/osum_t2t/voicebench_data/shards_list.txt"
50
+ weight: 3
51
+
52
+ t2t_osum_self_instruction_8K:
53
+ path: "/home/A02_tmpdata3/osum_t2t/t2t_osum_self_instruction_8K/shards_list.txt"
54
+ weight: 3
55
+
56
+
57
+ #t2t_osum_self_instruction_8K_by_xlgeng_added_by_20250529:
58
+ # path: "/home/A02_tmpdata3/t2t_osum_self_instruction_8K_by_xlgeng_added_by_20250529/shards_list.txt"
59
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_osum_self_instruction_8K_by_xlgeng_added_by_20250529/shards_list.txt"
60
+ # weight: 5
61
+
62
+ #kouyu_t2t_data_by_xlgeng_added_by_20250622:
63
+ # path: ""
64
+ # shard_num: 1758
65
+ # path_huawei: "/mnt/sfs/asr/update_data/kouyu_t2t_data_by_xlgeng_added_by_20250622s/shards_list.txt"
66
+ # weight: 1
67
+ # 4653
68
+
69
+ #text2text_data_xlgeng_three_kingdoms_by_xlgeng_added_by_20250701:
70
+ # path: "/home/A02_tmpdata3/text2text_data_xlgeng_three_kingdoms_by_xlgeng_added_by_20250701/shards_list.txt"
71
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_data_xlgeng_three_kingdoms_by_xlgeng_added_by_20250701/shards_list.txt"
72
+ # weight: 1
73
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/text2text_data_xlgeng/shard/benchdata/three_kingdoms/shard/shards_list.txt"
74
+ # shard_num: 24
75
+ #
76
+ #text2text_data_xlgeng_qwenomni_bench_data_by_xlgeng_added_by_20250701:
77
+ # path: "/home/A02_tmpdata3/text2text_data_xlgeng_qwenomni_bench_data_by_xlgeng_added_by_20250701/shards_list.txt"
78
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_data_xlgeng_qwenomni_bench_data_by_xlgeng_added_by_20250701/shards_list.txt"
79
+ # weight: 1
80
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/text2text_data_xlgeng/shard/benchdata/qwenomni_bench_data/shard/shards_list.txt"
81
+ # shard_num: 113
82
+
83
+
84
+ #text2text_data_xlgeng_voicebench_data_by_xlgeng_added_by_20250701:
85
+ # path: "/home/A02_tmpdata3/text2text_data_xlgeng_voicebench_data_by_xlgeng_added_by_20250701/shards_list.txt"
86
+ # path_huawei: "/mnt/sfs/asr/update_data/text2text_data_xlgeng_voicebench_data_by_xlgeng_added_by_20250701/shards_list.txt"
87
+ # weight: 1
88
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/text2text_data_xlgeng/shard/benchdata/voicebench_data/shard/shards_list.txt"
89
+ # shard_num: 65
90
+ #
91
+ #t2t_age_chat_by_cywang_added_by_20250708: # have
92
+ # path: "/home/A02_tmpdata3/osum_s2s/t2t_age_chat_by_cywang_added_by_20250708/shards_list.txt"
93
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_age_chat_by_cywang_added_by_20250708/shards_list.txt"
94
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/age_chat/shard_dir/shards_list.txt"
95
+ # shard_num: 50
96
+ # weight: 1
97
+ #
98
+ #t2t_caption_chat_by_cywang_added_by_20250708: # have
99
+ # path: "/home/A02_tmpdata3/osum_s2s/t2t_caption_chat_by_cywang_added_by_20250708/shards_list.txt"
100
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/caption_chat/shard_dir/shards_list.txt"
101
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_caption_chat_by_cywang_added_by_20250708/shards_list.txt"
102
+ # shard_num: 100
103
+ # weight: 1
104
+ #
105
+ #t2t_emotion_chat_by_cywang_added_by_20250708: # have
106
+ # path: "/home/A02_tmpdata3/osum_s2s/t2t_emotion_chat_by_cywang_added_by_20250708/shards_list.txt"
107
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/emotion_chat/shard_dir/shards_list.txt"
108
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_emotion_chat_by_cywang_added_by_20250708/shards_list.txt"
109
+ # shard_num: 50
110
+ # weight: 1
111
+ #
112
+ #t2t_sex_chat_by_cywang_added_by_20250708: # have
113
+ # path: "/home/A02_tmpdata3/osum_s2s/t2t_sex_chat_by_cywang_added_by_20250708/shards_list.txt"
114
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/sex_chat/shard_dir/shards_list.txt"
115
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_sex_chat_by_cywang_added_by_20250708/shards_list.txt"
116
+ # shard_num: 50
117
+ # weight: 1
118
+ #
119
+ #t2t_xianshi_emotion_chat_by_cywang_added_by_20250711: # no
120
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_xianshi_emotion_chat/shards_list.txt"
121
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/xianshi_emotion_chat/shard_dir/shards_list.txt|/home/work_nfs23/asr_data/data/osum_chat/t2t_data/t2t_paralanguage_chat/xianshi_emotion_chat/shards_list.txt"
122
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_xianshi_emotion_chat_by_cywang_added_by_20250711/shards_list.txt"
123
+ # shard_num: 50
124
+ # weight: 1
125
+ #
126
+ #t2t_sex_chat_2_by_cywang_added_by_20250711: # no
127
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_sex_chat_2_by_cywang_added_by_20250711/shards_list.txt"
128
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/sex_chat_2/shard_dir/shards_list.txt|/home/work_nfs23/asr_data/data/osum_chat/t2t_data/t2t_paralanguage_chat/t2t_sex_chat_2_by_cywang_added_by_20250711/shards_list.txt"
129
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_sex_chat_2_by_cywang_added_by_20250711/shards_list.txt"
130
+ # shard_num: 27
131
+ # weight: 1
132
+ #
133
+ #t2t_age_chat_2_by_cywang_added_by_20250711: # no
134
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_age_chat_2_by_cywang_added_by_20250711/shards_list.txt"
135
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_age_chat_2_by_cywang_added_by_20250711/shards_list.txt"
136
+ # lab_path_huawei: "/home/work_nfs11/asr_data/data/osum_data/t2t_paralanguage_chat/age_chat_2/shard_dir/shards_list.txt|/home/work_nfs23/asr_data/data/osum_chat/t2t_data/t2t_paralanguage_chat/t2t_age_chat_2_by_cywang_added_by_20250711/shards_list.txt"
137
+ # shard_num: 27
138
+ # weight: 1
139
+ #
140
+ #t2t_sex_chat_2_by_cywang_added_by_20250715: # no
141
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_sex_chat_2_by_cywang_added_by_20250715/shards_list.txt"
142
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_sex_chat_2_by_cywang_added_by_20250715/shards_list.txt"
143
+ # lab_path_huawei: "/home/work_nfs14/asr_data/data/osum_data/t2t_paralanguage_chat/sex_chat_2/shard_dir/shards_list.txt"
144
+ # shard_num: 10
145
+ # weight: 1
146
+ #
147
+ #t2t_age_chat_3_by_cywang_added_by_20250716: # no
148
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_age_chat_3_by_cywang_added_by_20250716/shards_list.txt"
149
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_age_chat_3_by_cywang_added_by_20250716/shards_list.txt"
150
+ # lab_path_huawei: "/home/work_nfs14/asr_data/data/osum_data/t2t_paralanguage_chat/age_chat_3/shard_dir/shards_list.txt"
151
+ # shard_num: 10
152
+ # weight: 1
153
+ #
154
+ #t2t_caption_chat_3_by_cywang_added_by_20250716: # have
155
+ # path: "/home/A02_tmpdata3/osum_t2t/t2t_caption_chat_3_by_cywang_added_by_20250716/shards_list.txt"
156
+ # path_huawei: "/mnt/sfs/asr/update_data/t2t_caption_chat_3_by_cywang_added_by_20250716/shards_list.txt"
157
+ # lab_path_huawei: "/home/work_nfs14/asr_data/data/osum_data/t2t_paralanguage_chat/caption_chat_3/shard_dir/shards_list.txt"
158
+ # shard_num: 10
159
+ # weight: 1
conf/data_tmp.yaml ADDED
@@ -0,0 +1,6 @@
 
1
+ gaozhiliang_gbma:
2
+ path: /home/A02_tmpdata3/osum_s2s/gaozhiliang_gbma/shards_list.txt
3
+ new_data_list: /home/node44_tmpdata3/netease/gbma/workspace/osum/data/process/0803/all_data_info.jsonl
4
+ new_lab_path: /home/work_nfs23/asr_data/data/osum_chat/s2s/gaozhiliang_gbma/shards_list.txt
5
+ shard_num: 24
6
+ weight: 10
conf/ds_stage2.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "train_micro_batch_size_per_gpu": 1,
3
+ "gradient_accumulation_steps": 8,
4
+ "steps_per_print": 10,
5
+ "gradient_clipping": 5,
6
+ "fp16": {
7
+ "enabled": false,
8
+ "auto_cast": true,
9
+ "loss_scale": 0,
10
+ "initial_scale_power": 16,
11
+ "loss_scale_window": 1000,
12
+ "hysteresis": 2,
13
+ "consecutive_hysteresis": false,
14
+ "min_loss_scale": 1
15
+ },
16
+ "bf16": {
17
+ "enabled": true
18
+ },
19
+ "zero_force_ds_cpu_optimizer": false,
20
+ "zero_optimization": {
21
+ "stage": 2,
22
+ "offload_optimizer": {
23
+ "device": "none",
24
+ "pin_memory": true
25
+ },
26
+ "allgather_partitions": true,
27
+ "allgather_bucket_size": 2e8,
28
+ "reduce_scatter": true,
29
+ "reduce_bucket_size": 2e8,
30
+ "contiguous_gradients": false,
31
+ "overlap_comm": false
32
+ },
33
+ "find_unused_parameters": true
34
+ }
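This config runs bf16 training with ZeRO stage 2, a micro-batch of 1 per GPU, and 8 gradient-accumulation steps. A minimal sketch of how a JSON config like this is usually handed to DeepSpeed is shown below; the model and optimizer are placeholders, not the repo's actual training entry point.

# Illustrative only: the model and optimizer below are stand-ins for the real ones.
import deepspeed
import torch
import torch.nn as nn

model = nn.Linear(1024, 1024)  # placeholder for the real speech LLM
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    optimizer=optimizer,
    config="conf/ds_stage2.json",  # the file shown above
)
# effective batch = train_micro_batch_size_per_gpu (1) x gradient_accumulation_steps (8) x world_size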
conf/empty.yaml ADDED
File without changes
conf/prompt_config.yaml ADDED
The diff for this file is too large to render. See raw diff
 
conf/system_prompt.yaml ADDED
@@ -0,0 +1,27 @@
1
+ # qwen_instruct_prompt_pattern_chat_s2t = "<|im_start|>system\nYou are OSUM-chat, a speech-to-text dialogue assistant. You understand both the meaning and paralinguistic cues in speech then respond exclusively with appropriate text.<|im_end|>\n"
2
+ # qwen_instruct_prompt_pattern__chat_t2t = "<|im_start|>system\n<|im_end|>\n"
3
+ # qwen_instruct_prompt_pattern_chat_s2s = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond with appropriate text and emotionally matching synthetic speech.<|im_end|>\n"
4
+ # qwen_instruct_prompt_pattern_chat_s2s_think = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech. Before responding, first output your reasoning inside <think>...</think end>, analyzing the user’s words and vocal cues. Then generate a reply with appropriate text and emotionally matched synthetic speech.<|im_end|>\n"
5
+ # qwen_instruct_prompt_pattern_chat_s2s_streaming = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You analyze speech (content + paralinguistic cues) and respond with interleaved text and emotionally-matched synthetic speech.<|im_end|>\n"
6
+ # qwen_instruct_prompt_pattern_chat_s2s_streaming_think = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You analyze speech (both content and paralinguistic cues). Before responding, output your reasoning in <think>...</think end>. Then reply with interleaved text and emotionally matched synthetic speech.<|im_end|>\n"
7
+ # qwen_instruct_prompt_pattern__chat_t2t = "<|im_start|>system\n
8
+
9
+ # qwen_instruct_prompt_pattern_1_understand = "<|im_start|>system\nYou are OSUM-chat, an audio understanding assistant by ASLP Lab. You can transcribe speech accurately and analyze paralinguistic cues to provide precise text responses.<|im_end|>\n"
10
+ # qwen_instruct_prompt_pattern_1_tts = "<|im_start|>system\nYou are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural and fluent speech from text input.<|im_end|>\n"
11
+ # qwen_instruct_prompt_pattern_1_tts_streaming = "<|im_start|>system\nYou are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural speech from text input and output both audio and the original text in interleaved format.<|im_end|>\n"
12
+ # qwen_instruct_prompt_pattern_1_old = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
13
+ # # user_start = "<|im_start|>user\n"
14
+ t2t_chat: # <TEXT2TEXT>
15
+ prompt: You are OSUM-chat, a text-to-text dialogue assistant by ASLP Lab. You understand user input in text then respond exclusively with appropriate text.
16
+
17
+ s2t_chat: # <S2TCHAT>
18
+ prompt: You are OSUM-chat, a speech-to-text dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond exclusively with appropriate text.
19
+
20
+ s2t_chat_thinker: # <S2TCHAT> <THINKER>
21
+ prompt: You are OSUM-chat, a thinking-enabled speech-to-text dialogue assistant by ASLP Lab. You not only comprehend the semantic meaning and paralinguistic cues in speech but also engage in deliberate reasoning to process such information. Based on this thinking process, you then respond exclusively with appropriate text.
22
+
23
+ t2s: # <TEXT2TOKEN>
24
+ prompt: You are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural and fluent speech from text input.
25
+
26
+ speech_understanding: # <TRANSCRIBE> <CAPTION> ...
27
+ prompt: You are OSUM-chat, an audio understanding assistant by ASLP Lab. You can transcribe speech accurately and analyze paralinguistic cues to provide precise text responses.
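Each task key above maps to a plain system prompt; judging from the commented-out templates at the top of the file, it is wrapped into a Qwen-style "<|im_start|>system ... <|im_end|>" header at inference time. A small sketch of that expansion follows; the helper name is an assumption, not the repo's API.

# Illustrative sketch: turn a task key from conf/system_prompt.yaml into a Qwen-style system header.
import yaml

def build_system_header(task: str, yaml_path: str = "conf/system_prompt.yaml") -> str:
    with open(yaml_path, "r", encoding="utf-8") as f:
        prompts = yaml.safe_load(f)
    prompt = prompts[task]["prompt"]
    return f"<|im_start|>system\n{prompt}<|im_end|>\n"

# e.g. build_system_header("s2t_chat") reproduces the speech-to-text system prompt above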
patches/cumstom_stop_criteria.py ADDED
@@ -0,0 +1,85 @@
1
+ import torch
2
+ from transformers.generation.logits_process import LogitsProcessor
3
+ from transformers.generation.stopping_criteria import StoppingCriteria
4
+
5
+ class ASRLogitsProcessor(LogitsProcessor):
6
+ def __init__(self, text_token_num: int):
7
+ self.text_token_num = text_token_num
8
+
9
+ def __call__(self, input_ids, scores):
10
+ scores[..., self.text_token_num:] = torch.finfo(scores.dtype).min
11
+ return scores
12
+
13
+ class TTSLogitsProcessor(LogitsProcessor):
14
+ """
15
+ LogitsProcessor for the TTS task: sets the logits of all text positions to negative infinity.
16
+ """
17
+ def __init__(self, text_token_num: int):
18
+ self.text_token_num = text_token_num
19
+
20
+ def __call__(self, input_ids, scores):
21
+ scores[..., :self.text_token_num] = torch.finfo(scores.dtype).min
22
+ return scores
23
+
24
+ class S2SLogitsProcessor(LogitsProcessor):
25
+ """Speech 2 Speech 任务使用的 LogitsProcessor,当前只适用于batch_size=1
26
+
27
+ Args:
28
+ text_token_num (int): size of the text vocabulary; ids at or above it are speech tokens
+ text_eos_id (int): id of the text end-of-sequence token that ends the text phase
29
+ """
30
+ def __init__(self, text_token_num: int, text_eos_id: int):
31
+ self.text_token_num = text_token_num
32
+ self.text_eos_id = text_eos_id
33
+ self.text_phase = True
34
+ def __call__(self, input_ids, scores):
35
+ # print(input_ids.shape) # debug
36
+ assert input_ids.size(0) == 1, "ERROR: S2SSpeechLogitsProcessor only support bs=1 now"
37
+ if self.text_phase:
38
+ scores[..., self.text_token_num:] = torch.finfo(scores.dtype).min
39
+ else:
40
+ scores[..., :self.text_token_num] = torch.finfo(scores.dtype).min
41
+
42
+ if self.text_phase and torch.isin(input_ids, self.text_eos_id).any():
43
+ self.text_phase = False
44
+
45
+ return scores
46
+
47
+ class S2SStopCriteria(StoppingCriteria):
48
+ """Speech 2 Speech 任务使用的 停止条件,当前只适用于batch_size=1
49
+
50
+ Args:
51
+ text_eos_id (int): id of the text end-of-sequence token
+ speech_eos_id (int): id of the speech end-of-sequence token
52
+ """
53
+ def __init__(self, text_eos_id: int, speech_eos_id: int):
54
+ self.text_eos_id = text_eos_id
55
+ self.speech_eos_id = speech_eos_id
56
+
57
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
58
+ _input_ids = input_ids.flatten().view(-1)
59
+ if torch.isin(_input_ids, self.text_eos_id).any():
60
+ text_eos_idx = (_input_ids == self.text_eos_id).nonzero(as_tuple=True)[0][0].item()
61
+ if torch.sum(_input_ids[text_eos_idx:] == self.speech_eos_id) > 1:
62
+ return True
63
+ return False
64
+
65
+ class MaxTokenStopper(StoppingCriteria):
66
+ def __init__(self, max_tokens):
67
+ self.max_tokens = max_tokens
68
+
69
+ # TODO@wsy: intended to allow changing max_tokens on the fly, but it does not seem to take effect; revisit later
70
+ def change_max_tokens(self, max_tokens):
71
+ self.max_tokens = max_tokens
72
+
73
+ def __call__(self, input_ids, scores, **kwargs):
74
+ return input_ids.shape[1] >= self.max_tokens # 检查当前序列长度
75
+
76
+ class InterruptStopper(StoppingCriteria):
77
+ def __init__(self):
78
+ self.stop = False
79
+
80
+ def __call__(self, input_ids, scores, **kwargs):
81
+ if self.stop:
82
+ # self.stop = False # reset (intentionally disabled)
83
+ return True
84
+ else:
85
+ return False
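These classes follow the standard transformers LogitsProcessor / StoppingCriteria interfaces, so they can be dropped into model.generate via the usual list wrappers. A hedged sketch follows; all ids, sizes, and the model object are placeholders, and the actual wiring lives in this repo's inference code.

# Illustrative usage only; token ids and counts are placeholders.
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.stopping_criteria import StoppingCriteriaList

text_token_num = 151936                     # placeholder: size of the text vocabulary
text_eos_id, speech_eos_id = 151645, 4096   # placeholder ids

processors = LogitsProcessorList([S2SLogitsProcessor(text_token_num, text_eos_id)])
stoppers = StoppingCriteriaList([
    S2SStopCriteria(text_eos_id, speech_eos_id),
    MaxTokenStopper(max_tokens=2048),
])

# outputs = model.generate(
#     inputs_embeds=inputs_embeds,
#     logits_processor=processors,
#     stopping_criteria=stoppers,
# )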
patches/custom_speech_ngram_blocking.py ADDED
@@ -0,0 +1,129 @@
1
+ from transformers.generation.logits_process import LogitsProcessor
2
+ import torch
3
+
4
+ class SpeechOnlyNGramBlockingLogitsProcessor(LogitsProcessor):
5
+ def __init__(
6
+ self,
7
+ speech_token_num,
8
+ repeat_times=5,
9
+ special_token_repeat_times_dict=None,
10
+ window_size=8,
11
+ window_repeat=5,
12
+ special_token_window_dict=None
13
+ ):
14
+ """
15
+ speech_token_num: int, number of speech tokens (token_id in [0, speech_token_num) is treated as a speech token)
16
+ repeat_times: int, maximum number of consecutive repeats allowed for ordinary speech tokens
17
+ special_token_repeat_times_dict: dict, {token_id: repeat_times}, per-token maximum consecutive repeats for special speech tokens
18
+ window_size: int, default sliding-window size
19
+ window_repeat: int, default maximum number of occurrences allowed within the window
20
+ special_token_window_dict: dict, {token_id: (window_size, window_repeat)}, per-token window parameters for special tokens
21
+ """
22
+ self.speech_token_num = speech_token_num
23
+ self.repeat_times = repeat_times
24
+ self.special_token_repeat_times_dict = special_token_repeat_times_dict or {}
25
+ self.speech_phase = False # this flag must be toggled externally (e.g. via set_phase)
26
+ self.window_size = window_size
27
+ self.window_repeat = window_repeat
28
+ self.special_token_window_dict = special_token_window_dict or {1446: (13, 10)}
29
+
30
+ def set_phase(self, speech_phase: bool):
31
+ self.speech_phase = speech_phase
32
+
33
+ def __call__(self, input_ids, scores):
34
+ if not self.speech_phase:
35
+ # text phase: do nothing
36
+ return scores
37
+ batch_size, seq_len = input_ids.size()
38
+ for batch_idx in range(batch_size):
39
+ generated = input_ids[batch_idx].tolist()
40
+ if seq_len == 0:
41
+ continue
42
+ last_token = generated[-1]
43
+ if last_token >= self.speech_token_num:
44
+ continue # not a speech token
45
+
46
+ # count how many times the most recent token has been repeated consecutively
47
+ repeat_count = 1
48
+ for i in range(seq_len-2, -1, -1):
49
+ if generated[i] == last_token:
50
+ repeat_count += 1
51
+ else:
52
+ break
53
+ # look up the maximum allowed consecutive repeats for this token
54
+ max_repeat = self.special_token_repeat_times_dict.get(last_token, self.repeat_times)
55
+ if repeat_count >= max_repeat:
56
+ scores[batch_idx, last_token] = -float('inf') # block further generation of this token
57
+
58
+ # ====== frequency suppression within a sliding window ======
59
+ # check every speech token inside the window
60
+ window_tokens = set(generated[-max(self.window_size, max([v[0] for v in self.special_token_window_dict.values()], default=0)):])
61
+ for token in window_tokens:
62
+ if token >= self.speech_token_num:
63
+ continue
64
+ # look up the window parameters for this token
65
+ window_size, window_repeat = self.special_token_window_dict.get(
66
+ token, (self.window_size, self.window_repeat)
67
+ )
68
+ window = generated[-window_size:]
69
+ if window.count(token) >= window_repeat:
70
+ scores[batch_idx, token] = -float('inf')
71
+ # ====== end of sliding-window frequency suppression ======
72
+ return scores
73
+
74
+
75
+
76
+
77
+ class OSUM_chat_LogitsProcessor(LogitsProcessor):
78
+ def __init__(self, allowed_tokens, sequence_to_match):
79
+ """
80
+ Initialize the OSUM_chat_LogitsProcessor.
81
+
82
+ Args:
83
+ allowed_tokens (list): token ids allowed at the current step once the trigger sequence has appeared
84
+ sequence_to_match (list): preceding token sequence used to decide when the restriction applies
85
+ """
86
+ self.allowed_tokens = allowed_tokens
87
+ self.sequence_to_match = sequence_to_match
88
+ self.match_found = False # flag indicating whether the trigger sequence has already been matched
89
+
90
+ def init_match_found(self):
91
+ """
92
+ 初始化match_found标志。
93
+ """
94
+ self.match_found = False
95
+
96
+ def __call__(self, input_ids: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
97
+ """
98
+ 在每个时间步处理logits,对不符合条件的token设置极小的概率。
99
+
100
+ 参数:
101
+ input_ids (torch.Tensor): 当前输入的token ID序列
102
+ logits (torch.Tensor): 当前时间步的logits (shape: [batch_size, vocab_size])
103
+
104
+ 返回:
105
+ torch.Tensor: 被处理过的logits
106
+ """
107
+ # if the trigger sequence has already been matched once, skip detection and return the logits unchanged
108
+ # print("recent_tokens:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") # 打印当前生成的序列
109
+ if self.match_found:
110
+ return logits
111
+
112
+ # take the last few tokens of the generated sequence (assuming it is at least as long as the trigger sequence)
113
+ sequence_length = len(self.sequence_to_match)
114
+ if input_ids.shape[-1] >= sequence_length:
115
+ recent_tokens = input_ids[:, -sequence_length:].tolist()
116
+ # print("recent_tokens:", recent_tokens) # 打印当前生成的序列
117
+
118
+ # check whether the preceding tokens match the required trigger sequence
119
+ if all(recent_tokens[0][i] == self.sequence_to_match[i] for i in range(sequence_length)):
120
+ # Create a mask for allowed tokens while preserving original logits
121
+ mask = torch.zeros_like(logits, dtype=torch.bool) # Initialize mask as False
122
+ mask[:, self.allowed_tokens] = True # Mark allowed tokens as True
123
+ # Apply mask: keep original logits for allowed tokens, set others to -inf
124
+ logits = torch.where(mask, logits, -float('inf'))
125
+ # set the flag indicating that the match succeeded
126
+ self.match_found = True
127
+ print("match found!!!!!!!!!!!!!!!!!!!!!!!")
128
+
129
+ return logits
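The n-gram blocker above is inert until set_phase(True) is called, so the caller is expected to flip it once the text segment (and its EOS) has been emitted. A short hedged sketch, with the token count as a placeholder:

# Illustrative usage of SpeechOnlyNGramBlockingLogitsProcessor; the size is a placeholder.
speech_token_num = 4097  # placeholder

blocker = SpeechOnlyNGramBlockingLogitsProcessor(
    speech_token_num,
    repeat_times=5,
    window_size=8,
    window_repeat=5,
)
blocker.set_phase(False)   # text phase: the processor is a no-op
# ... after the text EOS has been generated ...
blocker.set_phase(True)    # speech phase: consecutive and windowed repeats are blocked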
patches/custom_speech_repetition_penalty.py ADDED
@@ -0,0 +1,22 @@
 
1
+ from transformers.generation.logits_process import LogitsProcessor
2
+
3
+ class SpeechOnlyRepetitionPenaltyLogitsProcessor(LogitsProcessor):
4
+ def __init__(self, speech_token_num, penalty=1.2):
5
+ self.speech_token_num = speech_token_num
6
+ self.penalty = penalty
7
+ self.speech_phase = False # this flag must be toggled externally (e.g. via set_phase)
8
+
9
+ def set_phase(self, speech_phase: bool):
10
+ self.speech_phase = speech_phase
11
+
12
+ def __call__(self, input_ids, scores):
13
+ if not self.speech_phase:
14
+ # text phase: do nothing
15
+ return scores
16
+ # speech phase: apply repetition suppression to speech tokens only
17
+ for batch_idx in range(input_ids.size(0)):
18
+ generated = input_ids[batch_idx].tolist()
19
+ for token_id in set(generated):
20
+ if 0 <= token_id < self.speech_token_num:
21
+ scores[batch_idx, token_id] /= self.penalty
22
+ return scores
patches/modelling_fm_infer_gpu.py ADDED
@@ -0,0 +1,18 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import List, Optional, Tuple, Union
4
+ import f5_tts
5
+ from f5_tts.model.backbones.dit_mask import DiT as DiT_
6
+
7
+ _GPU_FM_TORCH_COMPILE = True
8
+
9
+ class GPUDiT(DiT_):
10
+ def __init__(self, *args, **kwargs):
11
+ super().__init__(*args, **kwargs)
12
+ self.fast_forward = torch.compile(self.fast_forward, dynamic=False, fullgraph=True) \
13
+ if _GPU_FM_TORCH_COMPILE else self.fast_forward
14
+
15
+ # ===================================================================
16
+ print("========================= DO FM PATCH ============================")
17
+ # ===================================================================
18
+ f5_tts.model.backbones.dit_mask.DiT = GPUDiT
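Because the patch rebinds f5_tts.model.backbones.dit_mask.DiT at import time, it only affects DiT objects constructed afterwards. A minimal sketch of the required import order, assuming patches is importable as a package:

# Illustrative import order: apply the patch before building the flow-matching model.
import patches.modelling_fm_infer_gpu  # rebinds DiT to GPUDiT as a side effect
from f5_tts.model.backbones.dit_mask import DiT  # now resolves to the compiled subclass

# model = DiT(...)  # instances created after the patch get the torch.compile'd fast_forward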
patches/modelling_qwen2_infer_gpu.py ADDED
@@ -0,0 +1,416 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import List, Optional, Tuple, Union
5
+ import transformers.models
6
+ from transformers.models.qwen2.modeling_qwen2 import (
7
+ Qwen2RotaryEmbedding,
8
+ Qwen2ForCausalLM,
9
+ Qwen2MLP,
10
+ Qwen2RMSNorm,
11
+ apply_rotary_pos_emb,
12
+ repeat_kv,
13
+ _prepare_4d_causal_attention_mask_with_cache_position,
14
+ )
15
+ from transformers.utils import logging
16
+ from transformers.modeling_outputs import CausalLMOutputWithPast
17
+ from transformers.cache_utils import Cache, StaticCache, SlidingWindowCache
18
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
19
+ from .utils import InferTaskCode
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
+ _GPU_QWEN_TORCH_COMPILE = True
24
+
25
+ # ===================================================================
26
+ # =============================Attention=============================
27
+ # ===================================================================
28
+ class GPUQwen2Attention(nn.Module):
29
+ """
30
+ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
31
+ and "Generating Long Sequences with Sparse Transformers".
32
+ """
33
+ def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
34
+ super().__init__()
35
+ self.config = config
36
+ self.layer_idx = layer_idx
37
+ if layer_idx is None:
38
+ logger.warning_once(
39
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
40
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
41
+ "when creating this class."
42
+ )
43
+
44
+ self.hidden_size = config.hidden_size
45
+ self.num_heads = config.num_attention_heads
46
+ self.head_dim = self.hidden_size // self.num_heads
47
+ self.num_key_value_heads = config.num_key_value_heads
48
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
49
+ self.max_position_embeddings = config.max_position_embeddings
50
+ self.rope_theta = config.rope_theta
51
+ self.is_causal = True
52
+ self.attention_dropout = config.attention_dropout
53
+
54
+ if (self.head_dim * self.num_heads) != self.hidden_size:
55
+ raise ValueError(
56
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
57
+ f" and `num_heads`: {self.num_heads})."
58
+ )
59
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
60
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
61
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
62
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
63
+
64
+ self.rotary_emb = Qwen2RotaryEmbedding(
65
+ self.head_dim,
66
+ max_position_embeddings=self.max_position_embeddings,
67
+ base=self.rope_theta,
68
+ )
69
+
70
+ # Adapted from Qwen2Attention.forward
71
+ def forward(
72
+ self,
73
+ hidden_states: torch.Tensor,
74
+ attention_mask: Optional[torch.Tensor] = None,
75
+ position_ids: Optional[torch.LongTensor] = None,
76
+ past_key_value: Optional[Cache] = None,
77
+ output_attentions: bool = False,
78
+ use_cache: bool = False,
79
+ cache_position: Optional[torch.LongTensor] = None,
80
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
81
+ bsz, q_len, _ = hidden_states.size()
82
+
83
+ query_states = self.q_proj(hidden_states)
84
+ key_states = self.k_proj(hidden_states)
85
+ value_states = self.v_proj(hidden_states)
86
+
87
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
88
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
89
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
90
+
91
+ # NOTE: RoPE return all embedding (to satisfy torch compile)
92
+ cos, sin = self.rotary_emb(value_states, seq_len=past_key_value.get_max_length())
93
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
94
+
95
+ if past_key_value is not None:
96
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models
97
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
98
+
99
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
100
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
101
+
102
+ causal_mask = attention_mask
103
+ if attention_mask is not None: # no matter the length, we just slice it
104
+ causal_mask = attention_mask[:, :, :, : past_key_value.get_max_length()]
105
+
106
+ query_states = query_states.contiguous()
107
+ key_states = key_states.contiguous()
108
+ value_states = value_states.contiguous()
109
+
110
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
111
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
112
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
113
+ is_causal = True if causal_mask is None and q_len > 1 else False
114
+
115
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
116
+ query_states,
117
+ key_states,
118
+ value_states,
119
+ attn_mask=causal_mask,
120
+ dropout_p=0.0,
121
+ is_causal=is_causal,
122
+ )
123
+
124
+ attn_output = attn_output.transpose(1, 2).contiguous()
125
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
126
+
127
+ attn_output = self.o_proj(attn_output)
128
+
129
+ return attn_output, None, past_key_value
130
+
131
+
132
+ # ===================================================================
133
+ # =============================Layer=================================
134
+ # ===================================================================
135
+ class GPUQwen2DecoderLayer(nn.Module):
136
+ def __init__(self, config: Qwen2Config, layer_idx: int):
137
+ super().__init__()
138
+ self.hidden_size = config.hidden_size
139
+
140
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
141
+ logger.warning_once(
142
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
143
+ "unexpected results may be encountered."
144
+ )
145
+ self.self_attn = GPUQwen2Attention(config, layer_idx)
146
+
147
+ self.mlp = Qwen2MLP(config)
148
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
149
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
150
+
151
+ def forward(
152
+ self,
153
+ hidden_states: torch.Tensor,
154
+ attention_mask: Optional[torch.Tensor] = None,
155
+ position_ids: Optional[torch.LongTensor] = None,
156
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
157
+ output_attentions: Optional[bool] = False,
158
+ use_cache: Optional[bool] = False,
159
+ cache_position: Optional[torch.LongTensor] = None,
160
+ **kwargs,
161
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
162
+ """
163
+ Args:
164
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
165
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
166
+ `(batch, sequence_length)` where padding elements are indicated by 0.
167
+ output_attentions (`bool`, *optional*):
168
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
169
+ returned tensors for more detail.
170
+ use_cache (`bool`, *optional*):
171
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
172
+ (see `past_key_values`).
173
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
174
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
175
+ Indices depicting the position of the input sequence tokens in the sequence.
176
+ kwargs (`dict`, *optional*):
177
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
178
+ into the model
179
+ """
180
+
181
+ residual = hidden_states
182
+
183
+ hidden_states = self.input_layernorm(hidden_states)
184
+
185
+ # Self Attention
186
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
187
+ hidden_states=hidden_states,
188
+ attention_mask=attention_mask,
189
+ position_ids=position_ids,
190
+ past_key_value=past_key_value,
191
+ output_attentions=output_attentions,
192
+ use_cache=use_cache,
193
+ cache_position=cache_position,
194
+ )
195
+ hidden_states = residual + hidden_states
196
+
197
+ # Fully Connected
198
+ residual = hidden_states
199
+ hidden_states = self.post_attention_layernorm(hidden_states)
200
+ hidden_states = self.mlp(hidden_states)
201
+ hidden_states = residual + hidden_states
202
+
203
+ outputs = (hidden_states,)
204
+
205
+ if output_attentions:
206
+ outputs += (self_attn_weights,)
207
+
208
+ if use_cache:
209
+ outputs += (present_key_value,)
210
+
211
+ return outputs
212
+
213
+ # ===================================================================
214
+ # ========================Qwen2ForCausalLM===========================
215
+ # ===================================================================
216
+ class InferQwen2ForCausalLM(Qwen2ForCausalLM):
217
+ def __init__(self, config):
218
+ super().__init__(config)
219
+ self.compile_forward = torch.compile(self.simplify_forward, dynamic=False, fullgraph=True) \
220
+ if _GPU_QWEN_TORCH_COMPILE else self.simplify_forward
221
+ self.text_phase = True
222
+ '''
223
+ NOTE: 重写原Qwen2ForCausalLM forward函数,torchair直接编译原函数在返回CausalLMOutputWithPast时会出现编译错误
224
+ '''
225
+ def simplify_forward(self,
226
+ input_ids: torch.LongTensor = None,
227
+ attention_mask: Optional[torch.Tensor] = None,
228
+ position_ids: Optional[torch.LongTensor] = None,
229
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
230
+ inputs_embeds: Optional[torch.FloatTensor] = None,
231
+ labels: Optional[torch.LongTensor] = None,
232
+ use_cache: Optional[bool] = None,
233
+ output_attentions: Optional[bool] = None,
234
+ output_hidden_states: Optional[bool] = None,
235
+ return_dict: Optional[bool] = None,
236
+ cache_position: Optional[torch.LongTensor] = None,
237
+ ):
238
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
239
+ output_hidden_states = (
240
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
241
+ )
242
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
243
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
244
+ outputs = self.model(
245
+ input_ids=input_ids,
246
+ attention_mask=attention_mask,
247
+ position_ids=position_ids,
248
+ past_key_values=past_key_values,
249
+ inputs_embeds=inputs_embeds,
250
+ use_cache=use_cache,
251
+ output_attentions=output_attentions,
252
+ output_hidden_states=output_hidden_states,
253
+ return_dict=return_dict,
254
+ cache_position=cache_position,
255
+ )
256
+
257
+ return outputs
258
+
259
+ def forward(self,
260
+ input_ids: torch.LongTensor = None,
261
+ attention_mask: Optional[torch.Tensor] = None,
262
+ position_ids: Optional[torch.LongTensor] = None,
263
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
264
+ inputs_embeds: Optional[torch.FloatTensor] = None,
265
+ labels: Optional[torch.LongTensor] = None,
266
+ use_cache: Optional[bool] = None,
267
+ output_attentions: Optional[bool] = None,
268
+ output_hidden_states: Optional[bool] = None,
269
+ return_dict: Optional[bool] = None,
270
+ cache_position: Optional[torch.LongTensor] = None,
271
+ do_compile = True
272
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
273
+ if past_key_values is not None:
274
+ past_key_values.training = False
275
+ # print(self.text_phase)
276
+ if input_ids is not None:
277
+ if self.text_phase:
278
+ inputs_embeds = self.model.embed_tokens(input_ids)
279
+ else:
280
+ inputs_embeds = self.speech_token_emded(input_ids)
281
+ if torch.isin(input_ids, 151645).any():
282
+ self.text_phase = False
283
+ input_ids = None
284
+
285
+ if (inputs_embeds is not None and cache_position[0] == 0) or do_compile==False :
286
+ # prefill branch
287
+ outputs = self.simplify_forward(input_ids,
288
+ attention_mask,
289
+ position_ids,
290
+ past_key_values,
291
+ inputs_embeds,
292
+ labels,
293
+ use_cache,
294
+ output_attentions,
295
+ output_hidden_states,
296
+ return_dict,
297
+ cache_position)
298
+ else:
299
+ # decoding
300
+ outputs = self.compile_forward(input_ids,
301
+ attention_mask,
302
+ position_ids,
303
+ past_key_values,
304
+ inputs_embeds,
305
+ labels,
306
+ use_cache,
307
+ output_attentions,
308
+ output_hidden_states,
309
+ return_dict,
310
+ cache_position)
311
+
312
+ last_hidden_states = outputs.last_hidden_state
313
+
314
+ if self.text_phase:
315
+ logits = self.lm_head(last_hidden_states)
316
+ else:
317
+ logits = self.speech_head(last_hidden_states)
318
+
319
+ logits = logits.float()
320
+
321
+ return CausalLMOutputWithPast(
322
+ loss=None,
323
+ logits=logits,
324
+ past_key_values=outputs.past_key_values,
325
+ hidden_states=outputs.hidden_states,
326
+ attentions=outputs.attentions,
327
+ )
328
+
329
+
330
+ def prepare_inputs_for_generation(
331
+ self,
332
+ input_ids,
333
+ past_key_values=None,
334
+ attention_mask=None,
335
+ inputs_embeds=None,
336
+ cache_position=None,
337
+ position_ids=None,
338
+ use_cache=True,
339
+ **kwargs,
340
+ ):
341
+ """
342
+ Mainly add static cache support
343
+ """
344
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
345
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
346
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
347
+ if past_key_values is not None:
348
+ if inputs_embeds is not None: # Exception 1
349
+ input_ids = input_ids[:, -cache_position.shape[0] :]
350
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
351
+ input_ids = input_ids[:, cache_position]
352
+
353
+ if attention_mask is not None and position_ids is None:
354
+ # create position_ids on the fly for batch generation
355
+ position_ids = attention_mask.long().cumsum(-1) - 1
356
+ position_ids.masked_fill_(attention_mask == 0, 1)
357
+ if past_key_values:
358
+ position_ids = position_ids[:, -input_ids.shape[1] :]
359
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`,
360
+ # as otherwise the input `position_ids` would have various stride during the decoding.
361
+ # Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case,
362
+ # `position_ids` is already contiguous but with varying stride which retriggers a capture.
363
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
364
+
365
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
366
+ if inputs_embeds is not None and cache_position[0] == 0:
367
+ model_inputs = {"inputs_embeds": inputs_embeds}
368
+ else:
369
+ # NOTE: same as position_ids above; cloned for torch.compile and CUDA graph stability
370
+ input_ids = input_ids.clone(memory_format=torch.contiguous_format)
371
+ model_inputs = {"input_ids": input_ids}
372
+
373
+ if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
374
+ if inputs_embeds is not None and cache_position[0] == 0:
375
+ # prefill phase, inputs_embeds has shape (B,S,H)
376
+ batch_size, sequence_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
377
+ device = inputs_embeds.device
378
+ else:
379
+ # decoding phase, input_ids has shape (B,S)
380
+ batch_size, sequence_length = input_ids.shape
381
+ device = input_ids.device
382
+
383
+ dtype = self.lm_head.weight.dtype
384
+ min_dtype = torch.finfo(dtype).min
385
+
386
+ if inputs_embeds is not None and inputs_embeds.ndim == 2 or input_ids is not None and input_ids.size(-1) == 1:
387
+ # we only expand the attention mask in decoding mode
388
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
389
+ attention_mask,
390
+ sequence_length=sequence_length,
391
+ target_length=past_key_values.get_max_length(),
392
+ dtype=dtype,
393
+ device=device,
394
+ min_dtype=min_dtype,
395
+ cache_position=cache_position,
396
+ batch_size=batch_size,
397
+ )
398
+
399
+ model_inputs.update(
400
+ {
401
+ "position_ids": position_ids,
402
+ "cache_position": cache_position,
403
+ "past_key_values": past_key_values,
404
+ "use_cache": use_cache,
405
+ "attention_mask": attention_mask,
406
+ "do_compile": kwargs['do_compile'],
407
+ }
408
+ )
409
+ return model_inputs
410
+
411
+ # ===================================================================
412
+ print("========================= DO Qwen2 PATCH ===========================")
413
+ # ===================================================================
414
+ transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel._supports_static_cache = True # enable static cache
415
+ transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GPUQwen2DecoderLayer
416
+ transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = InferQwen2ForCausalLM
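Editor's note (sketch, not part of the commit): the class replacements above only take effect if this patch module is imported before the Qwen2 checkpoint is constructed. A minimal usage sketch, with the checkpoint path as a placeholder:

import patches.modelling_qwen2_infer_gpu  # importing applies the monkey patch to transformers
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/osum_echat_checkpoint")  # placeholder path
model.text_phase = True  # reset to the text phase; forward() switches to the speech head once token id 151645 (<|im_end|>) appears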
patches/utils.py ADDED
@@ -0,0 +1,4 @@
1
+ class InferTaskCode:
2
+ _ASR = 0
3
+ _TTS = 1
4
+ _S2S = 2
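A hedged illustration of how these task codes might drive dispatch in the inference code (the helper below is hypothetical, not part of this commit):

from patches.utils import InferTaskCode

def describe_task(task_code: int) -> str:
    # hypothetical helper, for illustration only
    if task_code == InferTaskCode._ASR:
        return "speech-to-text (ASR)"
    if task_code == InferTaskCode._TTS:
        return "text-to-speech (TTS)"
    if task_code == InferTaskCode._S2S:
        return "speech-to-speech (S2S)"
    raise ValueError(f"unknown task code: {task_code}")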
requirements.txt ADDED
@@ -0,0 +1,41 @@
1
+ numpy==1.24
2
+ jsonlines==4.0.0
3
+ torch==2.1.0
4
+ transformers==4.44.0
5
+ torchaudio==2.1.0
6
+ accelerate==1.7.0
7
+ peft==0.17.0
8
+ librosa
9
+ tensorboardX>=2.5
10
+ # torch_npu==2.1.0.post8
11
+ tqdm
12
+ absl-py
13
+ psutil
14
+ cloudpickle
15
+ ml-dtypes
16
+ tornado
17
+ openai-whisper
18
+ colorama
19
+ sox
20
+ deepspeed
22
+ gxl_ai_utils
23
+
24
+
25
+ hyperpyyaml
26
+ modelscope
27
+ onnxruntime
28
+ inflect
29
+ omegaconf
30
+ conformer
31
+ diffusers
32
+ hydra-core
33
+ lightning
34
+
35
+ gradio
36
+ cn2an
37
+ gdown
38
+ matplotlib
39
+ wget
40
+ pyarrow
41
+ pyworld
tts/__init__.py ADDED
File without changes
tts/assert//345/256/236/351/252/214/345/256/244.png ADDED
tts/cosyvoice/__init__.py ADDED
File without changes
tts/cosyvoice/bin/average_model.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) 2020 Mobvoi Inc (Di Wu)
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import argparse
18
+ import glob
19
+
20
+ import yaml
21
+ import torch
22
+
23
+
24
+ def get_args():
25
+ parser = argparse.ArgumentParser(description='average model')
26
+ parser.add_argument('--dst_model', required=True, help='averaged model')
27
+ parser.add_argument('--src_path',
28
+ required=True,
29
+ help='src model path for average')
30
+ parser.add_argument('--val_best',
31
+ action="store_true",
32
+ help='averaged model')
33
+ parser.add_argument('--num',
34
+ default=5,
35
+ type=int,
36
+ help='nums for averaged model')
37
+
38
+ args = parser.parse_args()
39
+ print(args)
40
+ return args
41
+
42
+
43
+ def main():
44
+ args = get_args()
45
+ val_scores = []
46
+ if args.val_best:
47
+ yamls = glob.glob('{}/*.yaml'.format(args.src_path))
48
+ yamls = [
49
+ f for f in yamls
50
+ if not (os.path.basename(f).startswith('train')
51
+ or os.path.basename(f).startswith('init'))
52
+ ]
53
+ for y in yamls:
54
+ with open(y, 'r') as f:
55
+ dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
56
+ loss = float(dic_yaml['loss_dict']['loss'])
57
+ epoch = int(dic_yaml['epoch'])
58
+ step = int(dic_yaml['step'])
59
+ tag = dic_yaml['tag']
60
+ val_scores += [[epoch, step, loss, tag]]
61
+ sorted_val_scores = sorted(val_scores,
62
+ key=lambda x: x[2],
63
+ reverse=False)
64
+ print("best val (epoch, step, loss, tag) = " +
65
+ str(sorted_val_scores[:args.num]))
66
+ path_list = [
67
+ args.src_path + '/epoch_{}_whole.pt'.format(score[0])
68
+ for score in sorted_val_scores[:args.num]
69
+ ]
70
+ print(path_list)
71
+ avg = {}
72
+ num = args.num
73
+ assert num == len(path_list)
74
+ for path in path_list:
75
+ print('Processing {}'.format(path))
76
+ states = torch.load(path, map_location=torch.device('cpu'))
77
+ for k in states.keys():
78
+ if k not in avg.keys():
79
+ avg[k] = states[k].clone()
80
+ else:
81
+ avg[k] += states[k]
82
+ # average
83
+ for k in avg.keys():
84
+ if avg[k] is not None:
85
+ # pytorch 1.6 use true_divide instead of /=
86
+ avg[k] = torch.true_divide(avg[k], num)
87
+ print('Saving to {}'.format(args.dst_model))
88
+ torch.save(avg, args.dst_model)
89
+
90
+
91
+ if __name__ == '__main__':
92
+ main()
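The script writes a plain state_dict, so the averaged checkpoint can be loaded back directly. A minimal sketch, assuming the paths below (they are placeholders):

# e.g. python tts/cosyvoice/bin/average_model.py --src_path exp/llm --dst_model exp/llm/llm.avg5.pt --val_best --num 5
import torch

avg_state = torch.load("exp/llm/llm.avg5.pt", map_location="cpu")  # averaged weights
# some_llm_module.load_state_dict(avg_state, strict=False)  # load into the matching model definition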
tts/cosyvoice/bin/export_jit.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import sys
22
+ import torch
23
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ sys.path.append('{}/../..'.format(ROOT_DIR))
25
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
26
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
27
+
28
+
29
+ def get_args():
30
+ parser = argparse.ArgumentParser(description='export your model for deployment')
31
+ parser.add_argument('--model_dir',
32
+ type=str,
33
+ default='pretrained_models/CosyVoice-300M',
34
+ help='local path')
35
+ args = parser.parse_args()
36
+ print(args)
37
+ return args
38
+
39
+
40
+ def get_optimized_script(model, preserved_attrs=[]):
41
+ script = torch.jit.script(model)
42
+ if preserved_attrs != []:
43
+ script = torch.jit.freeze(script, preserved_attrs=preserved_attrs)
44
+ else:
45
+ script = torch.jit.freeze(script)
46
+ script = torch.jit.optimize_for_inference(script)
47
+ return script
48
+
49
+
50
+ def main():
51
+ args = get_args()
52
+ logging.basicConfig(level=logging.DEBUG,
53
+ format='%(asctime)s %(levelname)s %(message)s')
54
+
55
+ torch._C._jit_set_fusion_strategy([('STATIC', 1)])
56
+ torch._C._jit_set_profiling_mode(False)
57
+ torch._C._jit_set_profiling_executor(False)
58
+
59
+ try:
60
+ model = CosyVoice(args.model_dir)
61
+ except Exception:
62
+ try:
63
+ model = CosyVoice2(args.model_dir)
64
+ except Exception:
65
+ raise TypeError('no valid model_type!')
66
+
67
+ if not isinstance(model, CosyVoice2):
68
+ # 1. export llm text_encoder
69
+ llm_text_encoder = model.model.llm.text_encoder
70
+ script = get_optimized_script(llm_text_encoder)
71
+ script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
72
+ script = get_optimized_script(llm_text_encoder.half())
73
+ script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
74
+
75
+ # 2. export llm llm
76
+ llm_llm = model.model.llm.llm
77
+ script = get_optimized_script(llm_llm, ['forward_chunk'])
78
+ script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
79
+ script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
80
+ script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
81
+
82
+ # 3. export flow encoder
83
+ flow_encoder = model.model.flow.encoder
84
+ script = get_optimized_script(flow_encoder)
85
+ script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
86
+ script = get_optimized_script(flow_encoder.half())
87
+ script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
88
+
89
+
90
+ if __name__ == '__main__':
91
+ main()
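The exported .zip archives are ordinary TorchScript modules, so they can be loaded back without the Python class definitions. A minimal sketch, assuming the fp16 flow encoder was exported as above (paths are placeholders):

import torch

flow_encoder = torch.jit.load("pretrained_models/CosyVoice-300M/flow.encoder.fp16.zip", map_location="cuda")
flow_encoder.eval()  # frozen, inference-optimized module; used in place of model.model.flow.encoder when load_jit=True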
tts/cosyvoice/bin/export_onnx.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, [email protected])
2
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import print_function
17
+
18
+ import argparse
19
+ import logging
20
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
+ import os
22
+ import sys
23
+ import onnxruntime
24
+ import random
25
+ import torch
26
+ from tqdm import tqdm
27
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
28
+ sys.path.append('{}/../..'.format(ROOT_DIR))
29
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
30
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
31
+
32
+
33
+ def get_dummy_input(batch_size, seq_len, out_channels, device):
34
+ x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
35
+ mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
36
+ mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
37
+ t = torch.rand((batch_size), dtype=torch.float32, device=device)
38
+ spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
39
+ cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
40
+ return x, mask, mu, t, spks, cond
41
+
42
+
43
+ def get_args():
44
+ parser = argparse.ArgumentParser(description='export your model for deployment')
45
+ parser.add_argument('--model_dir',
46
+ type=str,
47
+ default='pretrained_models/CosyVoice-300M',
48
+ help='local path')
49
+ args = parser.parse_args()
50
+ print(args)
51
+ return args
52
+
53
+
54
+ def main():
55
+ args = get_args()
56
+ logging.basicConfig(level=logging.DEBUG,
57
+ format='%(asctime)s %(levelname)s %(message)s')
58
+
59
+ try:
60
+ model = CosyVoice(args.model_dir)
61
+ except Exception:
62
+ try:
63
+ model = CosyVoice2(args.model_dir)
64
+ except Exception:
65
+ raise TypeError('no valid model_type!')
66
+
67
+ # 1. export flow decoder estimator
68
+ estimator = model.model.flow.decoder.estimator
69
+
70
+ device = model.model.device
71
+ batch_size, seq_len = 2, 256
72
+ out_channels = model.model.flow.decoder.estimator.out_channels
73
+ x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
74
+ torch.onnx.export(
75
+ estimator,
76
+ (x, mask, mu, t, spks, cond),
77
+ '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
78
+ export_params=True,
79
+ opset_version=18,
80
+ do_constant_folding=True,
81
+ input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
82
+ output_names=['estimator_out'],
83
+ dynamic_axes={
84
+ 'x': {2: 'seq_len'},
85
+ 'mask': {2: 'seq_len'},
86
+ 'mu': {2: 'seq_len'},
87
+ 'cond': {2: 'seq_len'},
88
+ 'estimator_out': {2: 'seq_len'},
89
+ }
90
+ )
91
+
92
+ # 2. test computation consistency
93
+ option = onnxruntime.SessionOptions()
94
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
95
+ option.intra_op_num_threads = 1
96
+ providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
97
+ estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
98
+ sess_options=option, providers=providers)
99
+
100
+ for _ in tqdm(range(10)):
101
+ x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
102
+ output_pytorch = estimator(x, mask, mu, t, spks, cond)
103
+ ort_inputs = {
104
+ 'x': x.cpu().numpy(),
105
+ 'mask': mask.cpu().numpy(),
106
+ 'mu': mu.cpu().numpy(),
107
+ 't': t.cpu().numpy(),
108
+ 'spks': spks.cpu().numpy(),
109
+ 'cond': cond.cpu().numpy()
110
+ }
111
+ output_onnx = estimator_onnx.run(None, ort_inputs)[0]
112
+ torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
tts/cosyvoice/bin/export_trt.sh ADDED
@@ -0,0 +1,10 @@
1
+ #!/bin/bash
2
+ # Copyright 2024 Alibaba Inc. All Rights Reserved.
3
+ # download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda version for compatibility
4
+ # for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
5
+ TRT_DIR=<YOUR_TRT_DIR>
6
+ MODEL_DIR=<COSYVOICE2_MODEL_DIR>
7
+
8
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_DIR/lib:/usr/local/cuda/lib64
9
+ $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp32.mygpu.plan --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw,fp32:chw --outputIOFormats=fp32:chw
10
+ $TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.mygpu.plan --fp16 --minShapes=x:2x80x4,mask:2x1x4,mu:2x80x4,cond:2x80x4 --optShapes=x:2x80x193,mask:2x1x193,mu:2x80x193,cond:2x80x193 --maxShapes=x:2x80x6800,mask:2x1x6800,mu:2x80x6800,cond:2x80x6800 --inputIOFormats=fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw,fp16:chw --outputIOFormats=fp16:chw
tts/cosyvoice/bin/inference.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import torch
22
+ from torch.utils.data import DataLoader
23
+ import torchaudio
24
+ from hyperpyyaml import load_hyperpyyaml
25
+ from tqdm import tqdm
26
+ from cosyvoice.cli.model import CosyVoiceModel
27
+ from cosyvoice.dataset.dataset import Dataset
28
+
29
+
30
+ def get_args():
31
+ parser = argparse.ArgumentParser(description='inference with your model')
32
+ parser.add_argument('--config', required=True, help='config file')
33
+ parser.add_argument('--prompt_data', required=True, help='prompt data file')
34
+ parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
35
+ parser.add_argument('--tts_text', required=True, help='tts input file')
36
+ parser.add_argument('--llm_model', required=True, help='llm model file')
37
+ parser.add_argument('--flow_model', required=True, help='flow model file')
38
+ parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
39
+ parser.add_argument('--gpu',
40
+ type=int,
41
+ default=-1,
42
+ help='gpu id for this rank, -1 for cpu')
43
+ parser.add_argument('--mode',
44
+ default='sft',
45
+ choices=['sft', 'zero_shot'],
46
+ help='inference mode')
47
+ parser.add_argument('--result_dir', required=True, help='asr result file')
48
+ args = parser.parse_args()
49
+ print(args)
50
+ return args
51
+
52
+
53
+ def main():
54
+ args = get_args()
55
+ logging.basicConfig(level=logging.DEBUG,
56
+ format='%(asctime)s %(levelname)s %(message)s')
57
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
58
+
59
+ # Init cosyvoice models from configs
60
+ use_cuda = args.gpu >= 0 and torch.cuda.is_available()
61
+ device = torch.device('cuda' if use_cuda else 'cpu')
62
+ with open(args.config, 'r') as f:
63
+ configs = load_hyperpyyaml(f)
64
+
65
+ model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
66
+ model.load(args.llm_model, args.flow_model, args.hifigan_model)
67
+
68
+ test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False,
69
+ tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
70
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
71
+
72
+ del configs
73
+ os.makedirs(args.result_dir, exist_ok=True)
74
+ fn = os.path.join(args.result_dir, 'wav.scp')
75
+ f = open(fn, 'w')
76
+ with torch.no_grad():
77
+ for _, batch in tqdm(enumerate(test_data_loader)):
78
+ utts = batch["utts"]
79
+ assert len(utts) == 1, "inference mode only supports batch size 1"
80
+ text_token = batch["text_token"].to(device)
81
+ text_token_len = batch["text_token_len"].to(device)
82
+ tts_index = batch["tts_index"]
83
+ tts_text_token = batch["tts_text_token"].to(device)
84
+ tts_text_token_len = batch["tts_text_token_len"].to(device)
85
+ speech_token = batch["speech_token"].to(device)
86
+ speech_token_len = batch["speech_token_len"].to(device)
87
+ speech_feat = batch["speech_feat"].to(device)
88
+ speech_feat_len = batch["speech_feat_len"].to(device)
89
+ utt_embedding = batch["utt_embedding"].to(device)
90
+ spk_embedding = batch["spk_embedding"].to(device)
91
+ if args.mode == 'sft':
92
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
93
+ 'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
94
+ else:
95
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
96
+ 'prompt_text': text_token, 'prompt_text_len': text_token_len,
97
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
98
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
99
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
100
+ 'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
101
+ tts_speeches = []
102
+ for model_output in model.tts(**model_input):
103
+ tts_speeches.append(model_output['tts_speech'])
104
+ tts_speeches = torch.concat(tts_speeches, dim=1)
105
+ tts_key = '{}_{}'.format(utts[0], tts_index[0])
106
+ tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
107
+ torchaudio.save(tts_fn, tts_speeches, sample_rate=22050)
108
+ f.write('{} {}\n'.format(tts_key, tts_fn))
109
+ f.flush()
110
+ f.close()
111
+ logging.info('Result wav.scp saved in {}'.format(fn))
112
+
113
+
114
+ if __name__ == '__main__':
115
+ main()
tts/cosyvoice/bin/train.py ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+ import argparse
17
+ import datetime
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ from copy import deepcopy
21
+ import os
22
+ import torch
23
+ import torch.distributed as dist
24
+ import deepspeed
25
+
26
+ from hyperpyyaml import load_hyperpyyaml
27
+
28
+ from torch.distributed.elastic.multiprocessing.errors import record
29
+
30
+ from cosyvoice.utils.executor import Executor
31
+ from cosyvoice.utils.train_utils import (
32
+ init_distributed,
33
+ init_dataset_and_dataloader,
34
+ init_optimizer_and_scheduler,
35
+ init_summarywriter, save_model,
36
+ wrap_cuda_model, check_modify_and_save_config)
37
+
38
+
39
+ def get_args():
40
+ parser = argparse.ArgumentParser(description='training your network')
41
+ parser.add_argument('--train_engine',
42
+ default='torch_ddp',
43
+ choices=['torch_ddp', 'deepspeed'],
44
+ help='Engine for paralleled training')
45
+ parser.add_argument('--model', required=True, help='model which will be trained')
46
+ parser.add_argument('--config', required=True, help='config file')
47
+ parser.add_argument('--train_data', required=True, help='train data file')
48
+ parser.add_argument('--cv_data', required=True, help='cv data file')
49
+ parser.add_argument('--checkpoint', help='checkpoint model')
50
+ parser.add_argument('--model_dir', required=True, help='save model dir')
51
+ parser.add_argument('--tensorboard_dir',
52
+ default='tensorboard',
53
+ help='tensorboard log dir')
54
+ parser.add_argument('--ddp.dist_backend',
55
+ dest='dist_backend',
56
+ default='nccl',
57
+ choices=['nccl', 'gloo'],
58
+ help='distributed backend')
59
+ parser.add_argument('--num_workers',
60
+ default=0,
61
+ type=int,
62
+ help='num of subprocess workers for reading')
63
+ parser.add_argument('--prefetch',
64
+ default=100,
65
+ type=int,
66
+ help='prefetch number')
67
+ parser.add_argument('--pin_memory',
68
+ action='store_true',
69
+ default=False,
70
+ help='Use pinned memory buffers used for reading')
71
+ parser.add_argument('--use_amp',
72
+ action='store_true',
73
+ default=False,
74
+ help='Use automatic mixed precision training')
75
+ parser.add_argument('--deepspeed.save_states',
76
+ dest='save_states',
77
+ default='model_only',
78
+ choices=['model_only', 'model+optimizer'],
79
+ help='save model/optimizer states')
80
+ parser.add_argument('--timeout',
81
+ default=60,
82
+ type=int,
83
+ help='timeout (in seconds) of cosyvoice_join.')
84
+ parser = deepspeed.add_config_arguments(parser)
85
+ args = parser.parse_args()
86
+ return args
87
+
88
+
89
+ @record
90
+ def main():
91
+ args = get_args()
92
+ logging.basicConfig(level=logging.DEBUG,
93
+ format='%(asctime)s %(levelname)s %(message)s')
94
+ # gan train has some special initialization logic
95
+ gan = True if args.model == 'hifigan' else False
96
+
97
+ override_dict = {k: None for k in ['llm', 'flow', 'hift', 'hifigan'] if k != args.model}
98
+ if gan is True:
99
+ override_dict.pop('hift')
100
+ with open(args.config, 'r') as f:
101
+ configs = load_hyperpyyaml(f, overrides=override_dict)
102
+ if gan is True:
103
+ configs['train_conf'] = configs['train_conf_gan']
104
+ configs['train_conf'].update(vars(args))
105
+
106
+ # Init env for ddp
107
+ init_distributed(args)
108
+
109
+ # Get dataset & dataloader
110
+ train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
111
+ init_dataset_and_dataloader(args, configs, gan)
112
+
113
+ # Do some sanity checks and save config to arsg.model_dir
114
+ configs = check_modify_and_save_config(args, configs)
115
+
116
+ # Tensorboard summary
117
+ writer = init_summarywriter(args)
118
+
119
+ # load checkpoint
120
+ model = configs[args.model]
121
+ start_step, start_epoch = 0, -1
122
+ if args.checkpoint is not None:
123
+ if os.path.exists(args.checkpoint):
124
+ state_dict = torch.load(args.checkpoint, map_location='cpu')
125
+ model.load_state_dict(state_dict, strict=False)
126
+ if 'step' in state_dict:
127
+ start_step = state_dict['step']
128
+ if 'epoch' in state_dict:
129
+ start_epoch = state_dict['epoch']
130
+ else:
131
+ logging.warning('checkpoint {} do not exsist!'.format(args.checkpoint))
132
+
133
+ # Dispatch model from cpu to gpu
134
+ model = wrap_cuda_model(args, model)
135
+
136
+ # Get optimizer & scheduler
137
+ model, optimizer, scheduler, optimizer_d, scheduler_d = init_optimizer_and_scheduler(args, configs, model, gan)
138
+ scheduler.set_step(start_step)
139
+ if scheduler_d is not None:
140
+ scheduler_d.set_step(start_step)
141
+
142
+ # Save init checkpoints
143
+ info_dict = deepcopy(configs['train_conf'])
144
+ info_dict['step'] = start_step
145
+ info_dict['epoch'] = start_epoch
146
+ save_model(model, 'init', info_dict)
147
+
148
+ # Get executor
149
+ executor = Executor(gan=gan)
150
+ executor.step = start_step
151
+
152
+ # Init scaler, used for pytorch amp mixed precision training
153
+ scaler = torch.cuda.amp.GradScaler() if args.use_amp else None
154
+ print('start step {} start epoch {}'.format(start_step, start_epoch))
155
+ # Start training loop
156
+ for epoch in range(start_epoch + 1, info_dict['max_epoch']):
157
+ executor.epoch = epoch
158
+ train_dataset.set_epoch(epoch)
159
+ dist.barrier()
160
+ group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
161
+ if gan is True:
162
+ executor.train_one_epoc_gan(model, optimizer, scheduler, optimizer_d, scheduler_d, train_data_loader, cv_data_loader,
163
+ writer, info_dict, scaler, group_join)
164
+ else:
165
+ executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, scaler, group_join)
166
+ dist.destroy_process_group(group_join)
167
+
168
+
169
+ if __name__ == '__main__':
170
+ main()
tts/cosyvoice/cli/__init__.py ADDED
File without changes
tts/cosyvoice/cli/cosyvoice.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import time
16
+ from typing import Generator
17
+ from tqdm import tqdm
18
+ from hyperpyyaml import load_hyperpyyaml
19
+ from modelscope import snapshot_download
20
+ import torch
21
+ from cosyvoice.cli.frontend import CosyVoiceFrontEnd
22
+ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model
23
+ from cosyvoice.utils.file_utils import logging
24
+ from cosyvoice.utils.class_utils import get_model_type
25
+
26
+
27
+ class CosyVoice:
28
+
29
+ def __init__(self, model_dir,gpu_id=0, load_jit=False, load_trt=False, fp16=False):
30
+ self.instruct = True if '-Instruct' in model_dir else False
31
+ self.model_dir = model_dir
32
+ self.fp16 = fp16
33
+ if not os.path.exists(model_dir):
34
+ model_dir = snapshot_download(model_dir)
35
+ with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
36
+ configs = load_hyperpyyaml(f)
37
+ assert get_model_type(configs) != CosyVoice2Model, 'do not use {} for CosyVoice initialization!'.format(model_dir)
38
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
39
+ configs['feat_extractor'],
40
+ '{}/campplus.onnx'.format(model_dir),
41
+ '{}/speech_tokenizer_v1.onnx'.format(model_dir),
42
+ '{}/spk2info.pt'.format(model_dir),
43
+ configs['allowed_special'],
44
+ gpu_id=gpu_id)
45
+ self.sample_rate = configs['sample_rate']
46
+ if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
47
+ load_jit, load_trt, fp16 = False, False, False
48
+ logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
49
+ self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16, gpu_id=gpu_id)
50
+ self.model.load('{}/llm.pt'.format(model_dir),
51
+ '{}/flow.pt'.format(model_dir),
52
+ '{}/hift.pt'.format(model_dir))
53
+ if load_jit:
54
+ self.model.load_jit('{}/llm.text_encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
55
+ '{}/llm.llm.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
56
+ '{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
57
+ if load_trt:
58
+ self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
59
+ '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
60
+ self.fp16)
61
+ del configs
62
+
63
+ def list_available_spks(self):
64
+ spks = list(self.frontend.spk2info.keys())
65
+ return spks
66
+
67
+ def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0, text_frontend=True):
68
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
69
+ model_input = self.frontend.frontend_sft(i, spk_id)
70
+ start_time = time.time()
71
+ logging.info('synthesis text {}'.format(i))
72
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
73
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
74
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
75
+ yield model_output
76
+ start_time = time.time()
77
+
78
+ def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True, token_list=None):
79
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
80
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
81
+ if (not isinstance(i, Generator)) and len(i) < 0.5 * len(prompt_text):
82
+ logging.warning('synthesis text {} is much shorter than prompt text {}, this may lead to bad performance'.format(i, prompt_text))
83
+ model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k, self.sample_rate)
84
+ start_time = time.time()
85
+ logging.info('synthesis text {}'.format(i))
86
+ # import pdb;pdb.set_trace()
87
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed, token_list=token_list):
88
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
89
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
90
+ return model_output
91
+
92
+
93
+ def inference_zero_shot_gxl(self,tts_text, prompt_text,prompt_speech_16k, stream=False, speed=1.0, text_frontend=True, token_list=None):
94
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
95
+ input_text = self.frontend.text_normalize(tts_text, split=False, text_frontend=text_frontend)
96
+ model_input = self.frontend.frontend_zero_shot(input_text, prompt_text, prompt_speech_16k, self.sample_rate)
97
+ start_time = time.time()
98
+ model_output = self.model.tts_gxl(**model_input, stream=stream, speed=speed, token_list=token_list)
99
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
100
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
101
+ return model_output
102
+
103
+
104
+ def inference_zero_shot_gz_22k(self,tts_text, prompt_text,prompt_speech_22k, stream=False, speed=1.0, text_frontend=True, token_list=None):
105
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False, text_frontend=text_frontend)
106
+ input_text = self.frontend.text_normalize(tts_text, split=False, text_frontend=text_frontend)
107
+ model_input = self.frontend.frontend_zero_shot_22k(input_text, prompt_text, prompt_speech_22k, self.sample_rate)
108
+ start_time = time.time()
109
+ model_output = self.model.tts_gxl(**model_input, stream=stream, speed=speed, token_list=token_list)
110
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
111
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
112
+ return model_output
113
+
114
+
115
+ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
116
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
117
+ model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k, self.sample_rate)
118
+ start_time = time.time()
119
+ logging.info('synthesis text {}'.format(i))
120
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
121
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
122
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
123
+ yield model_output
124
+ start_time = time.time()
125
+
126
+ def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0, text_frontend=True):
127
+ assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
128
+ if self.instruct is False:
129
+ raise ValueError('{} do not support instruct inference'.format(self.model_dir))
130
+ instruct_text = self.frontend.text_normalize(instruct_text, split=False, text_frontend=text_frontend)
131
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
132
+ model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
133
+ start_time = time.time()
134
+ logging.info('synthesis text {}'.format(i))
135
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
136
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
137
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
138
+ yield model_output
139
+ start_time = time.time()
140
+
141
+ def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
142
+ model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
143
+ start_time = time.time()
144
+ for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
145
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
146
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
147
+ yield model_output
148
+ start_time = time.time()
149
+
150
+
151
+ class CosyVoice2(CosyVoice):
152
+
153
+ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False):
154
+ self.instruct = True if '-Instruct' in model_dir else False
155
+ self.model_dir = model_dir
156
+ self.fp16 = fp16
157
+ if not os.path.exists(model_dir):
158
+ model_dir = snapshot_download(model_dir)
159
+ with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
160
+ configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
161
+ assert get_model_type(configs) == CosyVoice2Model, 'do not use {} for CosyVoice2 initialization!'.format(model_dir)
162
+ self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
163
+ configs['feat_extractor'],
164
+ '{}/campplus.onnx'.format(model_dir),
165
+ '{}/speech_tokenizer_v2.onnx'.format(model_dir),
166
+ '{}/spk2info.pt'.format(model_dir),
167
+ configs['allowed_special'])
168
+ self.sample_rate = configs['sample_rate']
169
+ if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True):
170
+ load_jit, load_trt, fp16 = False, False, False
171
+ logging.warning('no cuda device, set load_jit/load_trt/fp16 to False')
172
+ self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16)
173
+ self.model.load('{}/llm.pt'.format(model_dir),
174
+ '{}/flow.pt'.format(model_dir),
175
+ '{}/hift.pt'.format(model_dir))
176
+ if load_jit:
177
+ self.model.load_jit('{}/flow.encoder.{}.zip'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'))
178
+ if load_trt:
179
+ self.model.load_trt('{}/flow.decoder.estimator.{}.mygpu.plan'.format(model_dir, 'fp16' if self.fp16 is True else 'fp32'),
180
+ '{}/flow.decoder.estimator.fp32.onnx'.format(model_dir),
181
+ self.fp16)
182
+ del configs
183
+
184
+ def inference_instruct(self, *args, **kwargs):
185
+ raise NotImplementedError('inference_instruct is not implemented for CosyVoice2!')
186
+
187
+ def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0, text_frontend=True):
188
+ assert isinstance(self.model, CosyVoice2Model), 'inference_instruct2 is only implemented for CosyVoice2!'
189
+ for i in tqdm(self.frontend.text_normalize(tts_text, split=True, text_frontend=text_frontend)):
190
+ model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
191
+ start_time = time.time()
192
+ logging.info('synthesis text {}'.format(i))
193
+ for model_output in self.model.tts(**model_input, stream=stream, speed=speed):
194
+ speech_len = model_output['tts_speech'].shape[1] / self.sample_rate
195
+ logging.info('yield speech len {}, rtf {}'.format(speech_len, (time.time() - start_time) / speech_len))
196
+ yield model_output
197
+ start_time = time.time()
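A hedged usage sketch of the non-streaming zero-shot helper added in this file (model directory, prompt audio, and texts are placeholders; the prompt wav is assumed to already be 16 kHz mono):

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

cosyvoice = CosyVoice("pretrained_models/CosyVoice-300M", gpu_id=0)
prompt_speech_16k, _ = torchaudio.load("prompt_16k.wav")
out = cosyvoice.inference_zero_shot_gxl("Hello, nice to meet you.", "transcript of the prompt audio", prompt_speech_16k)
torchaudio.save("output.wav", out["tts_speech"], cosyvoice.sample_rate)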
tts/cosyvoice/cli/frontend.py ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ from typing import Generator
16
+ import json
17
+ import onnxruntime
18
+ import torch
19
+ import numpy as np
20
+ import whisper
21
+ from typing import Callable
22
+ import torchaudio.compliance.kaldi as kaldi
23
+ import torchaudio
24
+ import os
25
+ import re
26
+ import inflect
27
+ try:
28
+ import ttsfrd
29
+ use_ttsfrd = True
30
+ except ImportError:
31
+ print("failed to import ttsfrd, use WeTextProcessing instead")
32
+ # from tn.chinese.normalizer import Normalizer as ZhNormalizer
33
+ # from tn.english.normalizer import Normalizer as EnNormalizer
34
+ use_ttsfrd = False
35
+ from cosyvoice.utils.file_utils import logging
36
+ from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation
37
+ is_npu = True
38
+ try:
39
+ import torch_npu
40
+ except ImportError:
41
+ is_npu = False
42
+ print("failed to import torch_npu")
43
+
44
+ class CosyVoiceFrontEnd:
45
+
46
+ def __init__(self,
47
+ get_tokenizer: Callable,
48
+ feat_extractor: Callable,
49
+ campplus_model: str,
50
+ speech_tokenizer_model: str,
51
+ spk2info: str = '',
52
+ allowed_special: str = 'all',
53
+ gpu_id: int = 0):
54
+ self.tokenizer = get_tokenizer()
55
+ self.feat_extractor = feat_extractor
56
+ if is_npu:
57
+ self.device = torch.device(f'npu:{gpu_id}')
58
+ else:
59
+ self.device = torch.device(f'cuda:{gpu_id}')
60
+ option = onnxruntime.SessionOptions()
61
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
62
+ option.intra_op_num_threads = 1
63
+ self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
64
+ self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
65
+ providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
66
+ "CPUExecutionProvider"])
67
+ if os.path.exists(spk2info):
68
+ self.spk2info = torch.load(spk2info, map_location=self.device)
69
+ else:
70
+ self.spk2info = {}
71
+ self.allowed_special = allowed_special
72
+ self.use_ttsfrd = use_ttsfrd
73
+ if self.use_ttsfrd:
74
+ self.frd = ttsfrd.TtsFrontendEngine()
75
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
76
+ assert self.frd.initialize('{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)) is True, \
77
+ 'failed to initialize ttsfrd resource'
78
+ self.frd.set_lang_type('pinyinvg')
79
+ else:
80
+ self.zh_tn_model = lambda x: x #ZhNormalizer(remove_erhua=False, full_to_half=False, overwrite_cache=True)
81
+ self.en_tn_model = lambda x: x #EnNormalizer()
82
+ self.inflect_parser = inflect.engine()
83
+
84
+ def _extract_text_token(self, text):
85
+ if isinstance(text, Generator):
86
+ logging.info('get tts_text generator, will return _extract_text_token_generator!')
87
+ # NOTE add a dummy text_token_len for compatibility
88
+ return self._extract_text_token_generator(text), torch.tensor([0], dtype=torch.int32).to(self.device)
89
+ else:
90
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
91
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
92
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
93
+ return text_token, text_token_len
94
+
95
+ def _extract_text_token_generator(self, text_generator):
96
+ for text in text_generator:
97
+ text_token, _ = self._extract_text_token(text)
98
+ for i in range(text_token.shape[1]):
99
+ yield text_token[:, i: i + 1]
100
+
101
+ def _extract_speech_token(self, speech):
102
+ assert speech.shape[1] / 16000 <= 30, 'do not support extract speech token for audio longer than 30s'
103
+ feat = whisper.log_mel_spectrogram(speech, n_mels=128)
104
+ speech_token = self.speech_tokenizer_session.run(None,
105
+ {self.speech_tokenizer_session.get_inputs()[0].name:
106
+ feat.detach().cpu().numpy(),
107
+ self.speech_tokenizer_session.get_inputs()[1].name:
108
+ np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
109
+ speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
110
+ speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
111
+ return speech_token, speech_token_len
112
+
113
+ def _extract_spk_embedding(self, speech):
114
+ feat = kaldi.fbank(speech,
115
+ num_mel_bins=80,
116
+ dither=0,
117
+ sample_frequency=16000)
118
+ feat = feat - feat.mean(dim=0, keepdim=True)
119
+ embedding = self.campplus_session.run(None,
120
+ {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
121
+ embedding = torch.tensor([embedding]).to(self.device)
122
+ return embedding
123
+
124
+ def _extract_speech_feat(self, speech):
125
+ speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
126
+ speech_feat = speech_feat.unsqueeze(dim=0)
127
+ speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
128
+ return speech_feat, speech_feat_len
129
+
130
+ def text_normalize(self, text, split=True, text_frontend=True):
131
+ if isinstance(text, Generator):
132
+ logging.info('get tts_text generator, will skip text_normalize!')
133
+ return [text]
134
+ if text_frontend is False:
135
+ return [text] if split is True else text
136
+ text = text.strip()
137
+ if self.use_ttsfrd:
138
+ texts = [i["text"] for i in json.loads(self.frd.do_voicegen_frd(text))["sentences"]]
139
+ text = ''.join(texts)
140
+ else:
141
+ if contains_chinese(text):
142
+ # text = self.zh_tn_model.normalize(text)
143
+ text = text.replace("\n", "")
144
+ text = replace_blank(text)
145
+ text = replace_corner_mark(text)
146
+ text = text.replace(".", "。")
147
+ text = text.replace(" - ", ",")
148
+ text = remove_bracket(text)
149
+ text = re.sub(r'[,,、]+$', '。', text)
150
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
151
+ token_min_n=60, merge_len=20, comma_split=False))
152
+ else:
153
+ # text = self.en_tn_model.normalize(text)
154
+ text = spell_out_number(text, self.inflect_parser)
155
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
156
+ token_min_n=60, merge_len=20, comma_split=False))
157
+ texts = [i for i in texts if not is_only_punctuation(i)]
158
+ return texts if split is True else text
159
+
160
+ def frontend_sft(self, tts_text, spk_id):
161
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
162
+ embedding = self.spk2info[spk_id]['embedding']
163
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
164
+ return model_input
165
+
166
+ def frontend_zero_shot_22k(self, tts_text, prompt_text, prompt_speech_22k, resample_rate=16000):
167
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
168
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
169
+ prompt_speech_16k = torchaudio.transforms.Resample(orig_freq=22050, new_freq=16000)(prompt_speech_22k)
170
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22k)
171
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
172
+ # if resample_rate == 16000:
173
+ # # cosyvoice2, force speech_feat % speech_token = 2
174
+ # token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
175
+ # speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
176
+ # speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
177
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
178
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
179
+ 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
180
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
181
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
182
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
183
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
184
+ return model_input
185
+
186
+ def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, resample_rate):
187
+ tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
188
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
189
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
190
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
191
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
192
+ if resample_rate == 24000:
193
+ # cosyvoice2, force speech_feat % speech_token = 2
194
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
195
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
196
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
197
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
198
+ model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
199
+ 'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
200
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
201
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
202
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
203
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
204
+ return model_input
205
+
206
+ def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate):
207
+ model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate)
208
+ # in cross lingual mode, we remove prompt in llm
209
+ del model_input['prompt_text']
210
+ del model_input['prompt_text_len']
211
+ del model_input['llm_prompt_speech_token']
212
+ del model_input['llm_prompt_speech_token_len']
213
+ return model_input
214
+
215
+ def frontend_instruct(self, tts_text, spk_id, instruct_text):
216
+ model_input = self.frontend_sft(tts_text, spk_id)
217
+ # in instruct mode, we remove spk_embedding in llm due to information leakage
218
+ del model_input['llm_embedding']
219
+ instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
220
+ model_input['prompt_text'] = instruct_text_token
221
+ model_input['prompt_text_len'] = instruct_text_token_len
222
+ return model_input
223
+
224
+ def frontend_instruct2(self, tts_text, instruct_text, prompt_speech_16k, resample_rate):
225
+ model_input = self.frontend_zero_shot(tts_text, instruct_text + '<|endofprompt|>', prompt_speech_16k, resample_rate)
226
+ del model_input['llm_prompt_speech_token']
227
+ del model_input['llm_prompt_speech_token_len']
228
+ return model_input
229
+
230
+ def frontend_vc(self, source_speech_16k, prompt_speech_16k, resample_rate):
231
+ prompt_speech_token, prompt_speech_token_len = self._extract_speech_token(prompt_speech_16k)
232
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
233
+ prompt_speech_feat, prompt_speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
234
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
235
+ source_speech_token, source_speech_token_len = self._extract_speech_token(source_speech_16k)
236
+ model_input = {'source_speech_token': source_speech_token, 'source_speech_token_len': source_speech_token_len,
237
+ 'flow_prompt_speech_token': prompt_speech_token, 'flow_prompt_speech_token_len': prompt_speech_token_len,
238
+ 'prompt_speech_feat': prompt_speech_feat, 'prompt_speech_feat_len': prompt_speech_feat_len,
239
+ 'flow_embedding': embedding}
240
+ return model_input
tts/cosyvoice/cli/model.py ADDED
@@ -0,0 +1,480 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ from typing import Generator
16
+ import torch
17
+ import numpy as np
18
+ import threading
19
+ import time
20
+ from torch.nn import functional as F
21
+ from contextlib import nullcontext
22
+ import uuid
23
+ from cosyvoice.utils.common import fade_in_out
24
+ from cosyvoice.utils.file_utils import convert_onnx_to_trt
25
+ is_npu = True
26
+ try:
27
+ import torch_npu
28
+ except ImportError:
29
+ is_npu = False
30
+    print('torch_npu not found, setting is_npu to False')
31
+
32
+ class CosyVoiceModel:
33
+
34
+ def __init__(self,
35
+ llm: torch.nn.Module,
36
+ flow: torch.nn.Module,
37
+ hift: torch.nn.Module,
38
+ fp16: bool,
39
+ gpu_id: int = 0):
40
+ if is_npu:
41
+ self.device = torch.device(f'npu:{gpu_id}')
42
+ else:
43
+ self.device = torch.device(f'cuda:{gpu_id}')
44
+ self.llm = llm
45
+ self.flow = flow
46
+ self.hift = hift
47
+ self.fp16 = fp16
48
+ self.llm.fp16 = fp16
49
+ self.flow.fp16 = fp16
50
+ if self.fp16 is True:
51
+ self.llm.half()
52
+ self.flow.half()
53
+ self.token_min_hop_len = 2 * self.flow.input_frame_rate
54
+ self.token_max_hop_len = 4 * self.flow.input_frame_rate
55
+ self.token_overlap_len = 20
56
+        # here we fix flow.decoder.estimator.static_chunk_size = 0 for compatibility
57
+ self.flow.decoder.estimator.static_chunk_size = 0
58
+ # mel fade in out
59
+ self.mel_overlap_len = int(self.token_overlap_len / self.flow.input_frame_rate * 22050 / 256)
60
+ self.mel_window = np.hamming(2 * self.mel_overlap_len)
61
+ # hift cache
62
+ self.mel_cache_len = 20
63
+ self.source_cache_len = int(self.mel_cache_len * 256)
64
+ # speech fade in out
65
+ self.speech_window = np.hamming(2 * self.source_cache_len)
66
+ # rtf and decoding related
67
+ self.stream_scale_factor = 1
68
+        assert self.stream_scale_factor >= 1, 'stream_scale_factor should be at least 1, increase it according to your actual rtf'
69
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
70
+ self.lock = threading.Lock()
71
+ # dict used to store session related variable
72
+ self.tts_speech_token_dict = {}
73
+ self.llm_end_dict = {}
74
+ self.mel_overlap_dict = {}
75
+ self.flow_cache_dict = {}
76
+ self.hift_cache_dict = {}
77
+
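To make the streaming constants set in __init__ above concrete, a small worked example follows; the 25 Hz token rate and the 22.05 kHz / 256-sample mel hop are assumptions typical of CosyVoice flow configs, not values taken from this diff:

    # Assumed config: flow.input_frame_rate = 25 speech tokens per second.
    input_frame_rate = 25
    token_min_hop_len = 2 * input_frame_rate                                   # 50 tokens, about 2 s per hop
    token_max_hop_len = 4 * input_frame_rate                                   # 100 tokens, about 4 s per hop
    token_overlap_len = 20                                                     # tokens kept for the cross-fade
    mel_overlap_len = int(token_overlap_len / input_frame_rate * 22050 / 256)  # 68 mel frames of overlap
    source_cache_len = 20 * 256                                                # 5120 waveform samples cached for hift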
78
+ def load(self, llm_model, flow_model, hift_model):
79
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device), strict=True)
80
+ self.llm.to(self.device).eval()
81
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device), strict=True)
82
+ self.flow.to(self.device).eval()
83
+ # in case hift_model is a hifigan model
84
+ hift_state_dict = {k.replace('generator.', ''): v for k, v in torch.load(hift_model, map_location=self.device).items()}
85
+ self.hift.load_state_dict(hift_state_dict, strict=True)
86
+ self.hift.to(self.device).eval()
87
+
88
+ def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
89
+ llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
90
+ self.llm.text_encoder = llm_text_encoder
91
+ llm_llm = torch.jit.load(llm_llm_model, map_location=self.device)
92
+ self.llm.llm = llm_llm
93
+ flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
94
+ self.flow.encoder = flow_encoder
95
+
96
+ def load_trt(self, flow_decoder_estimator_model, flow_decoder_onnx_model, fp16):
97
+ assert torch.cuda.is_available(), 'tensorrt only supports gpu!'
98
+ if not os.path.exists(flow_decoder_estimator_model):
99
+ convert_onnx_to_trt(flow_decoder_estimator_model, flow_decoder_onnx_model, fp16)
100
+ if os.path.getsize(flow_decoder_estimator_model) == 0:
101
+ raise ValueError('{} is empty file, delete it and export again!'.format(flow_decoder_estimator_model))
102
+ del self.flow.decoder.estimator
103
+ import tensorrt as trt
104
+ with open(flow_decoder_estimator_model, 'rb') as f:
105
+ self.flow.decoder.estimator_engine = trt.Runtime(trt.Logger(trt.Logger.INFO)).deserialize_cuda_engine(f.read())
106
+ if self.flow.decoder.estimator_engine is None:
107
+ raise ValueError('failed to load trt {}'.format(flow_decoder_estimator_model))
108
+ self.flow.decoder.estimator = self.flow.decoder.estimator_engine.create_execution_context()
109
+
110
+ def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid):
111
+ with self.llm_context:
112
+ if isinstance(text, Generator):
113
+ assert isinstance(self, CosyVoice2Model), 'streaming input text is only implemented for CosyVoice2!'
114
+ for i in self.llm.inference_bistream(text=text,
115
+ prompt_text=prompt_text.to(self.device),
116
+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
117
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
118
+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
119
+ embedding=llm_embedding.to(self.device)):
120
+ self.tts_speech_token_dict[uuid].append(i)
121
+ else:
122
+ for i in self.llm.inference(text=text.to(self.device),
123
+ text_len=torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
124
+ prompt_text=prompt_text.to(self.device),
125
+ prompt_text_len=torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
126
+ prompt_speech_token=llm_prompt_speech_token.to(self.device),
127
+ prompt_speech_token_len=torch.tensor([llm_prompt_speech_token.shape[1]], dtype=torch.int32).to(self.device),
128
+ embedding=llm_embedding.to(self.device)):
129
+ self.tts_speech_token_dict[uuid].append(i)
130
+ self.llm_end_dict[uuid] = True
131
+
132
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0):
133
+ tts_mel, flow_cache = self.flow.inference(token=token.to(self.device),
134
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
135
+ prompt_token=prompt_token.to(self.device),
136
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
137
+ prompt_feat=prompt_feat.to(self.device),
138
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
139
+ embedding=embedding.to(self.device),
140
+ flow_cache=self.flow_cache_dict[uuid])
141
+ self.flow_cache_dict[uuid] = flow_cache
142
+
143
+ # mel overlap fade in out
144
+ if self.mel_overlap_dict[uuid].shape[2] != 0:
145
+ tts_mel = fade_in_out(tts_mel, self.mel_overlap_dict[uuid], self.mel_window)
146
+ # append hift cache
147
+ if self.hift_cache_dict[uuid] is not None:
148
+ hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
149
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
150
+ else:
151
+ hift_cache_source = torch.zeros(1, 1, 0)
152
+ # keep overlap mel and hift cache
153
+ if finalize is False:
154
+ self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
155
+ tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
156
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
157
+ if self.hift_cache_dict[uuid] is not None:
158
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
159
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
160
+ 'source': tts_source[:, :, -self.source_cache_len:],
161
+ 'speech': tts_speech[:, -self.source_cache_len:]}
162
+ tts_speech = tts_speech[:, :-self.source_cache_len]
163
+ else:
164
+ if speed != 1.0:
165
+ assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
166
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
167
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
168
+ if self.hift_cache_dict[uuid] is not None:
169
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
170
+ return tts_speech
171
+
172
+ def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
173
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
174
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
175
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
176
+ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0,
177
+ token_list=None, **kwargs):
178
+ # this_uuid is used to track variables related to this inference thread
179
+ this_uuid = str(uuid.uuid1())
180
+ with self.lock:
181
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
182
+ self.hift_cache_dict[this_uuid] = None
183
+ self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
184
+ self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
185
+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
186
+ p.start()
187
+ # import pdb;pdb.set_trace()
188
+ if stream is True:
189
+ token_hop_len = self.token_min_hop_len
190
+ while True:
191
+ time.sleep(0.1)
192
+ if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
193
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
194
+ .unsqueeze(dim=0)
195
+ # import pdb;pdb.set_trace()
196
+ gen_token = [1650, 2163, 3062, 41, 347, 754, 1705, 73, 38, 2583, 59, 1660, 1716, 28, 324, 1260, 1018, 254, 1650, 3552, 1804, 2515, 2368, 38, 1660, 3106, 848, 3250, 1611, 511, 1037, 2964, 2255, 1509, 890, 1494, 2250, 1349, 2621, 3420, 46, 2646, 2646, 3025, 2579, 393, 824, 1609, 2089, 2162, 24, 2, 3768, 1155, 343, 325, 2764, 814, 426, 1243, 2579, 3916, 20, 1611, 349, 701, 1346, 3768, 927, 3305, 8, 2099, 511, 3582, 8, 421, 1494, 2323, 2253, 3607, 692, 3929, 511, 3710, 3662, 3179, 1204, 7, 2579, 2579, 3025, 3025, 571, 540, 1509, 2786, 2548, 1404, 699, 1260, 2250, 202, 202, 84, 3458, 73, 3458, 1716, 302, 2105, 193, 974, 3761, 2893, 2250, 193, 754, 69, 69, 599, 2554, 890, 1608, 148, 1243, 480, 1, 489, 271, 1038, 1736, 1865, 3337, 569, 28, 2246, 2426, 2250, 3768, 569, 1027, 3305, 3106, 8, 3635, 269, 1854, 70, 1385, 1584, 1385, 2187, 3064, 3064, 2579, 3025, 3337, 2579, 3768]
197
+ token_list = [66, 2307, 599, 1602, 714, 1100, 1243, 2657, 349, 535, 3662, 1403, 2610, 669, 569, 49, 48, 1027, 2684, 373, 728, 728, 186, 186, 7, 2250, 754, 1346, 1289, 2691, 3740, 3082, 629, 2841, 432, 1513, 1716, 302, 3607, 3607, 692, 1609, 2579, 3025, 2513, 2513, 1043, 1043, 2704, 53, 2893, 1043, 2704, 1043, 2513, 2513, 1043, 1083, 3600, 421, 8, 8, 1256, 1243, 3278, 2932, 510, 2515, 2582, 1906, 4056, 1346, 1241, 2253, 1346, 1698, 962, 409, 1507, 1377, 2162, 10, 21, 396, 3649, 373, 728, 2513, 2513, 2513, 2513, 1865, 1893, 1712, 375, 4064, 3062, 41, 569, 3887, 1716, 472, 3830, 186, 408, 203, 3478, 3340, 800, 1243, 480, 271, 2162, 3240, 3238, 3193, 599, 2391, 1317, 1346, 269, 2253, 2209, 8, 1974, 2764, 1579, 421, 1073, 3929, 590, 31, 3898, 53, 53, 1043, 1957]
198
+ this_tts_speech_token = np.array(token_list)
199
+ this_tts_speech_token = torch.tensor(this_tts_speech_token)
200
+ # this_tts_speech_token = np.load("/home/node57_data/hkxie/4O/streaming_fm/data/s3token1/05343304771_EIjYa_VAD27_3.hubert_code.npy")
201
+ # this_tts_speech_token = torch.tensor(this_tts_speech_token)
202
+
203
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
204
+ prompt_token=flow_prompt_speech_token,
205
+ prompt_feat=prompt_speech_feat,
206
+ embedding=flow_embedding,
207
+ uuid=this_uuid,
208
+ finalize=False)
209
+ yield {'tts_speech': this_tts_speech.cpu()}
210
+ with self.lock:
211
+ self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
212
+ # increase token_hop_len for better speech quality
213
+ token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
214
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
215
+ break
216
+ p.join()
217
+            # deal with the remaining tokens; make sure the remaining token length equals token_hop_len when cache_speech is not None
218
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
219
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
220
+ prompt_token=flow_prompt_speech_token,
221
+ prompt_feat=prompt_speech_feat,
222
+ embedding=flow_embedding,
223
+ uuid=this_uuid,
224
+ finalize=True)
225
+ yield {'tts_speech': this_tts_speech.cpu()}
226
+ else:
227
+ # deal with all tokens
228
+ p.join()
229
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
230
+ this_tts_speech_token = np.array(token_list)
231
+ this_tts_speech_token = torch.tensor(this_tts_speech_token)
232
+            this_tts_speech_token = this_tts_speech_token.unsqueeze(dim=0)
233
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
234
+ prompt_token=flow_prompt_speech_token,
235
+ prompt_feat=prompt_speech_feat,
236
+ embedding=flow_embedding,
237
+ uuid=this_uuid,
238
+ finalize=True,
239
+ speed=speed)
240
+ yield {'tts_speech': this_tts_speech.cpu()}
241
+ with self.lock:
242
+ self.tts_speech_token_dict.pop(this_uuid)
243
+ self.llm_end_dict.pop(this_uuid)
244
+ self.mel_overlap_dict.pop(this_uuid)
245
+ self.hift_cache_dict.pop(this_uuid)
246
+ self.flow_cache_dict.pop(this_uuid)
247
+ torch.cuda.empty_cache()
248
+
249
+ def tts_gxl(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
250
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
251
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
252
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
253
+ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0,
254
+ token_list=None, **kwargs):
255
+ # this_uuid is used to track variables related to this inference thread
256
+ this_uuid = str(uuid.uuid1())
257
+ with self.lock:
258
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
259
+ self.hift_cache_dict[this_uuid] = None
260
+ self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
261
+ self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
262
+ # p = threading.Thread(target=self.llm_job,
263
+ # args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
264
+ # p.start()
265
+ # p.join()
266
+ # this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
267
+ this_tts_speech_token = np.array(token_list)
268
+ this_tts_speech_token = torch.tensor(this_tts_speech_token)
269
+        this_tts_speech_token = this_tts_speech_token.unsqueeze(dim=0)
270
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
271
+ prompt_token=flow_prompt_speech_token,
272
+ prompt_feat=prompt_speech_feat,
273
+ embedding=flow_embedding,
274
+ uuid=this_uuid,
275
+ finalize=True,
276
+ speed=speed)
277
+ torch.cuda.empty_cache()
278
+ with self.lock:
279
+ self.tts_speech_token_dict.pop(this_uuid)
280
+ self.llm_end_dict.pop(this_uuid)
281
+ self.mel_overlap_dict.pop(this_uuid)
282
+ self.hift_cache_dict.pop(this_uuid)
283
+ self.flow_cache_dict.pop(this_uuid)
284
+ return {'tts_speech': this_tts_speech.cpu()}
285
+
286
+ def vc(self, source_speech_token, flow_prompt_speech_token, prompt_speech_feat, flow_embedding, stream=False, speed=1.0, **kwargs):
287
+ # this_uuid is used to track variables related to this inference thread
288
+ this_uuid = str(uuid.uuid1())
289
+ with self.lock:
290
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = source_speech_token.flatten().tolist(), True
291
+ self.hift_cache_dict[this_uuid] = None
292
+ self.mel_overlap_dict[this_uuid] = torch.zeros(1, 80, 0)
293
+ self.flow_cache_dict[this_uuid] = torch.zeros(1, 80, 0, 2)
294
+ if stream is True:
295
+ token_hop_len = self.token_min_hop_len
296
+ while True:
297
+ if len(self.tts_speech_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
298
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_hop_len + self.token_overlap_len]) \
299
+ .unsqueeze(dim=0)
300
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
301
+ prompt_token=flow_prompt_speech_token,
302
+ prompt_feat=prompt_speech_feat,
303
+ embedding=flow_embedding,
304
+ uuid=this_uuid,
305
+ finalize=False)
306
+ yield {'tts_speech': this_tts_speech.cpu()}
307
+ with self.lock:
308
+ self.tts_speech_token_dict[this_uuid] = self.tts_speech_token_dict[this_uuid][token_hop_len:]
309
+ # increase token_hop_len for better speech quality
310
+ token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
311
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
312
+ break
313
+            # deal with the remaining tokens; make sure the remaining token length equals token_hop_len when cache_speech is not None
314
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
315
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
316
+ prompt_token=flow_prompt_speech_token,
317
+ prompt_feat=prompt_speech_feat,
318
+ embedding=flow_embedding,
319
+ uuid=this_uuid,
320
+ finalize=True)
321
+ yield {'tts_speech': this_tts_speech.cpu()}
322
+ else:
323
+ # deal with all tokens
324
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
325
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
326
+ prompt_token=flow_prompt_speech_token,
327
+ prompt_feat=prompt_speech_feat,
328
+ embedding=flow_embedding,
329
+ uuid=this_uuid,
330
+ finalize=True,
331
+ speed=speed)
332
+ yield {'tts_speech': this_tts_speech.cpu()}
333
+ with self.lock:
334
+ self.tts_speech_token_dict.pop(this_uuid)
335
+ self.llm_end_dict.pop(this_uuid)
336
+ self.mel_overlap_dict.pop(this_uuid)
337
+ self.hift_cache_dict.pop(this_uuid)
338
+ torch.cuda.empty_cache()
339
+
340
+
341
+ class CosyVoice2Model(CosyVoiceModel):
342
+
343
+ def __init__(self,
344
+ llm: torch.nn.Module,
345
+ flow: torch.nn.Module,
346
+ hift: torch.nn.Module,
347
+ fp16: bool):
348
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
349
+ self.llm = llm
350
+ self.flow = flow
351
+ self.hift = hift
352
+ self.fp16 = fp16
353
+ self.llm.fp16 = fp16
354
+ self.flow.fp16 = fp16
355
+ if self.fp16 is True:
356
+ self.llm.half()
357
+ self.flow.half()
358
+ self.token_hop_len = 2 * self.flow.input_frame_rate
359
+        # here we fix the flow encoder/decoder decoding_chunk_size; in the future we will pass it as an argument, or use a cache
360
+ self.flow.encoder.static_chunk_size = 2 * self.flow.input_frame_rate
361
+ self.flow.decoder.estimator.static_chunk_size = 2 * self.flow.input_frame_rate * self.flow.token_mel_ratio
362
+ # hift cache
363
+ self.mel_cache_len = 8
364
+ self.source_cache_len = int(self.mel_cache_len * 480)
365
+ # speech fade in out
366
+ self.speech_window = np.hamming(2 * self.source_cache_len)
367
+ # rtf and decoding related
368
+ self.stream_scale_factor = 1
369
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
370
+ self.lock = threading.Lock()
371
+ # dict used to store session related variable
372
+ self.tts_speech_token_dict = {}
373
+ self.llm_end_dict = {}
374
+ self.hift_cache_dict = {}
375
+
376
+ def load_jit(self, flow_encoder_model):
377
+ flow_encoder = torch.jit.load(flow_encoder_model, map_location=self.device)
378
+ self.flow.encoder = flow_encoder
379
+
380
+ def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, token_offset, finalize=False, speed=1.0):
381
+ tts_mel, _ = self.flow.inference(token=token.to(self.device),
382
+ token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device),
383
+ prompt_token=prompt_token.to(self.device),
384
+ prompt_token_len=torch.tensor([prompt_token.shape[1]], dtype=torch.int32).to(self.device),
385
+ prompt_feat=prompt_feat.to(self.device),
386
+ prompt_feat_len=torch.tensor([prompt_feat.shape[1]], dtype=torch.int32).to(self.device),
387
+ embedding=embedding.to(self.device),
388
+ finalize=finalize)
389
+ tts_mel = tts_mel[:, :, token_offset * self.flow.token_mel_ratio:]
390
+ # append hift cache
391
+ if self.hift_cache_dict[uuid] is not None:
392
+ hift_cache_mel, hift_cache_source = self.hift_cache_dict[uuid]['mel'], self.hift_cache_dict[uuid]['source']
393
+ tts_mel = torch.concat([hift_cache_mel, tts_mel], dim=2)
394
+ else:
395
+ hift_cache_source = torch.zeros(1, 1, 0)
396
+ # keep overlap mel and hift cache
397
+ if finalize is False:
398
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
399
+ if self.hift_cache_dict[uuid] is not None:
400
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
401
+ self.hift_cache_dict[uuid] = {'mel': tts_mel[:, :, -self.mel_cache_len:],
402
+ 'source': tts_source[:, :, -self.source_cache_len:],
403
+ 'speech': tts_speech[:, -self.source_cache_len:]}
404
+ tts_speech = tts_speech[:, :-self.source_cache_len]
405
+ else:
406
+ if speed != 1.0:
407
+ assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
408
+ tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
409
+ tts_speech, tts_source = self.hift.inference(speech_feat=tts_mel, cache_source=hift_cache_source)
410
+ if self.hift_cache_dict[uuid] is not None:
411
+ tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
412
+ return tts_speech
413
+
414
+ def tts(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
415
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
416
+ llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
417
+ flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32),
418
+ prompt_speech_feat=torch.zeros(1, 0, 80), stream=False, speed=1.0, **kwargs):
419
+ # this_uuid is used to track variables related to this inference thread
420
+ this_uuid = str(uuid.uuid1())
421
+ with self.lock:
422
+ self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
423
+ self.hift_cache_dict[this_uuid] = None
424
+ p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
425
+ p.start()
426
+ if stream is True:
427
+ token_offset = 0
428
+ while True:
429
+ time.sleep(0.1)
430
+ if len(self.tts_speech_token_dict[this_uuid]) - token_offset >= self.token_hop_len + self.flow.pre_lookahead_len:
431
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid][:token_offset + self.token_hop_len + self.flow.pre_lookahead_len]).unsqueeze(dim=0)
432
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
433
+ prompt_token=flow_prompt_speech_token,
434
+ prompt_feat=prompt_speech_feat,
435
+ embedding=flow_embedding,
436
+ uuid=this_uuid,
437
+ token_offset=token_offset,
438
+ finalize=False)
439
+ token_offset += self.token_hop_len
440
+ yield {'tts_speech': this_tts_speech.cpu()}
441
+ if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < self.token_hop_len + self.flow.pre_lookahead_len:
442
+ break
443
+ p.join()
444
+            # deal with the remaining tokens; make sure the remaining token length equals token_hop_len when cache_speech is not None
445
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
446
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
447
+ prompt_token=flow_prompt_speech_token,
448
+ prompt_feat=prompt_speech_feat,
449
+ embedding=flow_embedding,
450
+ uuid=this_uuid,
451
+ token_offset=token_offset,
452
+ finalize=True)
453
+ yield {'tts_speech': this_tts_speech.cpu()}
454
+ else:
455
+ # deal with all tokens
456
+ p.join()
457
+ this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
458
+ # import pdb;pdb.set_trace()
459
+ # this_tts_speech_token = np.load("/home/node57_data/hkxie/4O/streaming_fm/data/s3token2/05343304771_EIjYa_VAD27_3.hubert_code.npy")
460
+ # this_tts_speech_token = np.load("/home/node57_data/hkxie/4O/streaming_fm/data/s3token2/05343304771_EIjYa_VAD41_6.hubert_code.npy")
461
+ # token2 = [2745, 860, 393, 393, 2579, 2926, 1842, 2136, 480, 205, 3910, 3251, 73, 42, 38, 1346, 2554, 368, 40, 1660, 1660, 1055, 2597, 1712, 28, 2246, 386, 122, 38, 3607, 3818, 1098, 980, 38, 1353, 1660, 426, 1694, 1406, 511, 511, 396, 671, 2571, 2809, 2385, 3947, 229, 2000, 773, 2786, 858, 2554, 701, 46, 2646, 1608, 2890, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 3, 31, 758, 3438, 3438, 3438, 54, 269, 2246, 343, 1600, 1608, 3554, 3649, 60, 511, 701, 44, 3554, 3775, 20, 2099, 535, 2099, 3545, 3267, 1223, 1650, 3607, 3611, 2646, 3545, 3545, 802, 802, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 393, 3, 26, 1734, 571, 1240, 1509, 2786, 1509, 740, 890, 2426, 1241, 1241, 2399, 2, 3458, 2285, 25, 2105, 4082, 3761, 3121, 3121, 269, 4082, 1353, 2285, 463, 758, 1193, 421, 3662, 148, 1516, 101, 32, 615, 1660, 1038, 2597, 3554, 28, 2246, 2426, 1241, 22, 1406, 70, 2230, 2230, 3635, 302, 2537, 1385, 1385, 1385, 69, 754, 3489, 1055, 393, 393, 393, 393, 393, 393, 393, 393]
462
+
463
+ # token_list3 = [2745, 599, 3238, 2554, 84, 73, 42, 2582, 2583, 4082, 1660, 1584, 1469, 1712, 2243, 1260, 1688, 269, 409, 3552, 1584, 2646, 38, 2385, 1660, 1038, 1516, 85, 3250, 1611, 109, 3611, 2255, 3947, 229, 451, 2786, 1044, 2621, 4056, 2646, 2646, 2890, 31, 3898, 3898, 2893, 2893, 2893, 2893, 1043, 52, 52, 52, 52, 1504, 2307, 202, 229, 358, 358, 266, 2907, 1516, 2246, 343, 1030, 122, 2409, 1694, 1406, 511, 2209, 51, 927, 1185, 1256, 1879, 2890, 2858, 203, 2426, 2253, 69, 3011, 3611, 2515, 2646, 492, 3662, 1608, 7, 31, 1406, 1406, 2893, 1043, 728, 380, 380, 571, 2385, 229, 740, 3193, 358, 202, 3331, 2, 1796, 35, 2285, 1893, 1516, 329, 3761, 2859, 122, 1241, 329, 1906, 59, 460, 463, 2554, 740, 1608, 60, 1516, 101, 1, 489, 1038, 1038, 3337, 3768, 569, 32, 1494, 2250, 3768, 3649, 20, 351, 1404, 1193, 44, 59, 3607, 2174, 1584, 1584, 1584, 1655, 1736, 1043, 1043, 1469, 569, 28, 2000, 2426, 2250, 3768, 927, 3250, 8, 2099, 1716, 59, 792, 3106, 1385, 1385, 1385, 1385, 1385, 3947, 1507, 864, 52, 52, 52]
464
+ token_list3 = [997, 966, 3554, 1854, 714, 3761, 3741, 2426, 103, 103, 1260, 1260, 2306, 2306, 2307, 824, 792, 193, 1879, 3478, 48, 511, 3420, 1317, 1761, 599, 1002, 980, 2646, 2646, 2646, 2646, 2646, 3366, 1949, 575, 575, 26, 26, 29, 3929, 229, 3910, 568, 3265, 3768, 28, 2004, 3910, 568, 3265, 3062, 41, 927, 699, 304, 2859, 2537, 28, 3741, 2841, 1688, 3768, 28, 1155, 855, 1570, 1570, 1570, 1570, 1570, 2876, 2680, 3, 3, 3636, 1555, 2844, 409, 1040, 2515, 1640, 3121, 3153, 882, 2385, 1796, 1796, 1796, 2368, 1785, 49, 671, 3830, 3025, 2844, 2105, 1037, 1729, 2105, 3265, 103, 1346, 580, 3922, 2876, 42, 271, 59, 3106, 2680, 3830, 2704, 2105, 2815, 59, 1698, 1223, 1342, 3267, 2786, 2250, 2250, 2208, 3, 1446, 1446, 1446, 1446, 1446, 1446, 1446, 1446, 1446, 1446, 1688, 1688, 1446, 1446, 1688, 1688, 1688, 1688, 1688]
465
+ this_tts_speech_token = np.array(token_list3)
466
+ this_tts_speech_token = torch.tensor(this_tts_speech_token)
467
+            this_tts_speech_token = this_tts_speech_token.unsqueeze(dim=0)
468
+ this_tts_speech = self.token2wav(token=this_tts_speech_token,
469
+ prompt_token=flow_prompt_speech_token,
470
+ prompt_feat=prompt_speech_feat,
471
+ embedding=flow_embedding,
472
+ uuid=this_uuid,
473
+ token_offset=0,
474
+ finalize=True,
475
+ speed=speed)
476
+ yield {'tts_speech': this_tts_speech.cpu()}
477
+ with self.lock:
478
+ self.tts_speech_token_dict.pop(this_uuid)
479
+ self.llm_end_dict.pop(this_uuid)
480
+ torch.cuda.empty_cache()
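A hedged sketch (not part of the diff) of how the streaming generator above is typically consumed; `model` is assumed to be a loaded CosyVoice2Model and `model_input` a dict produced by the frontend shown earlier:

    import torch

    chunks = []
    for out in model.tts(**model_input, stream=True):
        chunks.append(out['tts_speech'])     # each chunk is a (1, n_samples) CPU tensor
    speech = torch.cat(chunks, dim=1)        # concatenate the cross-faded chunks into one waveform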
tts/cosyvoice/dataset/__init__.py ADDED
File without changes
tts/cosyvoice/dataset/dataset.py ADDED
@@ -0,0 +1,164 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2
+ # 2024 Alibaba Inc (authors: Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import random
17
+ import json
18
+ import math
19
+ from functools import partial
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ from torch.utils.data import IterableDataset
24
+ from cosyvoice.utils.file_utils import read_lists, read_json_lists
25
+
26
+
27
+ class Processor(IterableDataset):
28
+
29
+ def __init__(self, source, f, *args, **kw):
30
+ assert callable(f)
31
+ self.source = source
32
+ self.f = f
33
+ self.args = args
34
+ self.kw = kw
35
+
36
+ def set_epoch(self, epoch):
37
+ self.source.set_epoch(epoch)
38
+
39
+ def __iter__(self):
40
+ """ Return an iterator over the source dataset processed by the
41
+ given processor.
42
+ """
43
+ assert self.source is not None
44
+ assert callable(self.f)
45
+ return self.f(iter(self.source), *self.args, **self.kw)
46
+
47
+ def apply(self, f):
48
+ assert callable(f)
49
+ return Processor(self, f, *self.args, **self.kw)
50
+
51
+
52
+ class DistributedSampler:
53
+
54
+ def __init__(self, shuffle=True, partition=True):
55
+ self.epoch = -1
56
+ self.update()
57
+ self.shuffle = shuffle
58
+ self.partition = partition
59
+
60
+ def update(self):
61
+ assert dist.is_available()
62
+ if dist.is_initialized():
63
+ self.rank = dist.get_rank()
64
+ self.world_size = dist.get_world_size()
65
+ else:
66
+ self.rank = 0
67
+ self.world_size = 1
68
+ worker_info = torch.utils.data.get_worker_info()
69
+ if worker_info is None:
70
+ self.worker_id = 0
71
+ self.num_workers = 1
72
+ else:
73
+ self.worker_id = worker_info.id
74
+ self.num_workers = worker_info.num_workers
75
+ return dict(rank=self.rank,
76
+ world_size=self.world_size,
77
+ worker_id=self.worker_id,
78
+ num_workers=self.num_workers)
79
+
80
+ def set_epoch(self, epoch):
81
+ self.epoch = epoch
82
+
83
+ def sample(self, data):
84
+ """ Sample data according to rank/world_size/num_workers
85
+
86
+ Args:
87
+ data(List): input data list
88
+
89
+ Returns:
90
+ List: data list after sample
91
+ """
92
+ data = list(range(len(data)))
93
+        # force the data list to split evenly across ranks and workers
94
+ if self.partition:
95
+ if self.shuffle:
96
+ random.Random(self.epoch).shuffle(data)
97
+ if len(data) < self.world_size:
98
+ data = data * math.ceil(self.world_size / len(data))
99
+ data = data[:self.world_size]
100
+ data = data[self.rank::self.world_size]
101
+ if len(data) < self.num_workers:
102
+ data = data * math.ceil(self.num_workers / len(data))
103
+ data = data[:self.num_workers]
104
+ data = data[self.worker_id::self.num_workers]
105
+ return data
106
+
107
+
108
+ class DataList(IterableDataset):
109
+
110
+ def __init__(self, lists, shuffle=True, partition=True):
111
+ self.lists = lists
112
+ self.sampler = DistributedSampler(shuffle, partition)
113
+
114
+ def set_epoch(self, epoch):
115
+ self.sampler.set_epoch(epoch)
116
+
117
+ def __iter__(self):
118
+ sampler_info = self.sampler.update()
119
+ indexes = self.sampler.sample(self.lists)
120
+ for index in indexes:
121
+ data = dict(src=self.lists[index])
122
+ data.update(sampler_info)
123
+ yield data
124
+
125
+
126
+ def Dataset(data_list_file,
127
+ data_pipeline,
128
+ mode='train',
129
+ gan=False,
130
+ shuffle=True,
131
+ partition=True,
132
+ tts_file='',
133
+ prompt_utt2data=''):
134
+ """ Construct dataset from arguments
135
+
136
+        We have two shuffle stages in the Dataset. The first is global
137
+ shuffle at shard tar/raw file level. The second is global shuffle
138
+ at training samples level.
139
+
140
+ Args:
141
+            data_list_file(str): file listing the parquet data shards
142
+            data_pipeline(list): processor functions applied to the data in order
143
+ partition(bool): whether to do data partition in terms of rank
144
+ """
145
+ assert mode in ['train', 'inference']
146
+ lists = read_lists(data_list_file)
147
+ if mode == 'inference':
148
+ with open(tts_file) as f:
149
+ tts_data = json.load(f)
150
+ utt2lists = read_json_lists(prompt_utt2data)
151
+ # filter unnecessary file in inference mode
152
+ lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
153
+ dataset = DataList(lists,
154
+ shuffle=shuffle,
155
+ partition=partition)
156
+ if mode == 'inference':
157
+ # map partial arg to parquet_opener func in inference mode
158
+ data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
159
+ if gan is True:
160
+ # map partial arg to padding func in gan mode
161
+ data_pipeline[-1] = partial(data_pipeline[-1], gan=gan)
162
+ for func in data_pipeline:
163
+ dataset = Processor(dataset, func, mode=mode)
164
+ return dataset
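Since Dataset returns an IterableDataset whose batching happens inside the pipeline, it is usually wrapped in a DataLoader with automatic batching disabled. A minimal sketch, assuming `train_dataset` was built with the Dataset factory above:

    from torch.utils.data import DataLoader

    # batch_size=None: the pipeline's batch/padding processors already produce batches.
    train_loader = DataLoader(train_dataset, batch_size=None, num_workers=2, pin_memory=True)
    train_dataset.set_epoch(0)          # re-seeds the shard-level shuffle for this epoch
    for batch in train_loader:          # each `batch` is the padded dict yielded by processor.padding
        pass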
tts/cosyvoice/dataset/processor.py ADDED
@@ -0,0 +1,435 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+
17
+ import pyarrow.parquet as pq
18
+ from io import BytesIO
19
+ import torch
20
+ import torchaudio
21
+ from torch.nn.utils.rnn import pad_sequence
22
+ import torch.nn.functional as F
23
+ import pyworld as pw
24
+
25
+
26
+ AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
27
+
28
+
29
+ def parquet_opener(data, mode='train', tts_data={}):
30
+    """ Given a url or local parquet file, yield parsed samples
31
+ Inplace operation.
32
+
33
+ Args:
34
+ data(Iterable[str]): url or local file list
35
+
36
+ Returns:
37
+ Iterable[{src, stream}]
38
+ """
39
+ for sample in data:
40
+ assert 'src' in sample
41
+ url = sample['src']
42
+ try:
43
+ for df in pq.ParquetFile(url).iter_batches(batch_size=64):
44
+ df = df.to_pandas()
45
+ for i in range(len(df)):
46
+ if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
47
+ continue
48
+ sample.update(dict(df.loc[i]))
49
+ if mode == 'train':
50
+ # NOTE do not return sample directly, must initialize a new dict
51
+ yield {**sample}
52
+ else:
53
+ for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
54
+ yield {**sample, 'tts_index': index, 'tts_text': text}
55
+ except Exception as ex:
56
+ logging.warning('Failed to open {}, ex info {}'.format(url, ex))
57
+
58
+
59
+ def filter(data,
60
+ max_length=10240,
61
+ min_length=10,
62
+ token_max_length=200,
63
+ token_min_length=1,
64
+ min_output_input_ratio=0.0005,
65
+ max_output_input_ratio=1,
66
+ mode='train'):
67
+ """ Filter sample according to feature and label length
68
+ Inplace operation.
69
+
70
+        Args:
71
+ data: Iterable[{key, wav, label, sample_rate}]
72
+ max_length: drop utterance which is greater than max_length(10ms)
73
+ min_length: drop utterance which is less than min_length(10ms)
74
+ token_max_length: drop utterance which is greater than
75
+                token_max_length, especially when using char units for
76
+ english modeling
77
+ token_min_length: drop utterance which is
78
+                less than token_min_length
79
+            min_output_input_ratio: minimal ratio of
80
+ token_length / feats_length(10ms)
81
+            max_output_input_ratio: maximum ratio of
82
+ token_length / feats_length(10ms)
83
+
84
+ Returns:
85
+ Iterable[{key, wav, label, sample_rate}]
86
+ """
87
+ for sample in data:
88
+ sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
89
+ sample['speech'] = sample['speech'].mean(dim=0, keepdim=True)
90
+ del sample['audio_data']
91
+ # sample['wav'] is torch.Tensor, we have 100 frames every second
92
+ num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
93
+ if num_frames < min_length:
94
+ continue
95
+ if num_frames > max_length:
96
+ continue
97
+ if len(sample['text_token']) < token_min_length:
98
+ continue
99
+ if len(sample['text_token']) > token_max_length:
100
+ continue
101
+ if len(sample['speech_token']) == 0:
102
+ continue
103
+ if num_frames != 0:
104
+ if len(sample['text_token']) / num_frames < min_output_input_ratio:
105
+ continue
106
+ if len(sample['text_token']) / num_frames > max_output_input_ratio:
107
+ continue
108
+ yield sample
109
+
110
+
111
+ def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
112
+ """ Resample data.
113
+ Inplace operation.
114
+
115
+ Args:
116
+ data: Iterable[{key, wav, label, sample_rate}]
117
+ resample_rate: target resample rate
118
+
119
+ Returns:
120
+ Iterable[{key, wav, label, sample_rate}]
121
+ """
122
+ for sample in data:
123
+ assert 'sample_rate' in sample
124
+ assert 'speech' in sample
125
+ sample_rate = sample['sample_rate']
126
+ waveform = sample['speech']
127
+ if sample_rate != resample_rate:
128
+ if sample_rate < min_sample_rate:
129
+ continue
130
+ sample['sample_rate'] = resample_rate
131
+ sample['speech'] = torchaudio.transforms.Resample(
132
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
133
+ max_val = sample['speech'].abs().max()
134
+ if max_val > 1:
135
+ sample['speech'] /= max_val
136
+ yield sample
137
+
138
+
139
+ def truncate(data, truncate_length=24576, mode='train'):
140
+ """ Truncate data.
141
+
142
+ Args:
143
+ data: Iterable[{key, wav, label, sample_rate}]
144
+ truncate_length: truncate length
145
+
146
+ Returns:
147
+ Iterable[{key, wav, label, sample_rate}]
148
+ """
149
+ for sample in data:
150
+ waveform = sample['speech']
151
+ if waveform.shape[1] > truncate_length:
152
+ start = random.randint(0, waveform.shape[1] - truncate_length)
153
+ waveform = waveform[:, start: start + truncate_length]
154
+ else:
155
+ waveform = torch.concat([waveform, torch.zeros(1, truncate_length - waveform.shape[1])], dim=1)
156
+ sample['speech'] = waveform
157
+ yield sample
158
+
159
+
160
+ def compute_fbank(data,
161
+ feat_extractor,
162
+ mode='train'):
163
+ """ Extract fbank
164
+
165
+ Args:
166
+ data: Iterable[{key, wav, label, sample_rate}]
167
+
168
+ Returns:
169
+ Iterable[{key, feat, label}]
170
+ """
171
+ for sample in data:
172
+ assert 'sample_rate' in sample
173
+ assert 'speech' in sample
174
+ assert 'utt' in sample
175
+ assert 'text_token' in sample
176
+ waveform = sample['speech']
177
+ mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
178
+ sample['speech_feat'] = mat
179
+ yield sample
180
+
181
+
182
+ def compute_f0(data, sample_rate, hop_size, mode='train'):
183
+ """ Extract f0
184
+
185
+ Args:
186
+ data: Iterable[{key, wav, label, sample_rate}]
187
+
188
+ Returns:
189
+ Iterable[{key, feat, label}]
190
+ """
191
+ frame_period = hop_size * 1000 / sample_rate
192
+ for sample in data:
193
+ assert 'sample_rate' in sample
194
+ assert 'speech' in sample
195
+ assert 'utt' in sample
196
+ assert 'text_token' in sample
197
+ waveform = sample['speech']
198
+ _f0, t = pw.harvest(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period)
199
+ if sum(_f0 != 0) < 5: # this happens when the algorithm fails
200
+ _f0, t = pw.dio(waveform.squeeze(dim=0).numpy().astype('double'), sample_rate, frame_period=frame_period) # if harvest fails, try dio
201
+ f0 = pw.stonemask(waveform.squeeze(dim=0).numpy().astype('double'), _f0, t, sample_rate)
202
+ f0 = F.interpolate(torch.from_numpy(f0).view(1, 1, -1), size=sample['speech_feat'].shape[0], mode='linear').view(-1)
203
+ sample['pitch_feat'] = f0
204
+ yield sample
205
+
206
+
207
+ def parse_embedding(data, normalize, mode='train'):
208
+ """ Parse utt_embedding/spk_embedding
209
+
210
+ Args:
211
+ data: Iterable[{key, wav, label, sample_rate}]
212
+
213
+ Returns:
214
+ Iterable[{key, feat, label}]
215
+ """
216
+ for sample in data:
217
+ sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
218
+ sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
219
+ if normalize:
220
+ sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
221
+ sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
222
+ yield sample
223
+
224
+
225
+ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
226
+ """ Decode text to chars or BPE
227
+ Inplace operation
228
+
229
+ Args:
230
+ data: Iterable[{key, wav, txt, sample_rate}]
231
+
232
+ Returns:
233
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
234
+ """
235
+ tokenizer = get_tokenizer()
236
+ for sample in data:
237
+ assert 'text' in sample
238
+ sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
239
+ if mode == 'inference':
240
+ sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
241
+ yield sample
242
+
243
+
244
+ def shuffle(data, shuffle_size=10000, mode='train'):
245
+    """ Locally shuffle the data
246
+
247
+ Args:
248
+ data: Iterable[{key, feat, label}]
249
+ shuffle_size: buffer size for shuffle
250
+
251
+ Returns:
252
+ Iterable[{key, feat, label}]
253
+ """
254
+ buf = []
255
+ for sample in data:
256
+ buf.append(sample)
257
+ if len(buf) >= shuffle_size:
258
+ random.shuffle(buf)
259
+ for x in buf:
260
+ yield x
261
+ buf = []
262
+ # The sample left over
263
+ random.shuffle(buf)
264
+ for x in buf:
265
+ yield x
266
+
267
+
268
+ def sort(data, sort_size=500, mode='train'):
269
+ """ Sort the data by feature length.
270
+ Sort is used after shuffle and before batch, so we can group
271
+ utts with similar lengths into a batch, and `sort_size` should
272
+ be less than `shuffle_size`
273
+
274
+ Args:
275
+ data: Iterable[{key, feat, label}]
276
+ sort_size: buffer size for sort
277
+
278
+ Returns:
279
+ Iterable[{key, feat, label}]
280
+ """
281
+
282
+ buf = []
283
+ for sample in data:
284
+ buf.append(sample)
285
+ if len(buf) >= sort_size:
286
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
287
+ for x in buf:
288
+ yield x
289
+ buf = []
290
+ # The sample left over
291
+ buf.sort(key=lambda x: x['speech_feat'].size(0))
292
+ for x in buf:
293
+ yield x
294
+
295
+
296
+ def static_batch(data, batch_size=16):
297
+ """ Static batch the data by `batch_size`
298
+
299
+ Args:
300
+ data: Iterable[{key, feat, label}]
301
+ batch_size: batch size
302
+
303
+ Returns:
304
+ Iterable[List[{key, feat, label}]]
305
+ """
306
+ buf = []
307
+ for sample in data:
308
+ buf.append(sample)
309
+ if len(buf) >= batch_size:
310
+ yield buf
311
+ buf = []
312
+ if len(buf) > 0:
313
+ yield buf
314
+
315
+
316
+ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
317
+ """ Dynamic batch the data until the total frames in batch
318
+ reach `max_frames_in_batch`
319
+
320
+ Args:
321
+ data: Iterable[{key, feat, label}]
322
+ max_frames_in_batch: max_frames in one batch
323
+
324
+ Returns:
325
+ Iterable[List[{key, feat, label}]]
326
+ """
327
+ buf = []
328
+ longest_frames = 0
329
+ for sample in data:
330
+ assert 'speech_feat' in sample
331
+ assert isinstance(sample['speech_feat'], torch.Tensor)
332
+ new_sample_frames = sample['speech_feat'].size(0)
333
+ longest_frames = max(longest_frames, new_sample_frames)
334
+ frames_after_padding = longest_frames * (len(buf) + 1)
335
+ if frames_after_padding > max_frames_in_batch:
336
+ yield buf
337
+ buf = [sample]
338
+ longest_frames = new_sample_frames
339
+ else:
340
+ buf.append(sample)
341
+ if len(buf) > 0:
342
+ yield buf
343
+
344
+
345
+ def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
346
+ """ Wrapper for static/dynamic batch
347
+ """
348
+ if mode == 'inference':
349
+ return static_batch(data, 1)
350
+ else:
351
+ if batch_type == 'static':
352
+ return static_batch(data, batch_size)
353
+ elif batch_type == 'dynamic':
354
+ return dynamic_batch(data, max_frames_in_batch)
355
+ else:
356
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
357
+
358
+
359
+ def padding(data, use_spk_embedding, mode='train', gan=False):
360
+    """ Pad the data into training batches
361
+
362
+ Args:
363
+ data: Iterable[List[{key, feat, label}]]
364
+
365
+ Returns:
366
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
367
+ """
368
+ for sample in data:
369
+ assert isinstance(sample, list)
370
+ speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
371
+ dtype=torch.int32)
372
+ order = torch.argsort(speech_feat_len, descending=True)
373
+
374
+ utts = [sample[i]['utt'] for i in order]
375
+ speech = [sample[i]['speech'].squeeze(dim=0) for i in order]
376
+ speech_len = torch.tensor([i.size(0) for i in speech], dtype=torch.int32)
377
+ speech = pad_sequence(speech, batch_first=True, padding_value=0)
378
+ speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
379
+ speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
380
+ speech_token = pad_sequence(speech_token,
381
+ batch_first=True,
382
+ padding_value=0)
383
+ speech_feat = [sample[i]['speech_feat'] for i in order]
384
+ speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
385
+ speech_feat = pad_sequence(speech_feat,
386
+ batch_first=True,
387
+ padding_value=0)
388
+ text = [sample[i]['text'] for i in order]
389
+ text_token = [torch.tensor(sample[i]['text_token']) for i in order]
390
+ text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
391
+ text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
392
+ utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
393
+ spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
394
+ batch = {
395
+ "utts": utts,
396
+ "speech": speech,
397
+ "speech_len": speech_len,
398
+ "speech_token": speech_token,
399
+ "speech_token_len": speech_token_len,
400
+ "speech_feat": speech_feat,
401
+ "speech_feat_len": speech_feat_len,
402
+ "text": text,
403
+ "text_token": text_token,
404
+ "text_token_len": text_token_len,
405
+ "utt_embedding": utt_embedding,
406
+ "spk_embedding": spk_embedding,
407
+ }
408
+ if gan is True:
409
+ # in gan train, we need pitch_feat
410
+ pitch_feat = [sample[i]['pitch_feat'] for i in order]
411
+ pitch_feat_len = torch.tensor([i.size(0) for i in pitch_feat], dtype=torch.int32)
412
+ pitch_feat = pad_sequence(pitch_feat,
413
+ batch_first=True,
414
+ padding_value=0)
415
+ batch["pitch_feat"] = pitch_feat
416
+ batch["pitch_feat_len"] = pitch_feat_len
417
+ else:
418
+ # only gan train needs speech, delete it to save memory
419
+ del batch["speech"]
420
+ del batch["speech_len"]
421
+ if mode == 'inference':
422
+ tts_text = [sample[i]['tts_text'] for i in order]
423
+ tts_index = [sample[i]['tts_index'] for i in order]
424
+ tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
425
+ tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
426
+ tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
427
+ batch.update({'tts_text': tts_text,
428
+ 'tts_index': tts_index,
429
+ 'tts_text_token': tts_text_token,
430
+ 'tts_text_token_len': tts_text_token_len})
431
+ if use_spk_embedding is True:
432
+ batch["embedding"] = batch["spk_embedding"]
433
+ else:
434
+ batch["embedding"] = batch["utt_embedding"]
435
+ yield batch
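For reference, a hedged sketch of how these processors are composed into the `data_pipeline` list expected by Dataset in tts/cosyvoice/dataset/dataset.py. In CosyVoice-style setups this list is usually assembled from YAML; the `get_tokenizer` and `feat_extractor` factories below are placeholders, not names defined in this diff:

    from functools import partial

    from cosyvoice.dataset import processor
    from cosyvoice.dataset.dataset import Dataset

    data_pipeline = [
        processor.parquet_opener,
        partial(processor.tokenize, get_tokenizer=get_tokenizer, allowed_special='all'),  # get_tokenizer: assumed factory
        processor.filter,
        partial(processor.resample, resample_rate=22050),
        partial(processor.compute_fbank, feat_extractor=feat_extractor),                  # feat_extractor: assumed callable
        partial(processor.parse_embedding, normalize=True),
        processor.shuffle,
        processor.sort,
        partial(processor.batch, batch_type='dynamic', max_frames_in_batch=12000),
        partial(processor.padding, use_spk_embedding=False),
    ]
    train_dataset = Dataset('data/train.data.list',      # placeholder path
                            data_pipeline=data_pipeline,
                            mode='train', gan=False, shuffle=True, partition=True)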
tts/cosyvoice/flow/decoder.py ADDED
@@ -0,0 +1,301 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from einops import pack, rearrange, repeat
18
+ from cosyvoice.utils.common import mask_to_bias
19
+ from cosyvoice.utils.mask import add_optional_chunk_mask
20
+ from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
21
+ from matcha.models.components.transformer import BasicTransformerBlock
22
+
23
+
24
+ class Transpose(torch.nn.Module):
25
+ def __init__(self, dim0: int, dim1: int):
26
+ super().__init__()
27
+ self.dim0 = dim0
28
+ self.dim1 = dim1
29
+
30
+ def forward(self, x: torch.Tensor):
31
+ x = torch.transpose(x, self.dim0, self.dim1)
32
+ return x
33
+
34
+
35
+ class CausalBlock1D(Block1D):
36
+ def __init__(self, dim: int, dim_out: int):
37
+ super(CausalBlock1D, self).__init__(dim, dim_out)
38
+ self.block = torch.nn.Sequential(
39
+ CausalConv1d(dim, dim_out, 3),
40
+ Transpose(1, 2),
41
+ nn.LayerNorm(dim_out),
42
+ Transpose(1, 2),
43
+ nn.Mish(),
44
+ )
45
+
46
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
47
+ output = self.block(x * mask)
48
+ return output * mask
49
+
50
+
51
+ class CausalResnetBlock1D(ResnetBlock1D):
52
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
53
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
54
+ self.block1 = CausalBlock1D(dim, dim_out)
55
+ self.block2 = CausalBlock1D(dim_out, dim_out)
56
+
57
+
58
+ class CausalConv1d(torch.nn.Conv1d):
59
+ def __init__(
60
+ self,
61
+ in_channels: int,
62
+ out_channels: int,
63
+ kernel_size: int,
64
+ stride: int = 1,
65
+ dilation: int = 1,
66
+ groups: int = 1,
67
+ bias: bool = True,
68
+ padding_mode: str = 'zeros',
69
+ device=None,
70
+ dtype=None
71
+ ) -> None:
72
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
73
+ kernel_size, stride,
74
+ padding=0, dilation=dilation,
75
+ groups=groups, bias=bias,
76
+ padding_mode=padding_mode,
77
+ device=device, dtype=dtype)
78
+ assert stride == 1
79
+ self.causal_padding = (kernel_size - 1, 0)
80
+
81
+ def forward(self, x: torch.Tensor):
82
+ x = F.pad(x, self.causal_padding)
83
+ x = super(CausalConv1d, self).forward(x)
84
+ return x
85
+
86
+
87
+ class ConditionalDecoder(nn.Module):
88
+ def __init__(
89
+ self,
90
+ in_channels,
91
+ out_channels,
92
+ causal=False,
93
+ channels=(256, 256),
94
+ dropout=0.05,
95
+ attention_head_dim=64,
96
+ n_blocks=1,
97
+ num_mid_blocks=2,
98
+ num_heads=4,
99
+ act_fn="snake",
100
+ ):
101
+ """
102
+        This decoder requires an input with the same shape as the target. So, if your text content
103
+        is shorter or longer than the output, please re-sample it before feeding it to the decoder.
104
+ """
105
+ super().__init__()
106
+ channels = tuple(channels)
107
+ self.in_channels = in_channels
108
+ self.out_channels = out_channels
109
+ self.causal = causal
110
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
111
+ time_embed_dim = channels[0] * 4
112
+ self.time_mlp = TimestepEmbedding(
113
+ in_channels=in_channels,
114
+ time_embed_dim=time_embed_dim,
115
+ act_fn="silu",
116
+ )
117
+ self.down_blocks = nn.ModuleList([])
118
+ self.mid_blocks = nn.ModuleList([])
119
+ self.up_blocks = nn.ModuleList([])
120
+
121
+ output_channel = in_channels
122
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
123
+ input_channel = output_channel
124
+ output_channel = channels[i]
125
+ is_last = i == len(channels) - 1
126
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
127
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
128
+ transformer_blocks = nn.ModuleList(
129
+ [
130
+ BasicTransformerBlock(
131
+ dim=output_channel,
132
+ num_attention_heads=num_heads,
133
+ attention_head_dim=attention_head_dim,
134
+ dropout=dropout,
135
+ activation_fn=act_fn,
136
+ )
137
+ for _ in range(n_blocks)
138
+ ]
139
+ )
140
+ downsample = (
141
+ Downsample1D(output_channel) if not is_last else
142
+ CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
143
+ )
144
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
145
+
146
+ for _ in range(num_mid_blocks):
147
+ input_channel = channels[-1]
148
+ out_channels = channels[-1]
149
+ resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) if self.causal else \
150
+ ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
151
+
152
+ transformer_blocks = nn.ModuleList(
153
+ [
154
+ BasicTransformerBlock(
155
+ dim=output_channel,
156
+ num_attention_heads=num_heads,
157
+ attention_head_dim=attention_head_dim,
158
+ dropout=dropout,
159
+ activation_fn=act_fn,
160
+ )
161
+ for _ in range(n_blocks)
162
+ ]
163
+ )
164
+
165
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
166
+
167
+ channels = channels[::-1] + (channels[0],)
168
+ for i in range(len(channels) - 1):
169
+ input_channel = channels[i] * 2
170
+ output_channel = channels[i + 1]
171
+ is_last = i == len(channels) - 2
172
+ resnet = CausalResnetBlock1D(
173
+ dim=input_channel,
174
+ dim_out=output_channel,
175
+ time_emb_dim=time_embed_dim,
176
+ ) if self.causal else ResnetBlock1D(
177
+ dim=input_channel,
178
+ dim_out=output_channel,
179
+ time_emb_dim=time_embed_dim,
180
+ )
181
+ transformer_blocks = nn.ModuleList(
182
+ [
183
+ BasicTransformerBlock(
184
+ dim=output_channel,
185
+ num_attention_heads=num_heads,
186
+ attention_head_dim=attention_head_dim,
187
+ dropout=dropout,
188
+ activation_fn=act_fn,
189
+ )
190
+ for _ in range(n_blocks)
191
+ ]
192
+ )
193
+ upsample = (
194
+ Upsample1D(output_channel, use_conv_transpose=True)
195
+ if not is_last
196
+ else CausalConv1d(output_channel, output_channel, 3) if self.causal else nn.Conv1d(output_channel, output_channel, 3, padding=1)
197
+ )
198
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
199
+ self.final_block = CausalBlock1D(channels[-1], channels[-1]) if self.causal else Block1D(channels[-1], channels[-1])
200
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
201
+ self.initialize_weights()
202
+
203
+ def initialize_weights(self):
204
+ for m in self.modules():
205
+ if isinstance(m, nn.Conv1d):
206
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
207
+ if m.bias is not None:
208
+ nn.init.constant_(m.bias, 0)
209
+ elif isinstance(m, nn.GroupNorm):
210
+ nn.init.constant_(m.weight, 1)
211
+ nn.init.constant_(m.bias, 0)
212
+ elif isinstance(m, nn.Linear):
213
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
214
+ if m.bias is not None:
215
+ nn.init.constant_(m.bias, 0)
216
+
217
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
218
+ """Forward pass of the UNet1DConditional model.
219
+
220
+ Args:
221
+ x (torch.Tensor): shape (batch_size, in_channels, time)
222
+ mask (torch.Tensor): shape (batch_size, 1, time)
+ mu (torch.Tensor): encoder output, same shape as x
223
+ t (torch.Tensor): shape (batch_size)
224
+ spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
225
+ cond (torch.Tensor, optional): conditioning features concatenated to the input. Defaults to None.
226
+
227
+ Returns:
+ torch.Tensor: shape (batch_size, out_channels, time)
233
+ """
234
+
235
+ t = self.time_embeddings(t).to(t.dtype)
236
+ t = self.time_mlp(t)
237
+
238
+ x = pack([x, mu], "b * t")[0]
239
+
240
+ if spks is not None:
241
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
242
+ x = pack([x, spks], "b * t")[0]
243
+ if cond is not None:
244
+ x = pack([x, cond], "b * t")[0]
245
+
246
+ hiddens = []
247
+ masks = [mask]
248
+ for resnet, transformer_blocks, downsample in self.down_blocks:
249
+ mask_down = masks[-1]
250
+ x = resnet(x, mask_down, t)
251
+ x = rearrange(x, "b c t -> b t c").contiguous()
252
+ # attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
253
+ attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1)
254
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
255
+ for transformer_block in transformer_blocks:
256
+ x = transformer_block(
257
+ hidden_states=x,
258
+ attention_mask=attn_mask,
259
+ timestep=t,
260
+ )
261
+ x = rearrange(x, "b t c -> b c t").contiguous()
262
+ hiddens.append(x) # Save hidden states for skip connections
263
+ x = downsample(x * mask_down)
264
+ masks.append(mask_down[:, :, ::2])
265
+ masks = masks[:-1]
266
+ mask_mid = masks[-1]
267
+
268
+ for resnet, transformer_blocks in self.mid_blocks:
269
+ x = resnet(x, mask_mid, t)
270
+ x = rearrange(x, "b c t -> b t c").contiguous()
271
+ # attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
272
+ attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1)
273
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
274
+ for transformer_block in transformer_blocks:
275
+ x = transformer_block(
276
+ hidden_states=x,
277
+ attention_mask=attn_mask,
278
+ timestep=t,
279
+ )
280
+ x = rearrange(x, "b t c -> b c t").contiguous()
281
+
282
+ for resnet, transformer_blocks, upsample in self.up_blocks:
283
+ mask_up = masks.pop()
284
+ skip = hiddens.pop()
285
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
286
+ x = resnet(x, mask_up, t)
287
+ x = rearrange(x, "b c t -> b t c").contiguous()
288
+ # attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
289
+ attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1)
290
+ attn_mask = mask_to_bias(attn_mask == 1, x.dtype)
291
+ for transformer_block in transformer_blocks:
292
+ x = transformer_block(
293
+ hidden_states=x,
294
+ attention_mask=attn_mask,
295
+ timestep=t,
296
+ )
297
+ x = rearrange(x, "b t c -> b c t").contiguous()
298
+ x = upsample(x * mask_up)
299
+ x = self.final_block(x, mask_up)
300
+ output = self.final_proj(x * mask_up)
301
+ return output * mask
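A rough sketch (with made-up shapes, not the exact model dimensions) of how the forward above assembles its channel-wise conditioning before the U-Net: the noisy input x, the encoder output mu, a speaker embedding broadcast over time, and an optional cond are stacked along the channel axis with einops.pack.

import torch
from einops import pack, repeat

B, C, T = 2, 80, 100
x = torch.randn(B, C, T)        # noisy mel at the current flow step
mu = torch.randn(B, C, T)       # encoder output at the same time resolution
spks = torch.randn(B, C)        # projected speaker embedding
cond = torch.randn(B, C, T)     # prompt / inpainting condition

h, _ = pack([x, mu], "b * t")                                   # (B, 2*C, T)
h, _ = pack([h, repeat(spks, "b c -> b c t", t=T)], "b * t")    # (B, 3*C, T)
h, _ = pack([h, cond], "b * t")                                 # (B, 4*C, T)
assert h.shape == (B, 4 * C, T)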
tts/cosyvoice/flow/flow.py ADDED
@@ -0,0 +1,239 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+ from typing import Dict, Optional
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ from omegaconf import DictConfig
21
+ from cosyvoice.utils.mask import make_pad_mask
22
+
23
+
24
+ class MaskedDiffWithXvec(torch.nn.Module):
25
+ def __init__(self,
26
+ input_size: int = 512,
27
+ output_size: int = 80,
28
+ spk_embed_dim: int = 192,
29
+ output_type: str = "mel",
30
+ vocab_size: int = 4096,
31
+ input_frame_rate: int = 50,
32
+ only_mask_loss: bool = True,
33
+ encoder: torch.nn.Module = None,
34
+ length_regulator: torch.nn.Module = None,
35
+ decoder: torch.nn.Module = None,
36
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
37
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
38
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
39
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
40
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
41
+ mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
42
+ 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
43
+ super().__init__()
44
+ self.input_size = input_size
45
+ self.output_size = output_size
46
+ self.decoder_conf = decoder_conf
47
+ self.mel_feat_conf = mel_feat_conf
48
+ self.vocab_size = vocab_size
49
+ self.output_type = output_type
50
+ self.input_frame_rate = input_frame_rate
51
+ logging.info(f"input frame rate={self.input_frame_rate}")
52
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
53
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
54
+ self.encoder = encoder
55
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
56
+ self.decoder = decoder
57
+ self.length_regulator = length_regulator
58
+ self.only_mask_loss = only_mask_loss
59
+
60
+ def forward(
61
+ self,
62
+ batch: dict,
63
+ device: torch.device,
64
+ ) -> Dict[str, Optional[torch.Tensor]]:
65
+ token = batch['speech_token'].to(device)
66
+ token_len = batch['speech_token_len'].to(device)
67
+ feat = batch['speech_feat'].to(device)
68
+ feat_len = batch['speech_feat_len'].to(device)
69
+ embedding = batch['embedding'].to(device)
70
+
71
+ # xvec projection
72
+ embedding = F.normalize(embedding, dim=1)
73
+ embedding = self.spk_embed_affine_layer(embedding)
74
+
75
+ # concat text and prompt_text
76
+ mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
77
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
78
+
79
+ # text encode
80
+ h, h_lengths = self.encoder(token, token_len)
81
+ h = self.encoder_proj(h)
82
+ h, h_lengths = self.length_regulator(h, feat_len)
83
+
84
+ # get conditions
85
+ conds = torch.zeros(feat.shape, device=token.device)
86
+ for i, j in enumerate(feat_len):
87
+ if random.random() < 0.5:
88
+ continue
89
+ index = random.randint(0, int(0.3 * j))
90
+ conds[i, :index] = feat[i, :index]
91
+ conds = conds.transpose(1, 2)
92
+
93
+ mask = (~make_pad_mask(feat_len)).to(h)
94
+ feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
95
+ loss, _ = self.decoder.compute_loss(
96
+ feat.transpose(1, 2).contiguous(),
97
+ mask.unsqueeze(1),
98
+ h.transpose(1, 2).contiguous(),
99
+ embedding,
100
+ cond=conds
101
+ )
102
+ return {'loss': loss}
103
+
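For reference, a self-contained sketch of the conditioning trick used in the training forward above: with probability 0.5 an utterance gets an all-zero condition, otherwise a random prefix of up to 30% of its frames is copied in, which teaches the decoder to continue from a mel prompt. Shapes are illustrative.

import random
import torch

feat = torch.randn(4, 300, 80)                    # (batch, mel_frames, n_mels), toy values
feat_len = torch.tensor([300, 250, 280, 200])

conds = torch.zeros_like(feat)
for i, j in enumerate(feat_len):
    if random.random() < 0.5:
        continue                                  # half the batch trains unconditionally
    index = random.randint(0, int(0.3 * j))
    conds[i, :index] = feat[i, :index]            # expose a random mel prefix as the prompt
conds = conds.transpose(1, 2)                     # (batch, n_mels, mel_frames) for the decoder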
104
+ @torch.inference_mode()
105
+ def inference(self,
106
+ token,
107
+ token_len,
108
+ prompt_token,
109
+ prompt_token_len,
110
+ prompt_feat,
111
+ prompt_feat_len,
112
+ embedding,
113
+ flow_cache):
114
+ if self.fp16 is True:
115
+ prompt_feat = prompt_feat.half()
116
+ embedding = embedding.half()
117
+
118
+ assert token.shape[0] == 1
119
+ # xvec projection
120
+ embedding = F.normalize(embedding, dim=1)
121
+ embedding = self.spk_embed_affine_layer(embedding)
122
+
123
+ # concat text and prompt_text
124
+ token_len1, token_len2 = prompt_token.shape[1], token.shape[1]
125
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
126
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
127
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
128
+
129
+ # text encode
130
+ h, h_lengths = self.encoder(token, token_len)
131
+ h = self.encoder_proj(h)
132
+ mel_len1, mel_len2 = prompt_feat.shape[1], int(token_len2 / self.input_frame_rate * 22050 / 256)
133
+ h, h_lengths = self.length_regulator.inference(h[:, :token_len1], h[:, token_len1:], mel_len1, mel_len2, self.input_frame_rate)
134
+
135
+ # get conditions
136
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
137
+ conds[:, :mel_len1] = prompt_feat
138
+ conds = conds.transpose(1, 2)
139
+
140
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
141
+ feat, flow_cache = self.decoder(
142
+ mu=h.transpose(1, 2).contiguous(),
143
+ mask=mask.unsqueeze(1),
144
+ spks=embedding,
145
+ cond=conds,
146
+ n_timesteps=10,
147
+ prompt_len=mel_len1,
148
+ flow_cache=flow_cache
149
+ )
150
+ feat = feat[:, :, mel_len1:]
151
+ assert feat.shape[2] == mel_len2
152
+ return feat.float(), flow_cache
153
+
154
+
155
+ class CausalMaskedDiffWithXvec(torch.nn.Module):
156
+ def __init__(self,
157
+ input_size: int = 512,
158
+ output_size: int = 80,
159
+ spk_embed_dim: int = 192,
160
+ output_type: str = "mel",
161
+ vocab_size: int = 4096,
162
+ input_frame_rate: int = 50,
163
+ only_mask_loss: bool = True,
164
+ token_mel_ratio: int = 2,
165
+ pre_lookahead_len: int = 3,
166
+ encoder: torch.nn.Module = None,
167
+ decoder: torch.nn.Module = None,
168
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1,
169
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
170
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
171
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
172
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
173
+ mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050,
174
+ 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
175
+ super().__init__()
176
+ self.input_size = input_size
177
+ self.output_size = output_size
178
+ self.decoder_conf = decoder_conf
179
+ self.mel_feat_conf = mel_feat_conf
180
+ self.vocab_size = vocab_size
181
+ self.output_type = output_type
182
+ self.input_frame_rate = input_frame_rate
183
+ logging.info(f"input frame rate={self.input_frame_rate}")
184
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
185
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
186
+ self.encoder = encoder
187
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
188
+ self.decoder = decoder
189
+ self.only_mask_loss = only_mask_loss
190
+ self.token_mel_ratio = token_mel_ratio
191
+ self.pre_lookahead_len = pre_lookahead_len
192
+
193
+ @torch.inference_mode()
194
+ def inference(self,
195
+ token,
196
+ token_len,
197
+ prompt_token,
198
+ prompt_token_len,
199
+ prompt_feat,
200
+ prompt_feat_len,
201
+ embedding,
202
+ finalize):
203
+ if self.fp16 is True:
204
+ prompt_feat = prompt_feat.half()
205
+ embedding = embedding.half()
206
+
207
+ assert token.shape[0] == 1
208
+ # xvec projection
209
+ embedding = F.normalize(embedding, dim=1)
210
+ embedding = self.spk_embed_affine_layer(embedding)
211
+
212
+ # concat text and prompt_text
213
+ token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
214
+ mask = (~make_pad_mask(token_len)).unsqueeze(-1).to(embedding)
215
+ token = self.input_embedding(torch.clamp(token, min=0)) * mask
216
+
217
+ # text encode
218
+ h, h_lengths = self.encoder(token, token_len)
219
+ if finalize is False:
220
+ h = h[:, :-self.pre_lookahead_len * self.token_mel_ratio]
221
+ mel_len1, mel_len2 = prompt_feat.shape[1], h.shape[1] - prompt_feat.shape[1]
222
+ h = self.encoder_proj(h)
223
+
224
+ # get conditions
225
+ conds = torch.zeros([1, mel_len1 + mel_len2, self.output_size], device=token.device).to(h.dtype)
226
+ conds[:, :mel_len1] = prompt_feat
227
+ conds = conds.transpose(1, 2)
228
+
229
+ mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h)
230
+ feat, _ = self.decoder(
231
+ mu=h.transpose(1, 2).contiguous(),
232
+ mask=mask.unsqueeze(1),
233
+ spks=embedding,
234
+ cond=conds,
235
+ n_timesteps=10
236
+ )
237
+ feat = feat[:, :, mel_len1:]
238
+ assert feat.shape[2] == mel_len2
239
+ return feat.float(), None
tts/cosyvoice/flow/flow_matching.py ADDED
@@ -0,0 +1,217 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import threading
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from matcha.models.components.flow_matching import BASECFM
18
+
19
+
20
+ class ConditionalCFM(BASECFM):
21
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
22
+ super().__init__(
23
+ n_feats=in_channels,
24
+ cfm_params=cfm_params,
25
+ n_spks=n_spks,
26
+ spk_emb_dim=spk_emb_dim,
27
+ )
28
+ self.t_scheduler = cfm_params.t_scheduler
29
+ self.training_cfg_rate = cfm_params.training_cfg_rate
30
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
31
+ in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
32
+ # Just change the architecture of the estimator here
33
+ self.estimator = estimator
34
+ self.lock = threading.Lock()
35
+
36
+ @torch.inference_mode()
37
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, prompt_len=0, flow_cache=torch.zeros(1, 80, 0, 2)):
38
+ """Forward diffusion
39
+
40
+ Args:
41
+ mu (torch.Tensor): output of encoder
42
+ shape: (batch_size, n_feats, mel_timesteps)
43
+ mask (torch.Tensor): output_mask
44
+ shape: (batch_size, 1, mel_timesteps)
45
+ n_timesteps (int): number of diffusion steps
46
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
47
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
48
+ shape: (batch_size, spk_emb_dim)
49
+ cond (torch.Tensor, optional): conditioning features (e.g. prompt mel) passed to the estimator
50
+
51
+ Returns:
52
+ sample: generated mel-spectrogram
53
+ shape: (batch_size, n_feats, mel_timesteps)
54
+ """
55
+
56
+ z = torch.randn_like(mu).to(mu.device).to(mu.dtype) * temperature
57
+ cache_size = flow_cache.shape[2]
58
+ # fix prompt and overlap part mu and z
59
+ if cache_size != 0:
60
+ z[:, :, :cache_size] = flow_cache[:, :, :, 0]
61
+ mu[:, :, :cache_size] = flow_cache[:, :, :, 1]
62
+ z_cache = torch.concat([z[:, :, :prompt_len], z[:, :, -34:]], dim=2)
63
+ mu_cache = torch.concat([mu[:, :, :prompt_len], mu[:, :, -34:]], dim=2)
64
+ flow_cache = torch.stack([z_cache, mu_cache], dim=-1)
65
+
66
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
67
+ if self.t_scheduler == 'cosine':
68
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
69
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), flow_cache
70
+
71
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
72
+ """
73
+ Fixed euler solver for ODEs.
74
+ Args:
75
+ x (torch.Tensor): random noise
76
+ t_span (torch.Tensor): n_timesteps interpolated
77
+ shape: (n_timesteps + 1,)
78
+ mu (torch.Tensor): output of encoder
79
+ shape: (batch_size, n_feats, mel_timesteps)
80
+ mask (torch.Tensor): output_mask
81
+ shape: (batch_size, 1, mel_timesteps)
82
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
83
+ shape: (batch_size, spk_emb_dim)
84
+ cond (torch.Tensor, optional): conditioning features (e.g. prompt mel) passed to the estimator
85
+ """
86
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
87
+ t = t.unsqueeze(dim=0)
88
+
89
+ # intermediate solutions are stored so they can be inspected or plotted while debugging
90
+ # (a return_all_steps flag could be added in the future)
91
+ sol = []
92
+
93
+ # Do not use concat here: it may change the memory format and make TensorRT inference return wrong results!
94
+ x_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
95
+ mask_in = torch.zeros([2, 1, x.size(2)], device=x.device, dtype=x.dtype)
96
+ mu_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
97
+ t_in = torch.zeros([2], device=x.device, dtype=x.dtype)
98
+ spks_in = torch.zeros([2, 80], device=x.device, dtype=x.dtype)
99
+ cond_in = torch.zeros([2, 80, x.size(2)], device=x.device, dtype=x.dtype)
100
+ for step in range(1, len(t_span)):
101
+ # Classifier-Free Guidance inference introduced in VoiceBox
102
+ x_in[:] = x
103
+ mask_in[:] = mask
104
+ mu_in[0] = mu
105
+ t_in[:] = t.unsqueeze(0)
106
+ spks_in[0] = spks
107
+ cond_in[0] = cond
108
+ dphi_dt = self.forward_estimator(
109
+ x_in, mask_in,
110
+ mu_in, t_in,
111
+ spks_in,
112
+ cond_in
113
+ )
114
+ dphi_dt, cfg_dphi_dt = torch.split(dphi_dt, [x.size(0), x.size(0)], dim=0)
115
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt - self.inference_cfg_rate * cfg_dphi_dt)
116
+ x = x + dt * dphi_dt
117
+ t = t + dt
118
+ sol.append(x)
119
+ if step < len(t_span) - 1:
120
+ dt = t_span[step + 1] - t
121
+
122
+ return sol[-1].float()
123
+
124
+ def forward_estimator(self, x, mask, mu, t, spks, cond):
125
+ if isinstance(self.estimator, torch.nn.Module):
126
+ return self.estimator.forward(x, mask, mu, t, spks, cond)
127
+ else:
128
+ with self.lock:
129
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
130
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
131
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
132
+ self.estimator.set_input_shape('t', (2,))
133
+ self.estimator.set_input_shape('spks', (2, 80))
134
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
135
+ # run trt engine
136
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
137
+ mask.contiguous().data_ptr(),
138
+ mu.contiguous().data_ptr(),
139
+ t.contiguous().data_ptr(),
140
+ spks.contiguous().data_ptr(),
141
+ cond.contiguous().data_ptr(),
142
+ x.data_ptr()])
143
+ return x
144
+
145
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
146
+ """Computes diffusion loss
147
+
148
+ Args:
149
+ x1 (torch.Tensor): Target
150
+ shape: (batch_size, n_feats, mel_timesteps)
151
+ mask (torch.Tensor): target mask
152
+ shape: (batch_size, 1, mel_timesteps)
153
+ mu (torch.Tensor): output of encoder
154
+ shape: (batch_size, n_feats, mel_timesteps)
155
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
156
+ shape: (batch_size, spk_emb_dim)
157
+
158
+ Returns:
159
+ loss: conditional flow matching loss
160
+ y: conditional flow
161
+ shape: (batch_size, n_feats, mel_timesteps)
162
+ """
163
+ b, _, t = mu.shape
164
+
165
+ # random timestep
166
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
167
+ if self.t_scheduler == 'cosine':
168
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
169
+ # sample noise p(x_0)
170
+ z = torch.randn_like(x1)
171
+
172
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
173
+ u = x1 - (1 - self.sigma_min) * z
174
+
175
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
176
+ if self.training_cfg_rate > 0:
177
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
178
+ mu = mu * cfg_mask.view(-1, 1, 1)
179
+ spks = spks * cfg_mask.view(-1, 1)
180
+ cond = cond * cfg_mask.view(-1, 1, 1)
181
+
182
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
183
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
184
+ return loss, y
185
+
186
+
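For readers skimming compute_loss above, a compact sketch of the flow-matching target it regresses against (values are toy tensors; sigma_min matches the default config):

import torch

sigma_min = 1e-6
x1 = torch.randn(2, 80, 100)                      # clean target mel
z = torch.randn_like(x1)                          # sampled noise
t = torch.rand(2, 1, 1)                           # one random timestep per utterance

y_t = (1 - (1 - sigma_min) * t) * z + t * x1      # point on the probability path at time t
u = x1 - (1 - sigma_min) * z                      # constant velocity target along that path
# the estimator is trained so that estimator(y_t, t, ...) ≈ u under a masked MSE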
187
+ class CausalConditionalCFM(ConditionalCFM):
188
+ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
189
+ super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator)
190
+ self.rand_noise = torch.randn([1, 80, 50 * 300])
191
+
192
+ @torch.inference_mode()
193
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
194
+ """Forward diffusion
195
+
196
+ Args:
197
+ mu (torch.Tensor): output of encoder
198
+ shape: (batch_size, n_feats, mel_timesteps)
199
+ mask (torch.Tensor): output_mask
200
+ shape: (batch_size, 1, mel_timesteps)
201
+ n_timesteps (int): number of diffusion steps
202
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
203
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
204
+ shape: (batch_size, spk_emb_dim)
205
+ cond (torch.Tensor, optional): conditioning features (e.g. prompt mel) passed to the estimator
206
+
207
+ Returns:
208
+ sample: generated mel-spectrogram
209
+ shape: (batch_size, n_feats, mel_timesteps)
210
+ """
211
+
212
+ z = self.rand_noise[:, :, :mu.size(2)].to(mu.device).to(mu.dtype) * temperature
213
+ # fix prompt and overlap part mu and z
214
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
215
+ if self.t_scheduler == 'cosine':
216
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
217
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond), None
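A simplified, self-contained sketch of the classifier-free-guidance Euler update performed by solve_euler above; the real code batches the conditional and unconditional passes into one estimator call, while here a dummy callable stands in for the network:

import torch

def euler_cfg(x, estimator, n_timesteps, cfg_rate=0.7, t_scheduler="cosine"):
    # x: (B, n_feats, T) starting noise; estimator(x, t, conditional) -> velocity field
    t_span = torch.linspace(0, 1, n_timesteps + 1)
    if t_scheduler == "cosine":
        t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
    t = t_span[0]
    for step in range(1, len(t_span)):
        dt = t_span[step] - t
        v_cond = estimator(x, t, conditional=True)      # with mu / spks / cond
        v_uncond = estimator(x, t, conditional=False)   # conditions zeroed out
        v = (1.0 + cfg_rate) * v_cond - cfg_rate * v_uncond
        x = x + dt * v
        t = t_span[step]
    return x

# toy usage with a stand-in estimator
dummy = lambda x, t, conditional: -x if conditional else -0.5 * x
out = euler_cfg(torch.randn(1, 80, 50), dummy, n_timesteps=10)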
tts/cosyvoice/flow/length_regulator.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Tuple
15
+ import torch.nn as nn
16
+ import torch
17
+ from torch.nn import functional as F
18
+ from cosyvoice.utils.mask import make_pad_mask
19
+
20
+
21
+ class InterpolateRegulator(nn.Module):
22
+ def __init__(
23
+ self,
24
+ channels: int,
25
+ sampling_ratios: Tuple,
26
+ out_channels: int = None,
27
+ groups: int = 1,
28
+ ):
29
+ super().__init__()
30
+ self.sampling_ratios = sampling_ratios
31
+ out_channels = out_channels or channels
32
+ model = nn.ModuleList([])
33
+ if len(sampling_ratios) > 0:
34
+ for _ in sampling_ratios:
35
+ module = nn.Conv1d(channels, channels, 3, 1, 1)
36
+ norm = nn.GroupNorm(groups, channels)
37
+ act = nn.Mish()
38
+ model.extend([module, norm, act])
39
+ model.append(
40
+ nn.Conv1d(channels, out_channels, 1, 1)
41
+ )
42
+ self.model = nn.Sequential(*model)
43
+
44
+ def forward(self, x, ylens=None):
45
+ # x in (B, T, D)
46
+ mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
47
+ x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
48
+ out = self.model(x).transpose(1, 2).contiguous()
49
+ olens = ylens
50
+ return out * mask, olens
51
+
52
+ def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
53
+ # in inference mode, interpolate the prompt token and the token (head/mid/tail) separately, so we can get a clear separation point in the mel
54
+ # x in (B, T, D)
55
+ if x2.shape[1] > 40:
56
+ x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
57
+ x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
58
+ mode='linear')
59
+ x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
60
+ x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
61
+ else:
62
+ x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
63
+ if x1.shape[1] != 0:
64
+ x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
65
+ x = torch.concat([x1, x2], dim=2)
66
+ else:
67
+ x = x2
68
+ out = self.model(x).transpose(1, 2).contiguous()
69
+ return out, mel_len1 + mel_len2
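A short sketch of the rate conversion this regulator performs: speech tokens arrive at input_frame_rate (50 Hz by default) while mels are produced at sampling_rate / hop_size = 22050 / 256 ≈ 86 frames per second, so token-rate features are linearly interpolated to the target mel length. The numbers below just restate those defaults.

import torch
import torch.nn.functional as F

input_frame_rate = 50                       # speech-token rate (Hz)
sampling_rate, hop_size = 22050, 256        # mel frame rate ≈ 86.13 Hz

token_feats = torch.randn(1, 120, 512)      # (B, n_tokens, D), toy values
n_tokens = token_feats.shape[1]
mel_len = int(n_tokens / input_frame_rate * sampling_rate / hop_size)   # ≈ 206 frames

stretched = F.interpolate(token_feats.transpose(1, 2), size=mel_len, mode="linear")
print(stretched.shape)                      # torch.Size([1, 512, 206])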
tts/cosyvoice/hifigan/discriminator.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn.utils.parametrizations import weight_norm
4
+ from typing import List, Optional, Tuple
5
+ from einops import rearrange
6
+ from torchaudio.transforms import Spectrogram
7
+
8
+
9
+ class MultipleDiscriminator(nn.Module):
10
+ def __init__(
11
+ self, mpd: nn.Module, mrd: nn.Module
12
+ ):
13
+ super().__init__()
14
+ self.mpd = mpd
15
+ self.mrd = mrd
16
+
17
+ def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
18
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
19
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mpd(y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1))
20
+ y_d_rs += this_y_d_rs
21
+ y_d_gs += this_y_d_gs
22
+ fmap_rs += this_fmap_rs
23
+ fmap_gs += this_fmap_gs
24
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mrd(y, y_hat)
25
+ y_d_rs += this_y_d_rs
26
+ y_d_gs += this_y_d_gs
27
+ fmap_rs += this_fmap_rs
28
+ fmap_gs += this_fmap_gs
29
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
30
+
31
+
32
+ class MultiResolutionDiscriminator(nn.Module):
33
+ def __init__(
34
+ self,
35
+ fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
36
+ num_embeddings: Optional[int] = None,
37
+ ):
38
+ """
39
+ Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
40
+ Additionally, it allows incorporating conditional information with a learned embeddings table.
41
+
42
+ Args:
43
+ fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
44
+ num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
45
+ Defaults to None.
46
+ """
47
+
48
+ super().__init__()
49
+ self.discriminators = nn.ModuleList(
50
+ [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
51
+ )
52
+
53
+ def forward(
54
+ self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
55
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
56
+ y_d_rs = []
57
+ y_d_gs = []
58
+ fmap_rs = []
59
+ fmap_gs = []
60
+
61
+ for d in self.discriminators:
62
+ y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
63
+ y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
64
+ y_d_rs.append(y_d_r)
65
+ fmap_rs.append(fmap_r)
66
+ y_d_gs.append(y_d_g)
67
+ fmap_gs.append(fmap_g)
68
+
69
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
70
+
71
+
72
+ class DiscriminatorR(nn.Module):
73
+ def __init__(
74
+ self,
75
+ window_length: int,
76
+ num_embeddings: Optional[int] = None,
77
+ channels: int = 32,
78
+ hop_factor: float = 0.25,
79
+ bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
80
+ ):
81
+ super().__init__()
82
+ self.window_length = window_length
83
+ self.hop_factor = hop_factor
84
+ self.spec_fn = Spectrogram(
85
+ n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
86
+ )
87
+ n_fft = window_length // 2 + 1
88
+ bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
89
+ self.bands = bands
90
+ convs = lambda: nn.ModuleList(
91
+ [
92
+ weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
93
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
94
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
95
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
96
+ weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
97
+ ]
98
+ )
99
+ self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
100
+
101
+ if num_embeddings is not None:
102
+ self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
103
+ torch.nn.init.zeros_(self.emb.weight)
104
+
105
+ self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
106
+
107
+ def spectrogram(self, x):
108
+ # Remove DC offset
109
+ x = x - x.mean(dim=-1, keepdims=True)
110
+ # Peak normalize the volume of input audio
111
+ x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
112
+ x = self.spec_fn(x)
113
+ x = torch.view_as_real(x)
114
+ x = rearrange(x, "b f t c -> b c t f")
115
+ # Split into bands
116
+ x_bands = [x[..., b[0]: b[1]] for b in self.bands]
117
+ return x_bands
118
+
119
+ def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
120
+ x_bands = self.spectrogram(x)
121
+ fmap = []
122
+ x = []
123
+ for band, stack in zip(x_bands, self.band_convs):
124
+ for i, layer in enumerate(stack):
125
+ band = layer(band)
126
+ band = torch.nn.functional.leaky_relu(band, 0.1)
127
+ if i > 0:
128
+ fmap.append(band)
129
+ x.append(band)
130
+ x = torch.cat(x, dim=-1)
131
+ if cond_embedding_id is not None:
132
+ emb = self.emb(cond_embedding_id)
133
+ h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
134
+ else:
135
+ h = 0
136
+ x = self.conv_post(x)
137
+ fmap.append(x)
138
+ x += h
139
+
140
+ return x, fmap
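A quick sketch of how the fractional band boundaries above map onto FFT bins; for window_length = 2048 there are 1025 bins, so for example the (0.0, 0.1) band covers bins 0–102:

window_length = 2048
n_fft_bins = window_length // 2 + 1         # 1025 frequency bins
bands = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0))
bin_ranges = [(int(lo * n_fft_bins), int(hi * n_fft_bins)) for lo, hi in bands]
print(bin_ranges)   # [(0, 102), (102, 256), (256, 512), (512, 768), (768, 1025)]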
tts/cosyvoice/hifigan/f0_predictor.py ADDED
@@ -0,0 +1,56 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ # from torch.nn.utils import weight_norm
17
+ from torch.nn.utils.parametrizations import weight_norm
18
+
19
+
20
+ class ConvRNNF0Predictor(nn.Module):
21
+ def __init__(self,
22
+ num_class: int = 1,
23
+ in_channels: int = 80,
24
+ cond_channels: int = 512
25
+ ):
26
+ super().__init__()
27
+
28
+ self.num_class = num_class
29
+ self.condnet = nn.Sequential(
30
+ weight_norm(
31
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
32
+ ),
33
+ nn.ELU(),
34
+ weight_norm(
35
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
36
+ ),
37
+ nn.ELU(),
38
+ weight_norm(
39
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
40
+ ),
41
+ nn.ELU(),
42
+ weight_norm(
43
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
44
+ ),
45
+ nn.ELU(),
46
+ weight_norm(
47
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
48
+ ),
49
+ nn.ELU(),
50
+ )
51
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
52
+
53
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
54
+ x = self.condnet(x)
55
+ x = x.transpose(1, 2)
56
+ return torch.abs(self.classifier(x).squeeze(-1))
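A hedged usage sketch of the predictor above (assuming ConvRNNF0Predictor from this file is in scope): it consumes a mel spectrogram of shape (batch, in_channels, frames) and returns a non-negative per-frame F0 contour of shape (batch, frames).

import torch

predictor = ConvRNNF0Predictor(in_channels=80, cond_channels=512)
mel = torch.randn(2, 80, 120)      # (batch, n_mels, frames), toy input
f0 = predictor(mel)                # (batch, frames), values >= 0 thanks to torch.abs
print(f0.shape)                    # torch.Size([2, 120])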
tts/cosyvoice/hifigan/generator.py ADDED
@@ -0,0 +1,412 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HIFI-GAN"""
16
+
17
+ from typing import Dict, Optional, List
18
+ import numpy as np
19
+ from scipy.signal import get_window
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from torch.nn import Conv1d
24
+ from torch.nn import ConvTranspose1d
25
+ from torch.nn.utils import remove_weight_norm
26
+ # from torch.nn.utils import weight_norm
27
+ from torch.nn.utils.parametrizations import weight_norm
28
+ from torch.distributions.uniform import Uniform
29
+
30
+ from cosyvoice.transformer.activation import Snake
31
+ from cosyvoice.utils.common import get_padding
32
+ from cosyvoice.utils.common import init_weights
33
+
34
+
35
+ """hifigan based generator implementation.
36
+
37
+ This code is modified from https://github.com/jik876/hifi-gan
38
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
39
+ https://github.com/NVIDIA/BigVGAN
40
+
41
+ """
42
+
43
+
44
+ class ResBlock(torch.nn.Module):
45
+ """Residual block module in HiFiGAN/BigVGAN."""
46
+ def __init__(
47
+ self,
48
+ channels: int = 512,
49
+ kernel_size: int = 3,
50
+ dilations: List[int] = [1, 3, 5],
51
+ ):
52
+ super(ResBlock, self).__init__()
53
+ self.convs1 = nn.ModuleList()
54
+ self.convs2 = nn.ModuleList()
55
+
56
+ for dilation in dilations:
57
+ self.convs1.append(
58
+ weight_norm(
59
+ Conv1d(
60
+ channels,
61
+ channels,
62
+ kernel_size,
63
+ 1,
64
+ dilation=dilation,
65
+ padding=get_padding(kernel_size, dilation)
66
+ )
67
+ )
68
+ )
69
+ self.convs2.append(
70
+ weight_norm(
71
+ Conv1d(
72
+ channels,
73
+ channels,
74
+ kernel_size,
75
+ 1,
76
+ dilation=1,
77
+ padding=get_padding(kernel_size, 1)
78
+ )
79
+ )
80
+ )
81
+ self.convs1.apply(init_weights)
82
+ self.convs2.apply(init_weights)
83
+ self.activations1 = nn.ModuleList([
84
+ Snake(channels, alpha_logscale=False)
85
+ for _ in range(len(self.convs1))
86
+ ])
87
+ self.activations2 = nn.ModuleList([
88
+ Snake(channels, alpha_logscale=False)
89
+ for _ in range(len(self.convs2))
90
+ ])
91
+
92
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
93
+ for idx in range(len(self.convs1)):
94
+ xt = self.activations1[idx](x)
95
+ xt = self.convs1[idx](xt)
96
+ xt = self.activations2[idx](xt)
97
+ xt = self.convs2[idx](xt)
98
+ x = xt + x
99
+ return x
100
+
101
+ def remove_weight_norm(self):
102
+ for idx in range(len(self.convs1)):
103
+ remove_weight_norm(self.convs1[idx])
104
+ remove_weight_norm(self.convs2[idx])
105
+
106
+
107
+ class SineGen(torch.nn.Module):
108
+ """ Definition of sine generator
109
+ SineGen(samp_rate, harmonic_num = 0,
110
+ sine_amp = 0.1, noise_std = 0.003,
111
+ voiced_threshold = 0,
112
+ flag_for_pulse=False)
113
+ samp_rate: sampling rate in Hz
114
+ harmonic_num: number of harmonic overtones (default 0)
115
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
116
+ noise_std: std of Gaussian noise (default 0.003)
117
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
118
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
119
+ Note: when flag_for_pulse is True, the first time step of a voiced
120
+ segment is always sin(np.pi) or cos(0)
121
+ """
122
+
123
+ def __init__(self, samp_rate, harmonic_num=0,
124
+ sine_amp=0.1, noise_std=0.003,
125
+ voiced_threshold=0):
126
+ super(SineGen, self).__init__()
127
+ self.sine_amp = sine_amp
128
+ self.noise_std = noise_std
129
+ self.harmonic_num = harmonic_num
130
+ self.sampling_rate = samp_rate
131
+ self.voiced_threshold = voiced_threshold
132
+
133
+ def _f02uv(self, f0):
134
+ # generate uv signal
135
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
136
+ return uv
137
+
138
+ @torch.no_grad()
139
+ def forward(self, f0):
140
+ """
141
+ :param f0: [B, 1, sample_len], Hz
142
+ :return: [B, 1, sample_len]
143
+ """
144
+
145
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
146
+ for i in range(self.harmonic_num + 1):
147
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
148
+
149
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
150
+ u_dist = Uniform(low=-np.pi, high=np.pi)
151
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
152
+ phase_vec[:, 0, :] = 0
153
+
154
+ # generate sine waveforms
155
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
156
+
157
+ # generate uv signal
158
+ uv = self._f02uv(f0)
159
+
160
+ # noise: for unvoiced should be similar to sine_amp
161
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
162
+ # . for voiced regions is self.noise_std
163
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
164
+ noise = noise_amp * torch.randn_like(sine_waves)
165
+
166
+ # first: set the unvoiced part to 0 by uv
167
+ # then: additive noise
168
+ sine_waves = sine_waves * uv + noise
169
+ return sine_waves, uv, noise
170
+
171
+
172
+ class SourceModuleHnNSF(torch.nn.Module):
173
+ """ SourceModule for hn-nsf
174
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
175
+ add_noise_std=0.003, voiced_threshod=0)
176
+ sampling_rate: sampling_rate in Hz
177
+ harmonic_num: number of harmonic above F0 (default: 0)
178
+ sine_amp: amplitude of sine source signal (default: 0.1)
179
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
180
+ note that amplitude of noise in unvoiced is decided
181
+ by sine_amp
182
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
183
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
184
+ F0_sampled (batchsize, length, 1)
185
+ Sine_source (batchsize, length, 1)
186
+ noise_source (batchsize, length 1)
187
+ uv (batchsize, length, 1)
188
+ """
189
+
190
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
191
+ add_noise_std=0.003, voiced_threshod=0):
192
+ super(SourceModuleHnNSF, self).__init__()
193
+
194
+ self.sine_amp = sine_amp
195
+ self.noise_std = add_noise_std
196
+
197
+ # to produce sine waveforms
198
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
199
+ sine_amp, add_noise_std, voiced_threshod)
200
+
201
+ # to merge source harmonics into a single excitation
202
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
203
+ self.l_tanh = torch.nn.Tanh()
204
+
205
+ def forward(self, x):
206
+ """
207
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
208
+ F0_sampled (batchsize, length, 1)
209
+ Sine_source (batchsize, length, 1)
210
+ noise_source (batchsize, length 1)
211
+ """
212
+ # source for harmonic branch
213
+ with torch.no_grad():
214
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
215
+ sine_wavs = sine_wavs.transpose(1, 2)
216
+ uv = uv.transpose(1, 2)
217
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
218
+
219
+ # source for noise branch, in the same shape as uv
220
+ noise = torch.randn_like(uv) * self.sine_amp / 3
221
+ return sine_merge, noise, uv
222
+
223
+
224
+ class HiFTGenerator(nn.Module):
225
+ """
226
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
227
+ https://arxiv.org/abs/2309.09493
228
+ """
229
+ def __init__(
230
+ self,
231
+ in_channels: int = 80,
232
+ base_channels: int = 512,
233
+ nb_harmonics: int = 8,
234
+ sampling_rate: int = 22050,
235
+ nsf_alpha: float = 0.1,
236
+ nsf_sigma: float = 0.003,
237
+ nsf_voiced_threshold: float = 10,
238
+ upsample_rates: List[int] = [8, 8],
239
+ upsample_kernel_sizes: List[int] = [16, 16],
240
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
241
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
242
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
243
+ source_resblock_kernel_sizes: List[int] = [7, 11],
244
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
245
+ lrelu_slope: float = 0.1,
246
+ audio_limit: float = 0.99,
247
+ f0_predictor: torch.nn.Module = None,
248
+ ):
249
+ super(HiFTGenerator, self).__init__()
250
+
251
+ self.out_channels = 1
252
+ self.nb_harmonics = nb_harmonics
253
+ self.sampling_rate = sampling_rate
254
+ self.istft_params = istft_params
255
+ self.lrelu_slope = lrelu_slope
256
+ self.audio_limit = audio_limit
257
+
258
+ self.num_kernels = len(resblock_kernel_sizes)
259
+ self.num_upsamples = len(upsample_rates)
260
+ self.m_source = SourceModuleHnNSF(
261
+ sampling_rate=sampling_rate,
262
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
263
+ harmonic_num=nb_harmonics,
264
+ sine_amp=nsf_alpha,
265
+ add_noise_std=nsf_sigma,
266
+ voiced_threshod=nsf_voiced_threshold)
267
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
268
+
269
+ self.conv_pre = weight_norm(
270
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
271
+ )
272
+
273
+ # Up
274
+ self.ups = nn.ModuleList()
275
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
276
+ self.ups.append(
277
+ weight_norm(
278
+ ConvTranspose1d(
279
+ base_channels // (2**i),
280
+ base_channels // (2**(i + 1)),
281
+ k,
282
+ u,
283
+ padding=(k - u) // 2,
284
+ )
285
+ )
286
+ )
287
+
288
+ # Down
289
+ self.source_downs = nn.ModuleList()
290
+ self.source_resblocks = nn.ModuleList()
291
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
292
+ downsample_cum_rates = np.cumprod(downsample_rates)
293
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
294
+ if u == 1:
295
+ self.source_downs.append(
296
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
297
+ )
298
+ else:
299
+ self.source_downs.append(
300
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
301
+ )
302
+
303
+ self.source_resblocks.append(
304
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
305
+ )
306
+
307
+ self.resblocks = nn.ModuleList()
308
+ for i in range(len(self.ups)):
309
+ ch = base_channels // (2**(i + 1))
310
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
311
+ self.resblocks.append(ResBlock(ch, k, d))
312
+
313
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
314
+ self.ups.apply(init_weights)
315
+ self.conv_post.apply(init_weights)
316
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
317
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
318
+ self.f0_predictor = f0_predictor
319
+
320
+ def remove_weight_norm(self):
321
+ print('Removing weight norm...')
322
+ for l in self.ups:
323
+ remove_weight_norm(l)
324
+ for l in self.resblocks:
325
+ l.remove_weight_norm()
326
+ remove_weight_norm(self.conv_pre)
327
+ remove_weight_norm(self.conv_post)
328
+ self.m_source.remove_weight_norm()
329
+ for l in self.source_downs:
330
+ remove_weight_norm(l)
331
+ for l in self.source_resblocks:
332
+ l.remove_weight_norm()
333
+
334
+ def _stft(self, x):
335
+ spec = torch.stft(
336
+ x,
337
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
338
+ return_complex=True)
339
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
340
+ return spec[..., 0], spec[..., 1]
341
+
342
+ def _istft(self, magnitude, phase):
343
+ magnitude = torch.clip(magnitude, max=1e2)
344
+ real = magnitude * torch.cos(phase)
345
+ img = magnitude * torch.sin(phase)
346
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
347
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
348
+ return inverse_transform
349
+
350
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
351
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
352
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
353
+
354
+ x = self.conv_pre(x)
355
+ for i in range(self.num_upsamples):
356
+ x = F.leaky_relu(x, self.lrelu_slope)
357
+ x = self.ups[i](x)
358
+
359
+ if i == self.num_upsamples - 1:
360
+ x = self.reflection_pad(x)
361
+
362
+ # fusion
363
+ si = self.source_downs[i](s_stft)
364
+ si = self.source_resblocks[i](si)
365
+ x = x + si
366
+
367
+ xs = None
368
+ for j in range(self.num_kernels):
369
+ if xs is None:
370
+ xs = self.resblocks[i * self.num_kernels + j](x)
371
+ else:
372
+ xs += self.resblocks[i * self.num_kernels + j](x)
373
+ x = xs / self.num_kernels
374
+
375
+ x = F.leaky_relu(x)
376
+ x = self.conv_post(x)
377
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
378
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy
379
+
380
+ x = self._istft(magnitude, phase)
381
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
382
+ return x
383
+
384
+ def forward(
385
+ self,
386
+ batch: dict,
387
+ device: torch.device,
388
+ ) -> Dict[str, Optional[torch.Tensor]]:
389
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
390
+ # mel->f0
391
+ f0 = self.f0_predictor(speech_feat)
392
+ # f0->source
393
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
394
+ s, _, _ = self.m_source(s)
395
+ s = s.transpose(1, 2)
396
+ # mel+source->speech
397
+ generated_speech = self.decode(x=speech_feat, s=s)
398
+ return generated_speech, f0
399
+
400
+ @torch.inference_mode()
401
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
402
+ # mel->f0
403
+ f0 = self.f0_predictor(speech_feat)
404
+ # f0->source
405
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
406
+ s, _, _ = self.m_source(s)
407
+ s = s.transpose(1, 2)
408
+ # use cache_source to avoid glitch
409
+ if cache_source.shape[2] != 0:
410
+ s[:, :, :cache_source.shape[2]] = cache_source
411
+ generated_speech = self.decode(x=speech_feat, s=s)
412
+ return generated_speech, s
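A self-contained sketch of the magnitude/phase to waveform step used by _istft above, with the default istft_params (n_fft=16, hop_len=4); the spectrogram values are random and purely illustrative.

import numpy as np
import torch
from scipy.signal import get_window

n_fft, hop_len = 16, 4
window = torch.from_numpy(get_window("hann", n_fft, fftbins=True).astype(np.float32))

frames = 100
magnitude = torch.rand(1, n_fft // 2 + 1, frames)            # (B, n_fft//2 + 1, T)
phase = torch.rand(1, n_fft // 2 + 1, frames) * 2 * np.pi

real = magnitude * torch.cos(phase)
imag = magnitude * torch.sin(phase)
audio = torch.istft(torch.complex(real, imag), n_fft, hop_len, n_fft, window=window)
print(audio.shape)                                           # torch.Size([1, 396]) with center=True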
tts/cosyvoice/hifigan/hifigan.py ADDED
@@ -0,0 +1,67 @@
1
+ from typing import Dict, Optional
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
6
+ from cosyvoice.utils.losses import tpr_loss, mel_loss
7
+
8
+
9
+ class HiFiGan(nn.Module):
10
+ def __init__(self, generator, discriminator, mel_spec_transform,
11
+ multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
12
+ tpr_loss_weight=1.0, tpr_loss_tau=0.04):
13
+ super(HiFiGan, self).__init__()
14
+ self.generator = generator
15
+ self.discriminator = discriminator
16
+ self.mel_spec_transform = mel_spec_transform
17
+ self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
18
+ self.feat_match_loss_weight = feat_match_loss_weight
19
+ self.tpr_loss_weight = tpr_loss_weight
20
+ self.tpr_loss_tau = tpr_loss_tau
21
+
22
+ def forward(
23
+ self,
24
+ batch: dict,
25
+ device: torch.device,
26
+ ) -> Dict[str, Optional[torch.Tensor]]:
27
+ if batch['turn'] == 'generator':
28
+ return self.forward_generator(batch, device)
29
+ else:
30
+ return self.forward_discriminator(batch, device)
31
+
32
+ def forward_generator(self, batch, device):
33
+ real_speech = batch['speech'].to(device)
34
+ pitch_feat = batch['pitch_feat'].to(device)
35
+ # 1. calculate generator outputs
36
+ generated_speech, generated_f0 = self.generator(batch, device)
37
+ # 2. calculate discriminator outputs
38
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
39
+ # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
40
+ loss_gen, _ = generator_loss(y_d_gs)
41
+ loss_fm = feature_loss(fmap_rs, fmap_gs)
42
+ loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
43
+ if self.tpr_loss_weight != 0:
44
+ loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
45
+ else:
46
+ loss_tpr = torch.zeros(1).to(device)
47
+ loss_f0 = F.l1_loss(generated_f0, pitch_feat)
48
+ loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
49
+ self.multi_mel_spectral_recon_loss_weight * loss_mel + \
50
+ self.tpr_loss_weight * loss_tpr + loss_f0
51
+ return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
52
+
53
+ def forward_discriminator(self, batch, device):
54
+ real_speech = batch['speech'].to(device)
55
+ # 1. calculate generator outputs
56
+ with torch.no_grad():
57
+ generated_speech, generated_f0 = self.generator(batch, device)
58
+ # 2. calculate discriminator outputs
59
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
60
+ # 3. calculate discriminator losses, tpr losses [Optional]
61
+ loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
62
+ if self.tpr_loss_weight != 0:
63
+ loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
64
+ else:
65
+ loss_tpr = torch.zeros(1).to(device)
66
+ loss = loss_disc + self.tpr_loss_weight * loss_tpr
67
+ return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
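As a non-authoritative sketch of how a training loop might drive the two branches above by toggling batch['turn'] (the optimizer objects and batch contents are assumptions, not defined in this file):

def train_step(hifigan, batch, device, optim_g, optim_d):
    # discriminator turn: update the discriminator on real vs. generated speech
    batch['turn'] = 'discriminator'
    d_out = hifigan(batch, device)
    optim_d.zero_grad()
    d_out['loss'].backward()
    optim_d.step()

    # generator turn: update the generator with adversarial + feature + mel losses
    batch['turn'] = 'generator'
    g_out = hifigan(batch, device)
    optim_g.zero_grad()
    g_out['loss'].backward()
    optim_g.step()
    return {'disc': d_out['loss'].item(), 'gen': g_out['loss'].item()}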
tts/cosyvoice/llm/llm.py ADDED
@@ -0,0 +1,434 @@
1
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Callable, List, Generator
15
+ import torch
16
+ from torch import nn
17
+ import torch.nn.functional as F
18
+ from transformers import Qwen2ForCausalLM
19
+ from torch.nn.utils.rnn import pad_sequence, unpad_sequence
20
+ from cosyvoice.utils.common import IGNORE_ID
21
+ from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
22
+ from cosyvoice.utils.common import th_accuracy
23
+ from cosyvoice.utils.file_utils import logging
24
+
25
+
26
+ class TransformerLM(torch.nn.Module):
27
+ def __init__(
28
+ self,
29
+ text_encoder_input_size: int,
30
+ llm_input_size: int,
31
+ llm_output_size: int,
32
+ text_token_size: int,
33
+ speech_token_size: int,
34
+ text_encoder: torch.nn.Module,
35
+ llm: torch.nn.Module,
36
+ sampling: Callable,
37
+ length_normalized_loss: bool = True,
38
+ lsm_weight: float = 0.0,
39
+ spk_embed_dim: int = 192,
40
+ ):
41
+ super().__init__()
42
+ self.llm_input_size = llm_input_size
43
+ self.speech_token_size = speech_token_size
44
+ # 1. build text token inputs related modules
45
+ self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
46
+ self.text_encoder = text_encoder
47
+ self.text_encoder_affine_layer = nn.Linear(
48
+ self.text_encoder.output_size(),
49
+ llm_input_size
50
+ )
51
+
52
+ # 2. build speech token language model related modules
53
+ self.sos_eos = 0
54
+ self.task_id = 1
55
+ self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
56
+ self.llm = llm
57
+ self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1)
58
+ self.criterion_ce = LabelSmoothingLoss(
59
+ size=speech_token_size + 1,
60
+ padding_idx=IGNORE_ID,
61
+ smoothing=lsm_weight,
62
+ normalize_length=length_normalized_loss,
63
+ )
64
+
65
+ # 3. [Optional] build speech token related modules
66
+ self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
67
+ self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)
68
+
69
+ # 4. sampling method
70
+ self.sampling = sampling
71
+
72
+ def encode(
73
+ self,
74
+ text: torch.Tensor,
75
+ text_lengths: torch.Tensor,
76
+ ):
77
+ encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
78
+ encoder_out_lens = encoder_mask.squeeze(1).sum(1)
79
+ encoder_out = self.text_encoder_affine_layer(encoder_out)
80
+ return encoder_out, encoder_out_lens
81
+
82
+ def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len):
83
+ text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
84
+ speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
85
+ lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0)
86
+ for i in range(len(text_token))]
87
+ lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
88
+ lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
89
+ return lm_input, lm_input_len
90
+
91
+ def forward(
92
+ self,
93
+ batch: dict,
94
+ device: torch.device,
95
+ ) -> Dict[str, Optional[torch.Tensor]]:
96
+ """
97
+ Args:
98
+ text: (B, L, D)
99
+ text_lengths: (B,)
100
+ audio: (B, T, N) or (B, T)
101
+ audio_lengths: (B,)
102
+ """
103
+ text_token = batch['text_token'].to(device)
104
+ text_token_len = batch['text_token_len'].to(device)
105
+ speech_token = batch['speech_token'].to(device)
106
+ speech_token_len = batch['speech_token_len'].to(device)
107
+ embedding = batch['embedding'].to(device)
108
+
109
+ # 1. prepare llm_target
110
+ lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() +
111
+ [self.speech_token_size]) for i in range(text_token.size(0))]
112
+ lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
113
+
114
+ # 1. encode text_token
115
+ text_token = self.text_embedding(text_token)
116
+ text_token, text_token_len = self.encode(text_token, text_token_len)
117
+
118
+ # 2. embedding projection
119
+ embedding = F.normalize(embedding, dim=1)
120
+ embedding = self.spk_embed_affine_layer(embedding)
121
+ embedding = embedding.unsqueeze(1)
122
+
123
+ # 3. eos and task_id
124
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
125
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
126
+
127
+ # 4. encode speech_token
128
+ speech_token = self.speech_embedding(speech_token)
129
+
130
+ # 5. unpad and pad
131
+ lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len,
132
+ task_id_emb, speech_token, speech_token_len)
133
+
134
+ # 6. run lm forward
135
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
136
+ logits = self.llm_decoder(lm_output)
137
+ loss = self.criterion_ce(logits, lm_target)
138
+ acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID)
139
+ return {'loss': loss, 'acc': acc}
140
+
141
+ def sampling_ids(
142
+ self,
143
+ weighted_scores: torch.Tensor,
144
+ decoded_tokens: List,
145
+ sampling: int,
146
+ ignore_eos: bool = True,
147
+ ):
148
+ num_trials, max_trials = 0, 100
149
+ while True:
150
+ top_ids = self.sampling(weighted_scores, decoded_tokens, sampling)
151
+ if (not ignore_eos) or (self.speech_token_size not in top_ids):
152
+ break
153
+ num_trials += 1
154
+ if num_trials > max_trials:
155
+ raise RuntimeError('sampling reaches max_trials {} and still get eos when ignore_eos is True, check your input!'.format(max_trials))
156
+ return top_ids
157
+
158
+ @torch.inference_mode()
159
+ def inference(
160
+ self,
161
+ text: torch.Tensor,
162
+ text_len: torch.Tensor,
163
+ prompt_text: torch.Tensor,
164
+ prompt_text_len: torch.Tensor,
165
+ prompt_speech_token: torch.Tensor,
166
+ prompt_speech_token_len: torch.Tensor,
167
+ embedding: torch.Tensor,
168
+ sampling: int = 25,
169
+ max_token_text_ratio: float = 20,
170
+ min_token_text_ratio: float = 2,
171
+ ) -> Generator[torch.Tensor, None, None]:
172
+ if self.fp16 is True:
173
+ embedding = embedding.half()
174
+
175
+ device = text.device
176
+ text = torch.concat([prompt_text, text], dim=1)
177
+ text_len += prompt_text_len
178
+ text = self.text_embedding(text)
179
+
180
+ # 1. encode text
181
+ text, text_len = self.encode(text, text_len)
182
+
183
+ # 2. encode embedding
184
+ if embedding.shape[0] != 0:
185
+ embedding = F.normalize(embedding, dim=1)
186
+ embedding = self.spk_embed_affine_layer(embedding)
187
+ embedding = embedding.unsqueeze(dim=1)
188
+ else:
189
+ embedding = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device).to(text.dtype)
190
+
191
+ # 3. concat llm_input
192
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
193
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
194
+ if prompt_speech_token_len != 0:
195
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
196
+ else:
197
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
198
+ lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)
199
+
200
+ # 4. cal min/max_length
201
+ min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
202
+ max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
203
+
204
+ # 5. step by step decode
205
+ out_tokens = []
206
+ offset = 0
207
+ att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
208
+ for i in range(max_len):
209
+ y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=offset, required_cache_size=-1,
210
+ att_cache=att_cache, cnn_cache=cnn_cache,
211
+ att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]),
212
+ device=lm_input.device)).to(torch.bool))
213
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
214
+ # force continue decode first token
215
+ if i == 0:
216
+ logp[:, self.speech_token_size] = -float('inf')
217
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
218
+ if top_ids == self.speech_token_size:
219
+ break
220
+ # in stream mode, yield token one by one
221
+ yield top_ids
222
+ out_tokens.append(top_ids)
223
+ offset += lm_input.size(1)
224
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
225
+
226
+
227
+ class Qwen2Encoder(torch.nn.Module):
228
+ def __init__(self, pretrain_path):
229
+ super().__init__()
230
+ self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path)
231
+
232
+ def forward_one_step(self, xs, masks, cache=None):
233
+ input_masks = masks[:, -1, :]
234
+ outs = self.model(
235
+ inputs_embeds=xs,
236
+ attention_mask=input_masks,
237
+ output_hidden_states=True,
238
+ return_dict=True,
239
+ use_cache=True,
240
+ past_key_values=cache,
241
+ )
242
+ xs = outs.hidden_states[-1]
243
+ new_cache = outs.past_key_values
244
+ return xs, new_cache
245
+
246
+
247
+ class Qwen2LM(TransformerLM):
248
+ def __init__(
249
+ self,
250
+ llm_input_size: int,
251
+ llm_output_size: int,
252
+ speech_token_size: int,
253
+ llm: torch.nn.Module,
254
+ sampling: Callable,
255
+ length_normalized_loss: bool = True,
256
+ lsm_weight: float = 0.0,
257
+ mix_ratio: List[int] = [5, 15],
258
+ ):
259
+ torch.nn.Module.__init__(self)
260
+ self.llm_input_size = llm_input_size
261
+ self.llm_output_size = llm_output_size
262
+ self.speech_token_size = speech_token_size
263
+
264
+ # 2. build speech token language model related modules
265
+ self.sos_eos = 0
266
+ self.task_id = 1
267
+ self.fill_token = 2
268
+
269
+ self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
270
+ self.llm = llm
271
+ self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 3)
272
+ self.criterion_ce = LabelSmoothingLoss(
273
+ size=speech_token_size + 3,
274
+ padding_idx=IGNORE_ID,
275
+ smoothing=lsm_weight,
276
+ normalize_length=length_normalized_loss,
277
+ )
278
+
279
+ # 3. [Optional] build speech token related modules
280
+ self.speech_embedding = torch.nn.Embedding(speech_token_size + 3, llm_input_size)
281
+
282
+ # 4. sampling method
283
+ self.sampling = sampling
284
+ self.mix_ratio = mix_ratio
285
+
286
+ @torch.inference_mode()
287
+ def inference(
288
+ self,
289
+ text: torch.Tensor,
290
+ text_len: torch.Tensor,
291
+ prompt_text: torch.Tensor,
292
+ prompt_text_len: torch.Tensor,
293
+ prompt_speech_token: torch.Tensor,
294
+ prompt_speech_token_len: torch.Tensor,
295
+ embedding: torch.Tensor,
296
+ sampling: int = 25,
297
+ max_token_text_ratio: float = 20,
298
+ min_token_text_ratio: float = 2,
299
+ ) -> Generator[torch.Tensor, None, None]:
300
+ device = text.device
301
+ text = torch.concat([prompt_text, text], dim=1)
302
+ text_len += prompt_text_len
303
+ text = self.llm.model.model.embed_tokens(text)
304
+
305
+ # 3. concat llm_input
306
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
307
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
308
+ if prompt_speech_token_len != 0:
309
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
310
+ else:
311
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
312
+ lm_input = torch.concat([sos_eos_emb, text, task_id_emb, prompt_speech_token_emb], dim=1)
313
+
314
+ # 4. cal min/max_length
315
+ min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
316
+ max_len = int((text_len - prompt_text_len) * max_token_text_ratio)
317
+
318
+ # 5. step by step decode
319
+ out_tokens = []
320
+ cache = None
321
+ for i in range(max_len):
322
+ y_pred, cache = self.llm.forward_one_step(lm_input,
323
+ masks=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool),
324
+ cache=cache)
325
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
326
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True if i < min_len else False).item()
327
+ if top_ids == self.speech_token_size:
328
+ break
329
+ if top_ids > self.speech_token_size:
330
+ continue
331
+ # in stream mode, yield token one by one
332
+ yield top_ids
333
+ out_tokens.append(top_ids)
334
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
335
+
336
+ @torch.inference_mode()
337
+ def inference_bistream(
338
+ self,
339
+ text: Generator,
340
+ prompt_text: torch.Tensor,
341
+ prompt_text_len: torch.Tensor,
342
+ prompt_speech_token: torch.Tensor,
343
+ prompt_speech_token_len: torch.Tensor,
344
+ embedding: torch.Tensor,
345
+ sampling: int = 25,
346
+ max_token_text_ratio: float = 20,
347
+ min_token_text_ratio: float = 2,
348
+ ) -> Generator[torch.Tensor, None, None]:
349
+
350
+ device = prompt_text.device
351
+ # 1. prepare input
352
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
353
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
354
+ if prompt_speech_token_len != 0:
355
+ prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
356
+ else:
357
+ prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=prompt_text.dtype).to(device)
358
+ lm_input = torch.concat([sos_eos_emb], dim=1)
359
+
360
+ # 2. iterate text
361
+ out_tokens = []
362
+ cache = None
363
+ # NOTE init prompt_text as text_cache as it is basically impossible prompt_speech_token/prompt_text < 15/5
364
+ text_cache = self.llm.model.model.embed_tokens(prompt_text)
365
+ next_fill_index = -1
366
+ for this_text in text:
367
+ text_cache = torch.concat([text_cache, self.llm.model.model.embed_tokens(this_text)], dim=1)
368
+ # prompt_speech_token_emb not empty, try append to lm_input
369
+ while prompt_speech_token_emb.size(1) != 0:
370
+ if text_cache.size(1) >= self.mix_ratio[0]:
371
+ lm_input_text, lm_input_speech = text_cache[:, :self.mix_ratio[0]], prompt_speech_token_emb[:, :self.mix_ratio[1]]
372
+ logging.info('append {} text token {} speech token'.format(lm_input_text.size(1), lm_input_speech.size(1)))
373
+ lm_input = torch.concat([lm_input, lm_input_text, lm_input_speech], dim=1)
374
+ text_cache, prompt_speech_token_emb = text_cache[:, self.mix_ratio[0]:], prompt_speech_token_emb[:, self.mix_ratio[1]:]
375
+ else:
376
+ logging.info('not enough text token to decode, wait for more')
377
+ break
378
+ # no prompt_speech_token_emb remain, can decode some speech token
379
+ if prompt_speech_token_emb.size(1) == 0:
380
+ if (len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2) or (len(out_tokens) == 0 and lm_input.size(1) == 1):
381
+ logging.info('get fill token, need to append more text token')
382
+ if text_cache.size(1) >= self.mix_ratio[0]:
383
+ lm_input_text = text_cache[:, :self.mix_ratio[0]]
384
+ logging.info('append {} text token'.format(lm_input_text.size(1)))
385
+ if len(out_tokens) != 0 and out_tokens[-1] == self.speech_token_size + 2:
386
+ lm_input = lm_input_text
387
+ else:
388
+ lm_input = torch.concat([lm_input, lm_input_text], dim=1)
389
+ text_cache = text_cache[:, self.mix_ratio[0]:]
390
+ else:
391
+ logging.info('not enough text token to decode, wait for more')
392
+ continue
393
+ while True:
394
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
395
+ y_pred, cache = self.llm.forward_one_step(lm_input,
396
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
397
+ cache=cache)
398
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
399
+ if next_fill_index != -1 and len(out_tokens) == next_fill_index:
400
+ top_ids = self.speech_token_size + 2
401
+ next_fill_index += (self.mix_ratio[1] + 1)
402
+ else:
403
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=True).item()
404
+ if top_ids == self.speech_token_size + 2:
405
+ next_fill_index = len(out_tokens) + self.mix_ratio[1] + 1
406
+ logging.info('fill_token index {} next fill_token index {}'.format(len(out_tokens), next_fill_index))
407
+ out_tokens.append(top_ids)
408
+ if top_ids >= self.speech_token_size:
409
+ if top_ids == self.speech_token_size + 2:
410
+ break
411
+ else:
412
+ raise ValueError('should not get token {}'.format(top_ids))
413
+ yield top_ids
414
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
415
+
416
+ # 3. final decode
417
+ lm_input = torch.concat([lm_input, text_cache, task_id_emb], dim=1)
418
+ logging.info('no more text token, decode until met eos')
419
+ while True:
420
+ seq_len = lm_input.shape[1] if cache is None else lm_input.shape[1] + cache[0][0].size(2)
421
+ y_pred, cache = self.llm.forward_one_step(lm_input,
422
+ masks=torch.tril(torch.ones((1, seq_len, seq_len), device=lm_input.device)).to(torch.bool),
423
+ cache=cache)
424
+ logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
425
+ top_ids = self.sampling_ids(logp.squeeze(dim=0), out_tokens, sampling, ignore_eos=False).item()
426
+ out_tokens.append(top_ids)
427
+ if top_ids >= self.speech_token_size:
428
+ if top_ids == self.speech_token_size:
429
+ break
430
+ else:
431
+ raise ValueError('should not get token {}'.format(top_ids))
432
+ # in stream mode, yield token one by one
433
+ yield top_ids
434
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
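As a usage note, a minimal sketch of how the streaming `Qwen2LM.inference` generator above could be consumed; the model instance, tokenizer, and zero-length prompt tensors below are illustrative assumptions, not part of this commit:

    import torch

    def collect_speech_tokens(llm_model, text_ids: torch.Tensor) -> list:
        # llm_model is assumed to be an already-constructed, checkpoint-loaded Qwen2LM;
        # text_ids is a (1, T) LongTensor of text token ids from the matching text tokenizer.
        device = text_ids.device
        empty_ids = torch.zeros(1, 0, dtype=text_ids.dtype, device=device)
        speech_tokens = []
        for token_id in llm_model.inference(
                text=text_ids,
                text_len=torch.tensor([text_ids.size(1)], dtype=torch.int32, device=device),
                prompt_text=empty_ids,
                prompt_text_len=torch.tensor([0], dtype=torch.int32, device=device),
                prompt_speech_token=empty_ids,
                prompt_speech_token_len=torch.tensor([0], dtype=torch.int32, device=device),
                embedding=torch.zeros(0, 192, device=device),  # not used by Qwen2LM.inference
                sampling=25):
            # tokens are yielded one by one, so a streaming vocoder could start early
            speech_tokens.append(token_id)
        return speech_tokens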
tts/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
tts/cosyvoice/tokenizer/tokenizer.py ADDED
@@ -0,0 +1,279 @@
+ import base64
+ import os
+ from functools import lru_cache
+ from typing import Optional
+ import torch
+ from transformers import AutoTokenizer
+ from whisper.tokenizer import Tokenizer
+
+ import tiktoken
+
+ LANGUAGES = {
+     "en": "english",
+     "zh": "chinese",
+     "de": "german",
+     "es": "spanish",
+     "ru": "russian",
+     "ko": "korean",
+     "fr": "french",
+     "ja": "japanese",
+     "pt": "portuguese",
+     "tr": "turkish",
+     "pl": "polish",
+     "ca": "catalan",
+     "nl": "dutch",
+     "ar": "arabic",
+     "sv": "swedish",
+     "it": "italian",
+     "id": "indonesian",
+     "hi": "hindi",
+     "fi": "finnish",
+     "vi": "vietnamese",
+     "he": "hebrew",
+     "uk": "ukrainian",
+     "el": "greek",
+     "ms": "malay",
+     "cs": "czech",
+     "ro": "romanian",
+     "da": "danish",
+     "hu": "hungarian",
+     "ta": "tamil",
+     "no": "norwegian",
+     "th": "thai",
+     "ur": "urdu",
+     "hr": "croatian",
+     "bg": "bulgarian",
+     "lt": "lithuanian",
+     "la": "latin",
+     "mi": "maori",
+     "ml": "malayalam",
+     "cy": "welsh",
+     "sk": "slovak",
+     "te": "telugu",
+     "fa": "persian",
+     "lv": "latvian",
+     "bn": "bengali",
+     "sr": "serbian",
+     "az": "azerbaijani",
+     "sl": "slovenian",
+     "kn": "kannada",
+     "et": "estonian",
+     "mk": "macedonian",
+     "br": "breton",
+     "eu": "basque",
+     "is": "icelandic",
+     "hy": "armenian",
+     "ne": "nepali",
+     "mn": "mongolian",
+     "bs": "bosnian",
+     "kk": "kazakh",
+     "sq": "albanian",
+     "sw": "swahili",
+     "gl": "galician",
+     "mr": "marathi",
+     "pa": "punjabi",
+     "si": "sinhala",
+     "km": "khmer",
+     "sn": "shona",
+     "yo": "yoruba",
+     "so": "somali",
+     "af": "afrikaans",
+     "oc": "occitan",
+     "ka": "georgian",
+     "be": "belarusian",
+     "tg": "tajik",
+     "sd": "sindhi",
+     "gu": "gujarati",
+     "am": "amharic",
+     "yi": "yiddish",
+     "lo": "lao",
+     "uz": "uzbek",
+     "fo": "faroese",
+     "ht": "haitian creole",
+     "ps": "pashto",
+     "tk": "turkmen",
+     "nn": "nynorsk",
+     "mt": "maltese",
+     "sa": "sanskrit",
+     "lb": "luxembourgish",
+     "my": "myanmar",
+     "bo": "tibetan",
+     "tl": "tagalog",
+     "mg": "malagasy",
+     "as": "assamese",
+     "tt": "tatar",
+     "haw": "hawaiian",
+     "ln": "lingala",
+     "ha": "hausa",
+     "ba": "bashkir",
+     "jw": "javanese",
+     "su": "sundanese",
+     "yue": "cantonese",
+     "minnan": "minnan",
+     "wuyu": "wuyu",
+     "dialect": "dialect",
+     "zh/en": "zh/en",
+     "en/zh": "en/zh",
+ }
+
+ # language code lookup by name, with a few language aliases
+ TO_LANGUAGE_CODE = {
+     **{language: code for code, language in LANGUAGES.items()},
+     "burmese": "my",
+     "valencian": "ca",
+     "flemish": "nl",
+     "haitian": "ht",
+     "letzeburgesch": "lb",
+     "pushto": "ps",
+     "panjabi": "pa",
+     "moldavian": "ro",
+     "moldovan": "ro",
+     "sinhalese": "si",
+     "castilian": "es",
+     "mandarin": "zh",
+ }
+
+ AUDIO_EVENT = {
+     "ASR": "ASR",
+     "AED": "AED",
+     "SER": "SER",
+     "Speech": "Speech",
+     "/Speech": "/Speech",
+     "BGM": "BGM",
+     "/BGM": "/BGM",
+     "Laughter": "Laughter",
+     "/Laughter": "/Laughter",
+     "Applause": "Applause",
+     "/Applause": "/Applause",
+ }
+
+ EMOTION = {
+     "HAPPY": "HAPPY",
+     "SAD": "SAD",
+     "ANGRY": "ANGRY",
+     "NEUTRAL": "NEUTRAL",
+ }
+
+ TTS_Vocal_Token = {
+     "TTS/B": "TTS/B",
+     "TTS/O": "TTS/O",
+     "TTS/Q": "TTS/Q",
+     "TTS/A": "TTS/A",
+     "TTS/CO": "TTS/CO",
+     "TTS/CL": "TTS/CL",
+     "TTS/H": "TTS/H",
+     **{f"TTS/SP{i:02d}": f"TTS/SP{i:02d}" for i in range(1, 14)}
+ }
+
+
+ @lru_cache(maxsize=None)
+ def get_encoding(name: str = "gpt2", num_languages: int = 99):
+     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+     ranks = {
+         base64.b64decode(token): int(rank)
+         for token, rank in (line.split() for line in open(vocab_path) if line)
+     }
+     n_vocab = len(ranks)
+     special_tokens = {}
+
+     specials = [
+         "<|endoftext|>",
+         "<|startoftranscript|>",
+         *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
+         *[f"<|{audio_event}|>" for audio_event in list(AUDIO_EVENT.keys())],
+         *[f"<|{emotion}|>" for emotion in list(EMOTION.keys())],
+         "<|translate|>",
+         "<|transcribe|>",
+         "<|startoflm|>",
+         "<|startofprev|>",
+         "<|nospeech|>",
+         "<|notimestamps|>",
+         *[f"<|SPECIAL_TOKEN_{i}|>" for i in range(1, 31)],  # register special tokens for ASR
+         *[f"<|{tts}|>" for tts in list(TTS_Vocal_Token.keys())],  # register special tokens for TTS
+         *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+     ]
+
+     for token in specials:
+         special_tokens[token] = n_vocab
+         n_vocab += 1
+
+     return tiktoken.Encoding(
+         name=os.path.basename(vocab_path),
+         explicit_n_vocab=n_vocab,
+         pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+         mergeable_ranks=ranks,
+         special_tokens=special_tokens,
+     )
+
+
+ @lru_cache(maxsize=None)
+ def get_tokenizer(
+     multilingual: bool,
+     *,
+     num_languages: int = 99,
+     language: Optional[str] = None,
+     task: Optional[str] = None,  # Literal["transcribe", "translate", None]
+ ) -> Tokenizer:
+     if language is not None:
+         language = language.lower()
+         if language not in LANGUAGES:
+             if language in TO_LANGUAGE_CODE:
+                 language = TO_LANGUAGE_CODE[language]
+             else:
+                 raise ValueError(f"Unsupported language: {language}")
+
+     if multilingual:
+         encoding_name = "multilingual_zh_ja_yue_char_del"
+         language = language or "en"
+         task = task or "transcribe"
+     else:
+         encoding_name = "gpt2"
+         language = None
+         task = None
+
+     encoding = get_encoding(name=encoding_name, num_languages=num_languages)
+
+     return Tokenizer(
+         encoding=encoding, num_languages=num_languages, language=language, task=task
+     )
+
+
+ class QwenTokenizer():
+     def __init__(self, token_path, skip_special_tokens=True):
+         super().__init__()
+         # NOTE: non-chat model, all these special tokens keep randomly initialized.
+         special_tokens = {
+             'eos_token': '<|endoftext|>',
+             'pad_token': '<|endoftext|>',
+             'additional_special_tokens': [
+                 '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+                 '[breath]', '<strong>', '</strong>', '[noise]',
+                 '[laughter]', '[cough]', '[clucking]', '[accent]',
+                 '[quick_breath]',
+                 "<laughter>", "</laughter>",
+                 "[hissing]", "[sigh]", "[vocalized-noise]",
+                 "[lipsmack]", "[mn]"
+             ]
+         }
+         self.special_tokens = special_tokens
+         self.tokenizer = AutoTokenizer.from_pretrained(token_path)
+         self.tokenizer.add_special_tokens(special_tokens)
+         self.skip_special_tokens = skip_special_tokens
+
+     def encode(self, text, **kwargs):
+         tokens = self.tokenizer([text], return_tensors="pt")
+         tokens = tokens["input_ids"][0].cpu().tolist()
+         return tokens
+
+     def decode(self, tokens):
+         tokens = torch.tensor(tokens, dtype=torch.int64)
+         text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
+         return text
+
+
+ @lru_cache(maxsize=None)
+ def get_qwen_tokenizer(
+     token_path: str,
+     skip_special_tokens: bool
+ ) -> QwenTokenizer:
+     return QwenTokenizer(token_path=token_path, skip_special_tokens=skip_special_tokens)
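Finally, a small usage sketch for the tokenizer factories defined above; the Qwen tokenizer directory path is a placeholder, and the import assumes `tts/` is on `sys.path` so that the `cosyvoice` package resolves:

    from cosyvoice.tokenizer.tokenizer import get_encoding, get_qwen_tokenizer

    # Whisper-style tiktoken encoding built from the bundled vocab asset added above.
    encoding = get_encoding(name="multilingual_zh_ja_yue_char_del")
    print(encoding.n_vocab)  # base vocab plus the registered special tokens

    # Qwen text tokenizer with the extra paralinguistic tags registered as specials;
    # the path below is illustrative, not shipped with this commit.
    qwen_tokenizer = get_qwen_tokenizer(token_path="path/to/qwen_tokenizer_dir", skip_special_tokens=True)
    token_ids = qwen_tokenizer.encode("CosyVoice text to speech.")
    print(token_ids, qwen_tokenizer.decode(token_ids))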