Spaces:

Difficult-Burger
/

vevo-test

Build error

File size: 21,566 Bytes

import os
import sys
import json
import torch
import gradio as gr
import torchaudio
import numpy as np
from huggingface_hub import snapshot_download, hf_hub_download
import subprocess

# Clone the Amphion repository on first run and make it the working
# directory.  The relative paths used further down ("wav", "ckpts/Vevo",
# "models/vc/vevo/config") are all resolved against the repository root.
if not os.path.exists("Amphion"):
    # check=True: stop here with git's own error instead of failing later
    # with a confusing FileNotFoundError from os.chdir when the clone failed.
    subprocess.run(
        ["git", "clone", "https://github.com/open-mmlab/Amphion.git"],
        check=True,
    )
    os.chdir("Amphion")
elif not os.getcwd().endswith("Amphion"):
    os.chdir("Amphion")

# Put the repository root (now the working directory) on the import path so
# that the `models.*` packages can be imported.
_amphion_root = os.getcwd()
if _amphion_root not in sys.path:
    sys.path.append(_amphion_root)

# Make sure the directories used for audio files and checkpoints exist.
os.makedirs("wav", exist_ok=True)
os.makedirs("ckpts/Vevo", exist_ok=True)

from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav

# Download and install the Vevo configuration files.
def setup_configs():
    """Ensure the Vevo config files exist under ``models/vc/vevo/config``.

    Every file that is missing locally is downloaded from the ``config/``
    folder of the ``amphion/Vevo`` model repository on the Hugging Face Hub
    and copied into place.  Files that already exist are left untouched.
    A failure for one file is printed and does not stop the remaining files
    from being processed.

    Returns:
        None.
    """
    # Local import: the file's import block does not provide shutil.
    import shutil

    config_path = "models/vc/vevo/config"
    os.makedirs(config_path, exist_ok=True)

    config_files = [
        "PhoneToVq8192.json",
        "Vocoder.json",
        "Vq32ToVq8192.json",
        "Vq8192ToMels.json",
        "hubert_large_l18_c32.yaml",
    ]

    for filename in config_files:
        file_path = f"{config_path}/{filename}"
        if os.path.exists(file_path):
            continue
        try:
            file_data = hf_hub_download(
                repo_id="amphion/Vevo",
                filename=f"config/{filename}",
                repo_type="model",
            )
            # shutil instead of spawning `cp`: portable, and a failed copy
            # raises (and is reported below) instead of being ignored.
            shutil.copy(file_data, file_path)
        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(f"下载配置文件 {filename} 时出错: {e}")

# Make sure the configuration files are in place.
setup_configs()

# Device selection: use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# Cache of already-built inference pipelines, keyed by pipeline type.
inference_pipelines = dict()

def _download_vevo_checkpoint(subdir, filename=None):
    """Download one folder of the ``amphion/Vevo`` repo and return its local path.

    Args:
        subdir: Repo-relative folder to fetch, e.g. ``"tokenizer/vq8192"``.
        filename: Optional file name inside ``subdir``.  When given, the path
            of that file is returned instead of the folder path.

    Returns:
        The path of the folder (or file) inside the local snapshot.
    """
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=[f"{subdir}/*"],
    )
    relative_path = subdir if filename is None else f"{subdir}/{filename}"
    return os.path.join(local_dir, relative_path)


def get_pipeline(pipeline_type):
    """Return the inference pipeline for ``pipeline_type``, building it once.

    The required checkpoints are downloaded on first use and the resulting
    ``VevoInferencePipeline`` is stored in ``inference_pipelines`` so later
    calls with the same type return the same instance.

    Args:
        pipeline_type: One of ``"style"``, ``"voice"``, ``"timbre"`` or
            ``"tts"``.

    Returns:
        The ``VevoInferencePipeline`` for the requested type.

    Raises:
        ValueError: If ``pipeline_type`` is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    if pipeline_type not in ("style", "voice", "timbre", "tts"):
        raise ValueError(f"未知的管道类型: {pipeline_type!r}")

    if pipeline_type in inference_pipelines:
        return inference_pipelines[pipeline_type]

    config_dir = "./models/vc/vevo/config"
    pipeline_kwargs = {}

    # Content tokenizer: only the waveform-driven AR pipelines use it.
    if pipeline_type in ("style", "voice"):
        pipeline_kwargs["content_tokenizer_ckpt_path"] = _download_vevo_checkpoint(
            "tokenizer/vq32", "hubert_large_l18_c32.pkl"
        )

    # Content-style tokenizer: used by every pipeline type.
    pipeline_kwargs["content_style_tokenizer_ckpt_path"] = _download_vevo_checkpoint(
        "tokenizer/vq8192"
    )

    # Autoregressive transformer: not used by "timbre"; "tts" uses the
    # phone-input model, "style"/"voice" the Vq32-input model.
    if pipeline_type != "timbre":
        ar_model = "PhoneToVq8192" if pipeline_type == "tts" else "Vq32ToVq8192"
        pipeline_kwargs["ar_cfg_path"] = f"{config_dir}/{ar_model}.json"
        pipeline_kwargs["ar_ckpt_path"] = _download_vevo_checkpoint(
            f"contentstyle_modeling/{ar_model}"
        )

    # Flow-matching transformer.
    pipeline_kwargs["fmt_cfg_path"] = f"{config_dir}/Vq8192ToMels.json"
    pipeline_kwargs["fmt_ckpt_path"] = _download_vevo_checkpoint(
        "acoustic_modeling/Vq8192ToMels"
    )

    # Vocoder.
    pipeline_kwargs["vocoder_cfg_path"] = f"{config_dir}/Vocoder.json"
    pipeline_kwargs["vocoder_ckpt_path"] = _download_vevo_checkpoint(
        "acoustic_modeling/Vocoder"
    )

    inference_pipeline = VevoInferencePipeline(device=device, **pipeline_kwargs)

    # Cache the instance for subsequent calls.
    inference_pipelines[pipeline_type] = inference_pipeline
    return inference_pipeline

# Vevo task functions (style / timbre / voice / TTS) and their shared helper.
def _gradio_audio_to_tensor(audio, error_message):
    """Convert a Gradio numpy-audio value into a mono float waveform tensor.

    Args:
        audio: A 2-tuple holding a sample rate and a numpy array, in either
            order.  Arrays with more than one dimension are assumed to be
            laid out as (samples, channels), as the Gradio Audio docs
            describe for ``type="numpy"`` (confirm against the docs).
        error_message: Message of the ValueError raised when ``audio`` is not
            such a tuple or contains no samples.

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is a float32 tensor of
        shape (1, samples).

    Raises:
        ValueError: If ``audio`` is not a 2-tuple or the array is empty.
    """
    if not (isinstance(audio, tuple) and len(audio) == 2):
        raise ValueError(error_message)

    # Accept both (data, sample_rate) and (sample_rate, data).
    if isinstance(audio[0], np.ndarray):
        samples, sample_rate = audio
    else:
        sample_rate, samples = audio

    samples = np.asarray(samples)
    if samples.size == 0:
        raise ValueError(error_message)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be scaled into [-1, 1]; a float WAV written with
        # raw integer magnitudes would be far outside the valid range.
        info = np.iinfo(samples.dtype)
        if info.min < 0:
            samples = samples.astype(np.float32) / float(-info.min)
        else:
            midpoint = (info.max + 1) / 2
            samples = (samples.astype(np.float32) - midpoint) / midpoint
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Mix all channels down to mono so the saved tensor is (1, samples).
        samples = samples.reshape(samples.shape[0], -1).mean(axis=1)

    waveform = torch.from_numpy(np.ascontiguousarray(samples, dtype=np.float32))
    return waveform.unsqueeze(0), sample_rate


def vevo_style(content_wav, style_wav):
    """Run Vevo-Style: convert the style of the content audio, keep its timbre.

    Args:
        content_wav: Gradio numpy audio providing the content; it is also
            passed to the pipeline as the timbre reference.
        style_wav: Gradio numpy audio providing the target style.

    Returns:
        Path of the generated WAV file (``wav/output_vevostyle.wav``).

    Raises:
        ValueError: If an input is missing, malformed or empty.
    """
    import traceback

    temp_content_path = "wav/temp_content.wav"
    temp_style_path = "wav/temp_style.wav"
    output_path = "wav/output_vevostyle.wav"

    if content_wav is None or style_wav is None:
        raise ValueError("请上传音频文件")

    content_tensor, content_sr = _gradio_audio_to_tensor(content_wav, "内容音频格式不正确")
    style_tensor, style_sr = _gradio_audio_to_tensor(style_wav, "风格音频格式不正确")

    # Resample the content audio to 24 kHz.
    if content_sr != 24000:
        content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
        content_sr = 24000

    # Peak-normalise the content audio; the epsilon avoids division by zero
    # on silent input.
    content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95

    # Debug output.
    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
    print(f"风格音频形状: {style_tensor.shape}, 采样率: {style_sr}")

    # The pipeline reads its inputs from files.
    torchaudio.save(temp_content_path, content_tensor, content_sr)
    torchaudio.save(temp_style_path, style_tensor, style_sr)

    try:
        pipeline = get_pipeline("style")

        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=temp_content_path,
            src_text=None,
            style_ref_wav_path=temp_style_path,
            timbre_ref_wav_path=temp_content_path,
        )

        # Replace NaN/Inf samples so the output file stays playable.
        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
            print("警告：生成的音频包含NaN或Inf值")
            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)

        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")

        save_audio(gen_audio, output_path=output_path)

        return output_path
    except Exception as e:
        print(f"处理过程中出错: {e}")
        traceback.print_exc()
        # Bare raise keeps the original traceback intact.
        raise


def vevo_timbre(content_wav, reference_wav):
    """Run Vevo-Timbre: convert the timbre of the content audio, keep its style.

    Args:
        content_wav: Gradio numpy audio providing content and style.
        reference_wav: Gradio numpy audio providing the target timbre.

    Returns:
        Path of the generated WAV file (``wav/output_vevotimbre.wav``).

    Raises:
        ValueError: If an input is missing, malformed or empty.
    """
    temp_content_path = "wav/temp_content.wav"
    temp_reference_path = "wav/temp_reference.wav"
    output_path = "wav/output_vevotimbre.wav"

    if content_wav is None or reference_wav is None:
        raise ValueError("请上传音频文件")

    content_tensor, content_sr = _gradio_audio_to_tensor(content_wav, "内容音频格式不正确")
    reference_tensor, reference_sr = _gradio_audio_to_tensor(reference_wav, "参考音频格式不正确")

    # The pipeline reads its inputs from files.
    torchaudio.save(temp_content_path, content_tensor, content_sr)
    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)

    pipeline = get_pipeline("timbre")

    gen_audio = pipeline.inference_fm(
        src_wav_path=temp_content_path,
        timbre_ref_wav_path=temp_reference_path,
        flow_matching_steps=32,
    )

    save_audio(gen_audio, output_path=output_path)

    return output_path


def vevo_voice(content_wav, reference_wav):
    """Run Vevo-Voice: convert both style and timbre of the content audio.

    Args:
        content_wav: Gradio numpy audio providing the content.
        reference_wav: Gradio numpy audio used as both the style and the
            timbre reference.

    Returns:
        Path of the generated WAV file (``wav/output_vevovoice.wav``).

    Raises:
        ValueError: If an input is missing, malformed or empty.
    """
    temp_content_path = "wav/temp_content.wav"
    temp_reference_path = "wav/temp_reference.wav"
    output_path = "wav/output_vevovoice.wav"

    if content_wav is None or reference_wav is None:
        raise ValueError("请上传音频文件")

    content_tensor, content_sr = _gradio_audio_to_tensor(content_wav, "内容音频格式不正确")
    reference_tensor, reference_sr = _gradio_audio_to_tensor(reference_wav, "参考音频格式不正确")

    # The pipeline reads its inputs from files.
    torchaudio.save(temp_content_path, content_tensor, content_sr)
    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)

    pipeline = get_pipeline("voice")

    gen_audio = pipeline.inference_ar_and_fm(
        src_wav_path=temp_content_path,
        src_text=None,
        style_ref_wav_path=temp_reference_path,
        timbre_ref_wav_path=temp_reference_path,
    )

    save_audio(gen_audio, output_path=output_path)

    return output_path


def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
    """Run Vevo-TTS: synthesise ``text`` with a reference style and timbre.

    Args:
        text: Text to synthesise.
        ref_wav: Gradio numpy audio used as the style reference.
        timbre_ref_wav: Optional Gradio numpy audio used as the timbre
            reference; when omitted, ``ref_wav`` is used for timbre as well.
        src_language: Language code of ``text``.
        ref_language: Language code passed for the style reference.

    Returns:
        Path of the generated WAV file (``wav/output_vevotts.wav``).

    Raises:
        ValueError: If the reference audio is missing, the text is blank, or
            an audio input is malformed or empty.
    """
    temp_ref_path = "wav/temp_ref.wav"
    temp_timbre_path = "wav/temp_timbre.wav"
    output_path = "wav/output_vevotts.wav"

    if ref_wav is None:
        raise ValueError("请上传参考音频文件")

    # Reject blank text up front instead of handing it to the model.
    if text is None or not str(text).strip():
        raise ValueError("请输入要合成的文本")

    # Validate and convert every input before anything is written to disk.
    ref_tensor, ref_sr = _gradio_audio_to_tensor(ref_wav, "参考音频格式不正确")
    if timbre_ref_wav is not None:
        timbre_tensor, timbre_sr = _gradio_audio_to_tensor(
            timbre_ref_wav, "音色参考音频格式不正确"
        )

    torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
    if timbre_ref_wav is not None:
        torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
    else:
        # No separate timbre reference: reuse the style reference file.
        temp_timbre_path = temp_ref_path

    pipeline = get_pipeline("tts")

    gen_audio = pipeline.inference_ar_and_fm(
        src_wav_path=None,
        src_text=text,
        style_ref_wav_path=temp_ref_path,
        timbre_ref_wav_path=temp_timbre_path,
        style_ref_wav_text=None,
        src_text_language=src_language,
        style_ref_wav_text_language=ref_language,
    )

    save_audio(gen_audio, output_path=output_path)

    return output_path

# Build the Gradio interface: one tab per Vevo task.
with gr.Blocks(title="VEVO Demo") as demo:
    gr.Markdown("# VEVO: 多功能语音合成模型演示")
    gr.Markdown("## 可控零样本声音模仿与风格转换")

    # Tab 1: style conversion.
    with gr.Tab("风格转换 (Style)"):
        gr.Markdown("### Vevo-Style: 保持音色但转换风格（如口音、情感等）")
        with gr.Row():
            with gr.Column():
                style_content = gr.Audio(type="numpy", label="内容音频")
                style_reference = gr.Audio(type="numpy", label="风格音频")
                style_button = gr.Button("生成")
            with gr.Column():
                style_output = gr.Audio(label="生成结果")
        style_button.click(
            fn=vevo_style,
            inputs=[style_content, style_reference],
            outputs=style_output,
        )

    # Tab 2: timbre conversion.
    with gr.Tab("音色转换 (Timbre)"):
        gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
        with gr.Row():
            with gr.Column():
                timbre_content = gr.Audio(type="numpy", label="内容音频")
                timbre_reference = gr.Audio(type="numpy", label="音色参考音频")
                timbre_button = gr.Button("生成")
            with gr.Column():
                timbre_output = gr.Audio(label="生成结果")
        timbre_button.click(
            fn=vevo_timbre,
            inputs=[timbre_content, timbre_reference],
            outputs=timbre_output,
        )

    # Tab 3: voice conversion (style and timbre together).
    with gr.Tab("声音转换 (Voice)"):
        gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
        with gr.Row():
            with gr.Column():
                voice_content = gr.Audio(type="numpy", label="内容音频")
                voice_reference = gr.Audio(type="numpy", label="声音参考音频")
                voice_button = gr.Button("生成")
            with gr.Column():
                voice_output = gr.Audio(label="生成结果")
        voice_button.click(
            fn=vevo_voice,
            inputs=[voice_content, voice_reference],
            outputs=voice_output,
        )

    # Tab 4: text-to-speech.
    with gr.Tab("文本到语音 (TTS)"):
        gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
        # Language codes offered by both language dropdowns.
        language_codes = ("en", "zh", "de", "fr", "ja", "ko")
        with gr.Row():
            with gr.Column():
                tts_text = gr.Textbox(
                    label="输入文本",
                    placeholder="请输入要合成的文本...",
                    lines=3,
                )
                tts_src_language = gr.Dropdown(
                    choices=list(language_codes),
                    label="文本语言",
                    value="en",
                )
                tts_reference = gr.Audio(type="numpy", label="风格参考音频")
                tts_ref_language = gr.Dropdown(
                    choices=list(language_codes),
                    label="参考音频语言",
                    value="en",
                )

                # The separate timbre reference is optional.
                with gr.Accordion("高级选项", open=False):
                    tts_timbre_reference = gr.Audio(type="numpy", label="音色参考音频（可选）")

                tts_button = gr.Button("生成")
            with gr.Column():
                tts_output = gr.Audio(label="生成结果")

        # Input order matches the positional parameters of vevo_tts.
        tts_inputs = [
            tts_text,
            tts_reference,
            tts_timbre_reference,
            tts_src_language,
            tts_ref_language,
        ]
        tts_button.click(fn=vevo_tts, inputs=tts_inputs, outputs=tts_output)

    gr.Markdown("""
    ## 关于VEVO
    VEVO是一个多功能语音合成和转换模型，提供四种主要功能：
    1. **Vevo-Style**: 保持音色但转换风格（如口音、情感等）
    2. **Vevo-Timbre**: 保持风格但转换音色
    3. **Vevo-Voice**: 同时转换风格和音色
    4. **Vevo-TTS**: 风格与音色可控的文本到语音转换
    
    更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
    """)

# Start the app.
demo.launch()