Spaces:

Difficult-Burger
/

vevo-test

Build error

vevo-test / app.py

积极的屁孩

debug audio saving format

4a1664c 3 months ago

20.4 kB

	import os
	import sys
	import json
	import torch
	import gradio as gr
	import torchaudio
	import numpy as np
	from huggingface_hub import snapshot_download, hf_hub_download
	import subprocess

	# Fetch the Amphion sources on first start and run from inside the checkout:
	# the directories created below and the config/checkpoint paths used later in
	# this file are all relative to the current working directory.
	if not os.path.exists("Amphion"):
	    # check=True makes a failed clone stop here with git's exit status instead
	    # of surfacing later as a FileNotFoundError from os.chdir.
	    subprocess.run(
	        ["git", "clone", "https://github.com/open-mmlab/Amphion.git"],
	        check=True,
	    )
	    os.chdir("Amphion")
	elif not os.getcwd().endswith("Amphion"):
	    os.chdir("Amphion")

	# Put the current working directory on sys.path so that the `models.*` import
	# below can be resolved. (The original spelled this as
	# dirname(abspath("Amphion")), which is the current directory.)
	amphion_root = os.getcwd()
	if amphion_root not in sys.path:
	    sys.path.append(amphion_root)

	# Make sure the working directories exist.
	os.makedirs("wav", exist_ok=True)
	os.makedirs("ckpts/Vevo", exist_ok=True)

	# Must stay below the sys.path setup above.
	from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav

	# Download the Vevo configuration files and put them in place.
	def setup_configs():
	    """Ensure the Vevo config files exist under ``models/vc/vevo/config``.

	    Every file that is missing is fetched from the ``amphion/Vevo`` model
	    repository with ``hf_hub_download`` and copied to the config directory.
	    Files that already exist are left untouched. A failure for one file is
	    printed and the remaining files are still attempted.
	    """
	    # Function-scope import so this block does not depend on the file header.
	    import shutil

	    config_path = "models/vc/vevo/config"
	    os.makedirs(config_path, exist_ok=True)

	    config_files = [
	        "PhoneToVq8192.json",
	        "Vocoder.json",
	        "Vq32ToVq8192.json",
	        "Vq8192ToMels.json",
	        "hubert_large_l18_c32.yaml",
	    ]

	    for name in config_files:
	        file_path = f"{config_path}/{name}"
	        if os.path.exists(file_path):
	            continue
	        try:
	            cached_file = hf_hub_download(
	                repo_id="amphion/Vevo",
	                filename=f"config/{name}",
	                repo_type="model",
	            )
	            # shutil.copy instead of spawning `cp`: works without a POSIX
	            # userland and raises on failure, so the handler below reports it
	            # (an unchecked `cp` exit status was silently ignored).
	            shutil.copy(cached_file, file_path)
	        except Exception as e:
	            # Best effort: report the problem and continue with the next file.
	            print(f"下载配置文件 {name} 时出错: {e}")

	setup_configs()

	# Select the compute device: CUDA when torch reports it available, else CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	print(f"使用设备: {device}")

	# Pipelines that have already been constructed, keyed by pipeline type.
	inference_pipelines = dict()

	# Hugging Face repository, download cache and config directory shared by all
	# pipeline types.
	_VEVO_REPO_ID = "amphion/Vevo"
	_VEVO_CACHE_DIR = "./ckpts/Vevo"
	_VEVO_CONFIG_DIR = "./models/vc/vevo/config"

	# Autoregressive model configured for each supported pipeline type; None means
	# the pipeline is built without an autoregressive stage.
	_AR_MODEL_BY_PIPELINE = {
	    "style": "Vq32ToVq8192",
	    "voice": "Vq32ToVq8192",
	    "timbre": None,
	    "tts": "PhoneToVq8192",
	}


	def _download_vevo_component(subdir):
	    """Download ``subdir`` of the Vevo repo and return its local path."""
	    local_dir = snapshot_download(
	        repo_id=_VEVO_REPO_ID,
	        repo_type="model",
	        cache_dir=_VEVO_CACHE_DIR,
	        allow_patterns=[f"{subdir}/*"],
	    )
	    return os.path.join(local_dir, subdir)


	def get_pipeline(pipeline_type):
	    """Return the ``VevoInferencePipeline`` for ``pipeline_type``, building it once.

	    The first request for a type downloads the checkpoints that type needs,
	    constructs the pipeline on the module-level ``device`` and stores it in
	    ``inference_pipelines``; later requests return the stored instance.

	    Args:
	        pipeline_type: One of ``"style"``, ``"voice"``, ``"timbre"``, ``"tts"``.

	    Returns:
	        The cached or newly constructed ``VevoInferencePipeline``.

	    Raises:
	        ValueError: If ``pipeline_type`` is not a supported type. (Previously
	            this fell through every branch and failed with UnboundLocalError.)
	    """
	    if pipeline_type not in _AR_MODEL_BY_PIPELINE:
	        raise ValueError(f"未知的管道类型: {pipeline_type!r}")

	    if pipeline_type in inference_pipelines:
	        return inference_pipelines[pipeline_type]

	    # NOTE(review): "style" and "voice" are built from identical components but
	    # cached under separate keys, so both get loaded — confirm whether one
	    # instance could be shared.
	    pipeline_kwargs = {}

	    # Content tokenizer: only the style/voice pipelines are given one.
	    if pipeline_type in ("style", "voice"):
	        pipeline_kwargs["content_tokenizer_ckpt_path"] = os.path.join(
	            _download_vevo_component("tokenizer/vq32"),
	            "hubert_large_l18_c32.pkl",
	        )

	    # Content-style tokenizer: used by every pipeline type.
	    pipeline_kwargs["content_style_tokenizer_ckpt_path"] = _download_vevo_component(
	        "tokenizer/vq8192"
	    )

	    # Autoregressive transformer: style/voice and tts use different models,
	    # timbre has none.
	    ar_model = _AR_MODEL_BY_PIPELINE[pipeline_type]
	    if ar_model is not None:
	        pipeline_kwargs["ar_cfg_path"] = f"{_VEVO_CONFIG_DIR}/{ar_model}.json"
	        pipeline_kwargs["ar_ckpt_path"] = _download_vevo_component(
	            f"contentstyle_modeling/{ar_model}"
	        )

	    # Flow-matching transformer.
	    pipeline_kwargs["fmt_cfg_path"] = f"{_VEVO_CONFIG_DIR}/Vq8192ToMels.json"
	    pipeline_kwargs["fmt_ckpt_path"] = _download_vevo_component(
	        "acoustic_modeling/Vq8192ToMels"
	    )

	    # Vocoder.
	    pipeline_kwargs["vocoder_cfg_path"] = f"{_VEVO_CONFIG_DIR}/Vocoder.json"
	    pipeline_kwargs["vocoder_ckpt_path"] = _download_vevo_component(
	        "acoustic_modeling/Vocoder"
	    )

	    inference_pipeline = VevoInferencePipeline(device=device, **pipeline_kwargs)

	    # Cache the instance so the downloads and model loading happen only once.
	    inference_pipelines[pipeline_type] = inference_pipeline
	    return inference_pipeline

	# VEVO inference entry points (used as Gradio callbacks) and their shared
	# audio-conversion helper.
	def _gradio_audio_to_tensor(audio, error_message):
	    """Convert a Gradio ``type="numpy"`` audio value to ``(tensor, sample_rate)``.

	    The returned tensor is float32 with a leading channel dimension, which is
	    the layout passed to ``torchaudio.save`` by the callers below.

	    Args:
	        audio: 2-tuple of sample rate and numpy sample data. Gradio documents
	            the order ``(sample_rate, data)``; ``(data, sample_rate)`` is
	            accepted as well, as in the original code.
	        error_message: Message of the ``ValueError`` raised for bad input.

	    Raises:
	        ValueError: If ``audio`` is not a 2-tuple or its data cannot be
	            converted to a numeric array.
	    """
	    if not (isinstance(audio, tuple) and len(audio) == 2):
	        raise ValueError(error_message)

	    if isinstance(audio[0], np.ndarray):
	        data, sample_rate = audio
	    else:
	        sample_rate, data = audio

	    try:
	        samples = np.asarray(data)
	        if np.issubdtype(samples.dtype, np.signedinteger):
	            # Gradio delivers integer PCM (16-bit per its docs). Scale it to
	            # [-1, 1) because float audio written by torchaudio is expected
	            # in that range; a bare cast would keep values up to +/-32767.
	            full_scale = float(np.iinfo(samples.dtype).max) + 1.0
	            samples = samples.astype(np.float32) / full_scale
	        else:
	            samples = samples.astype(np.float32)
	    except (TypeError, ValueError) as exc:
	        raise ValueError(error_message) from exc

	    tensor = torch.from_numpy(samples)
	    if tensor.ndim == 1:
	        # Mono: add the channel dimension.
	        tensor = tensor.unsqueeze(0)
	    elif tensor.ndim == 2 and tensor.shape[0] > tensor.shape[1]:
	        # Multi-channel data arrives as (samples, channels); torchaudio.save
	        # takes (channels, samples). The longer axis is taken to be time.
	        tensor = tensor.transpose(0, 1).contiguous()
	    return tensor, sample_rate


	def vevo_style(content_wav, style_wav):
	    """Run Vevo-Style and return the path of the generated audio file.

	    Both inputs are written to temporary WAV files; the "style" pipeline is
	    then run with the content audio as source and timbre reference and the
	    style audio as style reference.

	    Raises:
	        ValueError: If an input is missing or is not a valid audio tuple.
	    """
	    temp_content_path = "wav/temp_content.wav"
	    temp_style_path = "wav/temp_style.wav"
	    output_path = "wav/output_vevostyle.wav"

	    if content_wav is None or style_wav is None:
	        raise ValueError("请上传音频文件")

	    # Validate and convert both inputs before anything is written to disk.
	    content_tensor, content_sr = _gradio_audio_to_tensor(
	        content_wav, "内容音频格式不正确"
	    )
	    style_tensor, style_sr = _gradio_audio_to_tensor(
	        style_wav, "风格音频格式不正确"
	    )

	    torchaudio.save(temp_content_path, content_tensor, content_sr)
	    torchaudio.save(temp_style_path, style_tensor, style_sr)

	    pipeline = get_pipeline("style")
	    gen_audio = pipeline.inference_ar_and_fm(
	        src_wav_path=temp_content_path,
	        src_text=None,
	        style_ref_wav_path=temp_style_path,
	        timbre_ref_wav_path=temp_content_path,
	    )

	    save_audio(gen_audio, output_path=output_path)
	    return output_path


	def vevo_timbre(content_wav, reference_wav):
	    """Run Vevo-Timbre and return the path of the generated audio file.

	    Both inputs are written to temporary WAV files; the "timbre" pipeline's
	    flow-matching inference is then run with the content audio as source and
	    the reference audio as timbre reference.

	    Raises:
	        ValueError: If an input is missing or is not a valid audio tuple.
	    """
	    temp_content_path = "wav/temp_content.wav"
	    temp_reference_path = "wav/temp_reference.wav"
	    output_path = "wav/output_vevotimbre.wav"

	    if content_wav is None or reference_wav is None:
	        raise ValueError("请上传音频文件")

	    content_tensor, content_sr = _gradio_audio_to_tensor(
	        content_wav, "内容音频格式不正确"
	    )
	    reference_tensor, reference_sr = _gradio_audio_to_tensor(
	        reference_wav, "参考音频格式不正确"
	    )

	    torchaudio.save(temp_content_path, content_tensor, content_sr)
	    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)

	    pipeline = get_pipeline("timbre")
	    gen_audio = pipeline.inference_fm(
	        src_wav_path=temp_content_path,
	        timbre_ref_wav_path=temp_reference_path,
	        flow_matching_steps=32,
	    )

	    save_audio(gen_audio, output_path=output_path)
	    return output_path


	def vevo_voice(content_wav, reference_wav):
	    """Run Vevo-Voice and return the path of the generated audio file.

	    Both inputs are written to temporary WAV files; the "voice" pipeline is
	    then run with the content audio as source and the reference audio as both
	    style and timbre reference.

	    Raises:
	        ValueError: If an input is missing or is not a valid audio tuple.
	    """
	    temp_content_path = "wav/temp_content.wav"
	    temp_reference_path = "wav/temp_reference.wav"
	    output_path = "wav/output_vevovoice.wav"

	    if content_wav is None or reference_wav is None:
	        raise ValueError("请上传音频文件")

	    content_tensor, content_sr = _gradio_audio_to_tensor(
	        content_wav, "内容音频格式不正确"
	    )
	    reference_tensor, reference_sr = _gradio_audio_to_tensor(
	        reference_wav, "参考音频格式不正确"
	    )

	    torchaudio.save(temp_content_path, content_tensor, content_sr)
	    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)

	    pipeline = get_pipeline("voice")
	    gen_audio = pipeline.inference_ar_and_fm(
	        src_wav_path=temp_content_path,
	        src_text=None,
	        style_ref_wav_path=temp_reference_path,
	        timbre_ref_wav_path=temp_reference_path,
	    )

	    save_audio(gen_audio, output_path=output_path)
	    return output_path


	def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
	    """Run Vevo-TTS and return the path of the generated audio file.

	    Args:
	        text: Text to synthesize; must contain non-whitespace characters.
	        ref_wav: Style reference audio tuple (required).
	        timbre_ref_wav: Optional timbre reference audio tuple; when omitted
	            the style reference is used as timbre reference too.
	        src_language: Passed to the pipeline as ``src_text_language``.
	        ref_language: Passed to the pipeline as ``style_ref_wav_text_language``.

	    Raises:
	        ValueError: If the text is empty, the style reference is missing, or
	            an audio input is not a valid audio tuple.
	    """
	    temp_ref_path = "wav/temp_ref.wav"
	    temp_timbre_path = "wav/temp_timbre.wav"
	    output_path = "wav/output_vevotts.wav"

	    # Reject empty text up front rather than handing it to the model.
	    if not text or not text.strip():
	        raise ValueError("请输入要合成的文本")

	    if ref_wav is None:
	        raise ValueError("请上传参考音频文件")

	    ref_tensor, ref_sr = _gradio_audio_to_tensor(ref_wav, "参考音频格式不正确")
	    torchaudio.save(temp_ref_path, ref_tensor, ref_sr)

	    if timbre_ref_wav is not None:
	        timbre_tensor, timbre_sr = _gradio_audio_to_tensor(
	            timbre_ref_wav, "音色参考音频格式不正确"
	        )
	        torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
	    else:
	        # No separate timbre reference: reuse the style reference file.
	        temp_timbre_path = temp_ref_path

	    pipeline = get_pipeline("tts")
	    gen_audio = pipeline.inference_ar_and_fm(
	        src_wav_path=None,
	        src_text=text,
	        style_ref_wav_path=temp_ref_path,
	        timbre_ref_wav_path=temp_timbre_path,
	        style_ref_wav_text=None,
	        src_text_language=src_language,
	        style_ref_wav_text_language=ref_language,
	    )

	    save_audio(gen_audio, output_path=output_path)
	    return output_path

	# Build the Gradio interface: one tab per Vevo task, each wiring its inputs to
	# the matching handler function, followed by a short "about" section.
	language_choices = ["en", "zh", "de", "fr", "ja", "ko"]

	with gr.Blocks(title="VEVO Demo") as demo:
	    gr.Markdown("# VEVO: 多功能语音合成模型演示")
	    gr.Markdown("## 可控零样本声音模仿与风格转换")

	    # Tab 1: style conversion.
	    with gr.Tab("风格转换 (Style)"):
	        gr.Markdown("### Vevo-Style: 保持音色但转换风格（如口音、情感等）")
	        with gr.Row():
	            with gr.Column():
	                style_content = gr.Audio(label="内容音频", type="numpy")
	                style_reference = gr.Audio(label="风格音频", type="numpy")
	                style_button = gr.Button("生成")
	            with gr.Column():
	                style_output = gr.Audio(label="生成结果")
	        style_button.click(
	            fn=vevo_style,
	            inputs=[style_content, style_reference],
	            outputs=style_output,
	        )

	    # Tab 2: timbre conversion.
	    with gr.Tab("音色转换 (Timbre)"):
	        gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
	        with gr.Row():
	            with gr.Column():
	                timbre_content = gr.Audio(label="内容音频", type="numpy")
	                timbre_reference = gr.Audio(label="音色参考音频", type="numpy")
	                timbre_button = gr.Button("生成")
	            with gr.Column():
	                timbre_output = gr.Audio(label="生成结果")
	        timbre_button.click(
	            fn=vevo_timbre,
	            inputs=[timbre_content, timbre_reference],
	            outputs=timbre_output,
	        )

	    # Tab 3: voice conversion (style and timbre together).
	    with gr.Tab("声音转换 (Voice)"):
	        gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
	        with gr.Row():
	            with gr.Column():
	                voice_content = gr.Audio(label="内容音频", type="numpy")
	                voice_reference = gr.Audio(label="声音参考音频", type="numpy")
	                voice_button = gr.Button("生成")
	            with gr.Column():
	                voice_output = gr.Audio(label="生成结果")
	        voice_button.click(
	            fn=vevo_voice,
	            inputs=[voice_content, voice_reference],
	            outputs=voice_output,
	        )

	    # Tab 4: text to speech.
	    with gr.Tab("文本到语音 (TTS)"):
	        gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
	        with gr.Row():
	            with gr.Column():
	                tts_text = gr.Textbox(
	                    label="输入文本",
	                    placeholder="请输入要合成的文本...",
	                    lines=3,
	                )
	                tts_src_language = gr.Dropdown(
	                    choices=list(language_choices), label="文本语言", value="en"
	                )
	                tts_reference = gr.Audio(label="风格参考音频", type="numpy")
	                tts_ref_language = gr.Dropdown(
	                    choices=list(language_choices), label="参考音频语言", value="en"
	                )

	                # The optional timbre reference sits in a collapsed section.
	                with gr.Accordion("高级选项", open=False):
	                    tts_timbre_reference = gr.Audio(
	                        label="音色参考音频（可选）", type="numpy"
	                    )

	                tts_button = gr.Button("生成")
	            with gr.Column():
	                tts_output = gr.Audio(label="生成结果")

	        # Input order must match the positional parameters of vevo_tts.
	        tts_button.click(
	            fn=vevo_tts,
	            inputs=[
	                tts_text,
	                tts_reference,
	                tts_timbre_reference,
	                tts_src_language,
	                tts_ref_language,
	            ],
	            outputs=tts_output,
	        )

	    # The literal below is kept flush-left so its text is reproduced exactly.
	    gr.Markdown("""
	## 关于VEVO
	VEVO是一个多功能语音合成和转换模型，提供四种主要功能：
	1. Vevo-Style: 保持音色但转换风格（如口音、情感等）
	2. Vevo-Timbre: 保持风格但转换音色
	3. Vevo-Voice: 同时转换风格和音色
	4. Vevo-TTS: 风格与音色可控的文本到语音转换

	更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
	""")

	# Start the app.
	demo.launch()