积极的屁孩 committed on
Commit f4115c6 · 1 Parent(s): e593e60
Files changed (2)
  1. app.py +373 -687
  2. requirements.txt +8 -30
app.py CHANGED
@@ -1,724 +1,410 @@
1
  import os
2
  import sys
3
- import gradio as gr
4
  import torch
5
- import tempfile
6
- from pathlib import Path
7
- import importlib.util
8
- import shutil
9
  from huggingface_hub import snapshot_download, hf_hub_download
10
- import requests
11
  import subprocess
12
 
13
- # 检查并安装必要的依赖
14
- def install_dependencies():
15
- required_packages = ["pyworld", "torchaudio", "scipy", "librosa", "g2p_en"]
16
- for package in required_packages:
17
- try:
18
- importlib.import_module(package)
19
- print(f"已安装: {package}")
20
- except ImportError:
21
- print(f"安装: {package}")
22
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
23
- print(f"安装完成: {package}")
24
 
25
- # 安装必要的依赖
26
- install_dependencies()
 
27
 
28
- # 下载必要的模型代码
29
- def download_amphion_code():
30
- base_url = "https://raw.githubusercontent.com/open-mmlab/Amphion/main/"
31
- required_files = [
32
- # 基础目录结构
33
- "models/__init__.py",
34
- "models/base/__init__.py",
35
- "models/codec/__init__.py",
36
- "models/codec/kmeans/__init__.py",
37
- "models/codec/vevo/__init__.py",
38
- "models/codec/melvqgan/__init__.py",
39
- "models/codec/amphion_codec/__init__.py",
40
- "models/codec/amphion_codec/quantize/__init__.py",
41
- "models/vc/__init__.py",
42
- "models/vc/flow_matching_transformer/__init__.py",
43
- "models/vc/autoregressive_transformer/__init__.py",
44
- "models/tts/__init__.py",
45
- "models/tts/maskgct/__init__.py",
46
- "models/tts/maskgct/g2p/__init__.py",
47
- "utils/__init__.py",
48
-
49
- # 核心文件
50
- "models/vc/vevo/vevo_utils.py",
51
- "models/vc/flow_matching_transformer/fmt_model.py",
52
- "models/vc/flow_matching_transformer/llama_nar.py",
53
- "models/vc/autoregressive_transformer/ar_model.py",
54
- "models/vc/autoregressive_transformer/global_encoder.py",
55
- "models/codec/kmeans/repcodec_model.py",
56
- "models/codec/vevo/vevo_repcodec.py",
57
- "models/codec/melvqgan/melspec.py",
58
- "models/codec/amphion_codec/vocos.py",
59
- "models/codec/amphion_codec/codec.py",
60
- "models/codec/amphion_codec/quantize/factorized_vector_quantize.py",
61
- "models/codec/amphion_codec/quantize/lookup_free_quantize.py",
62
- "models/codec/amphion_codec/quantize/residual_vq.py",
63
- "models/codec/amphion_codec/quantize/vector_quantize.py",
64
- "utils/util.py",
65
- "utils/hparam.py",
66
- "models/tts/maskgct/g2p/g2p_generation.py",
67
- "models/vc/vevo/config/Vq32ToVq8192.json",
68
- "models/vc/vevo/config/Vq8192ToMels.json",
69
- "models/vc/vevo/config/PhoneToVq8192.json",
70
- "models/vc/vevo/config/Vocoder.json",
71
  ]
72
 
73
- for file_path in required_files:
74
- local_path = os.path.join(os.getcwd(), file_path)
75
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
76
-
77
- # 跳过空的__init__.py文件,直接创建
78
- if file_path.endswith("__init__.py"):
79
- if not os.path.exists(local_path):
80
- with open(local_path, "w") as f:
81
- f.write("# Auto-generated file\n")
82
- continue
83
-
84
- # 下载其他文件
85
- try:
86
- response = requests.get(base_url + file_path)
87
- if response.status_code == 200:
88
- with open(local_path, "wb") as f:
89
- f.write(response.content)
90
- print(f"成功下载: {file_path}")
91
- else:
92
- print(f"无法下载 {file_path}, 状态码: {response.status_code}")
93
- # 创建空文件防止导入错误
94
- if not os.path.exists(local_path):
95
- with open(local_path, "w") as f:
96
- f.write("# Placeholder file\n")
97
- except Exception as e:
98
- print(f"下载 {file_path} 时出错: {str(e)}")
99
- # 创建空文件防止导入错误
100
- if not os.path.exists(local_path):
101
- with open(local_path, "w") as f:
102
- f.write("# Placeholder file\n")
103
 
104
- # 先下载必要的代码文件
105
- download_amphion_code()
106
 
107
- # 添加当前目录到系统路径
108
- sys.path.insert(0, os.getcwd())
 
109
 
110
- # 手动导入必要的类,解决导入问题
111
- try:
112
- from models.codec.amphion_codec.quantize.residual_vq import ResidualVQ
113
- # 添加到quantize模块的命名空间
114
- import models.codec.amphion_codec.quantize
115
- models.codec.amphion_codec.quantize.ResidualVQ = ResidualVQ
116
-
117
- # 解决vocos模块导入问题
118
- import models.codec.amphion_codec.vocos
119
- import sys
120
- import types
121
- # 创建虚拟模块
122
- kmeans_vocos_module = types.ModuleType('models.codec.kmeans.vocos')
123
- # 将amphion_codec中的vocos赋值给kmeans.vocos
124
- sys.modules['models.codec.kmeans.vocos'] = models.codec.amphion_codec.vocos
125
-
126
- # 修复VevoInferencePipeline中的yaml文件路径引用
127
- from models.vc.vevo import vevo_utils
128
- original_load_vevo_vqvae = vevo_utils.load_vevo_vqvae_checkpoint
129
 
130
- # 重定义函数处理路径问题
131
- def patched_load_vevo_vqvae_checkpoint(repcodec_cfg, device):
132
- # 备份原始路径
133
- original_config_path = repcodec_cfg.config_path
 
134
 
135
- # 尝试多个可能的路径
136
- possible_paths = [
137
- original_config_path,
138
- original_config_path.replace('./models/vc/vevo/config/', './tokenizer/vq32/'),
139
- os.path.join(os.getcwd(), 'tokenizer/vq32/hubert_large_l18_c32.yaml'),
140
- os.path.join(os.getcwd(), 'models/vc/vevo/config/hubert_large_l18_c32.yaml'),
141
- os.path.join(os.getcwd(), 'Amphion/models/vc/vevo/config/hubert_large_l18_c32.yaml')
142
- ]
143
 
144
- # 尝试每个路径
145
- for path in possible_paths:
146
- if os.path.exists(path):
147
- print(f"找到yaml配置文件: {path}")
148
- repcodec_cfg.config_path = path
149
- break
150
- else:
151
- print(f"警告: 无法找到任何yaml配置文件, 尝试的路径: {possible_paths}")
 
152
 
153
- # 调用原始函数
154
- try:
155
- result = original_load_vevo_vqvae(repcodec_cfg, device)
156
- return result
157
- except Exception as e:
158
- print(f"加载VQVAE时出错: {str(e)}")
159
- # 如果失败,尝试创建一个简单的对象作为替代
160
- class DummyVQVAE:
161
- def __init__(self):
162
- self.device = device
163
- def encode(self, x):
164
- # 返回一个简单的占位符编码
165
- return torch.zeros((x.shape[0], 100, 32), device=device)
166
- return DummyVQVAE()
167
-
168
- # 替换原始函数
169
- vevo_utils.load_vevo_vqvae_checkpoint = patched_load_vevo_vqvae_checkpoint
170
- except ImportError as e:
171
- print(f"导入模块时出错: {str(e)}")
172
-
173
- # 现在尝试导入
174
- try:
175
- from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
176
- except ImportError as e:
177
- print(f"导入错误: {str(e)}")
178
- # 如果还是不能导入,使用一个最小版本的必要函数
179
- class VevoInferencePipeline:
180
- def __init__(self, **kwargs):
181
- self.device = kwargs.get("device", "cpu")
182
- print("警告: 使用VevoInferencePipeline占位符!")
183
 
184
- def inference_ar_and_fm(self, **kwargs):
185
- return torch.randn(1, 24000)
186
-
187
- def inference_fm(self, **kwargs):
188
- return torch.randn(1, 24000)
189
-
190
- def save_audio(waveform, sr=24000, output_path=None, **kwargs):
191
- if output_path:
192
- import torchaudio
193
- torchaudio.save(output_path, waveform, sr)
194
- return output_path
195
-
196
- # 修复可能存在的递归调用问题
197
- # 检查是否在运行时发生了transformers库中的注意力机制递归
198
- try:
199
- import transformers
200
- from transformers.models.llama.modeling_llama import LlamaAttention, LlamaModel
201
-
202
- # 保存原始的注意力前向函数
203
- if hasattr(LlamaAttention, "forward"):
204
- original_attention_forward = LlamaAttention.forward
205
 
206
- # 创建防止递归的补丁函数
207
- def safe_attention_forward(self, *args, **kwargs):
208
- # 使用原始函数,但避免递归调用
209
- return original_attention_forward(self, *args, **kwargs)
 
210
 
211
- # 替换原始函数
212
- LlamaAttention.forward = safe_attention_forward
213
- print("已修复LlamaAttention.forward,防止递归")
214
-
215
- # 可能存在其他递归路径
216
- if hasattr(transformers.models.llama.modeling_llama, "LlamaAttention"):
217
- for attr_name in dir(transformers.models.llama.modeling_llama.LlamaAttention):
218
- if attr_name.startswith("_") and "forward" in attr_name:
219
- attr = getattr(transformers.models.llama.modeling_llama.LlamaAttention, attr_name)
220
- if callable(attr):
221
- # 保存原始函数
222
- setattr(transformers.models.llama.modeling_llama.LlamaAttention,
223
- f"original_{attr_name}", attr)
224
-
225
- # 创建安全函数
226
- def create_safe_function(original_func, attr_name):
227
- def safe_function(self, *args, **kwargs):
228
- return original_func(self, *args, **kwargs)
229
- return safe_function
230
-
231
- # 替换函数
232
- setattr(transformers.models.llama.modeling_llama.LlamaAttention,
233
- attr_name,
234
- create_safe_function(attr, attr_name))
235
- print(f"已修复潜在的递归函数: {attr_name}")
236
- except Exception as e:
237
- print(f"应用注意力机制补丁时出错: {str(e)}")
238
-
239
- # 模型配置常量
240
- REPO_ID = "amphion/Vevo"
241
- CACHE_DIR = "./ckpts/Vevo"
242
-
243
- class VevoGradioApp:
244
- def __init__(self):
245
- # 设备设置
246
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
247
- self.pipelines = {}
248
- # 配置文件路径
249
- self.config_paths = {
250
- "vq32tovq8192": "./models/vc/vevo/config/Vq32ToVq8192.json",
251
- "vq8192tomels": "./models/vc/vevo/config/Vq8192ToMels.json",
252
- "phonetovq8192": "./models/vc/vevo/config/PhoneToVq8192.json",
253
- "vocoder": "./models/vc/vevo/config/Vocoder.json"
254
- }
255
 
256
- # 确保配置文件存在
257
- self.download_configs()
 
258
 
259
- def download_configs(self):
260
- """下载必要的配置文件"""
261
- os.makedirs("./models/vc/vevo/config", exist_ok=True)
262
- config_files = {
263
- "Vq32ToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq32ToVq8192.json",
264
- "Vq8192ToMels.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq8192ToMels.json",
265
- "PhoneToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/PhoneToVq8192.json",
266
- "Vocoder.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vocoder.json"
267
- }
268
 
269
- # 额外下载必要的统计文件
270
- stat_files = {
271
- "hubert_large_l18_mean_std.npz": "https://huggingface.co/amphion/Vevo/resolve/main/tokenizer/vq32/hubert_large_l18_mean_std.npz",
272
- "hubert_large_l18_c32.yaml": "https://huggingface.co/amphion/Vevo/resolve/main/tokenizer/vq32/hubert_large_l18_c32.yaml"
273
- }
274
 
275
- for filename, url in config_files.items():
276
- target_path = f"./models/vc/vevo/config/{filename}"
277
- if not os.path.exists(target_path):
278
- try:
279
- response = requests.get(url)
280
- if response.status_code == 200:
281
- with open(target_path, "wb") as f:
282
- f.write(response.content)
283
- print(f"成功下载配置文件: {filename}")
284
- else:
285
- # 如果从GitHub下载失败,创建一个占位符文件
286
- with open(target_path, 'w') as f:
287
- f.write('{}')
288
- print(f"无法下载配置文件 {filename},已创建占位符")
289
- except:
290
- # 如果下载失败,创建一个占位符文件
291
- with open(target_path, 'w') as f:
292
- f.write('{}')
293
- print(f"无法下载配置文件 {filename},已创建占位符")
294
 
295
- # 下载统计文件
296
- for filename, url in stat_files.items():
297
- # 同时支持两个位置:配置目录和标准位置
298
- target_paths = [
299
- f"./models/vc/vevo/config/{filename}", # 配置文件夹中
300
- f"./tokenizer/vq32/{filename}" # HuggingFace仓库标准位置
301
- ]
302
-
303
- # 确保目录存在
304
- for target_path in target_paths:
305
- os.makedirs(os.path.dirname(target_path), exist_ok=True)
306
-
307
- if not os.path.exists(target_path):
308
- try:
309
- response = requests.get(url)
310
- if response.status_code == 200:
311
- with open(target_path, "wb") as f:
312
- f.write(response.content)
313
- print(f"成功下载统计文件到: {target_path}")
314
- else:
315
- print(f"无法下载统计文件 {filename} 到 {target_path}, 状态码: {response.status_code}")
316
- except Exception as e:
317
- print(f"下载统计文件 {filename} 到 {target_path} 时出错: {str(e)}")
318
 
319
- # 修复配置文件中的路径
320
- self.fix_config_paths()
 
321
 
322
- def fix_config_paths(self):
323
- """修复配置文件中的相对路径"""
324
- try:
325
- for config_name, config_path in self.config_paths.items():
326
- if os.path.exists(config_path):
327
- with open(config_path, 'r') as f:
328
- config_data = f.read()
329
-
330
- # 获取当前工作目录的绝对路径
331
- base_dir = os.path.abspath(os.getcwd())
332
-
333
- # 统计文件的可能路径
334
- possible_stats = [
335
- f"{base_dir}/models/vc/vevo/config/hubert_large_l18_mean_std.npz",
336
- f"{base_dir}/tokenizer/vq32/hubert_large_l18_mean_std.npz",
337
- f"{base_dir}/Amphion/models/vc/vevo/config/hubert_large_l18_mean_std.npz"
338
- ]
339
-
340
- # 找到一个确实存在的文件路径
341
- stat_file_path = None
342
- for path in possible_stats:
343
- if os.path.exists(path):
344
- stat_file_path = path
345
- break
346
-
347
- if not stat_file_path:
348
- # 如果都不存在,默认使用第一个路径
349
- stat_file_path = possible_stats[0]
350
-
351
- # 替换配置中的相对路径
352
- if 'representation_stat_mean_var_path' in config_data:
353
- # 替换所有可能的路径格式
354
- replacements = [
355
- ('"representation_stat_mean_var_path": "./models/vc/vevo/config/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
356
- ('"representation_stat_mean_var_path": "models/vc/vevo/config/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
357
- ('"representation_stat_mean_var_path": "./tokenizer/vq32/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
358
- ('"representation_stat_mean_var_path": "tokenizer/vq32/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
359
- ]
360
-
361
- for old, new in replacements:
362
- config_data = config_data.replace(old, new)
363
-
364
- # 保存修复后的配置
365
- with open(config_path, 'w') as f:
366
- f.write(config_data)
367
-
368
- print(f"已修复配置文件路径: {config_path}")
369
- except Exception as e:
370
- print(f"修复配置文件路径时出错: {str(e)}")
371
 
372
- def init_voice_conversion_pipeline(self):
373
- """初始化语音转换管道"""
374
- if "voice" not in self.pipelines:
375
- try:
376
- # 确保配置文件路径是绝对路径
377
- absolute_config_paths = {}
378
- for key, path in self.config_paths.items():
379
- if path and not os.path.isabs(path):
380
- absolute_config_paths[key] = os.path.abspath(path)
381
- else:
382
- absolute_config_paths[key] = path
383
-
384
- # 内容标记器
385
- local_dir = snapshot_download(
386
- repo_id=REPO_ID,
387
- repo_type="model",
388
- cache_dir=CACHE_DIR,
389
- allow_patterns=["tokenizer/vq32/*"],
390
- )
391
- content_tokenizer_ckpt_path = os.path.join(
392
- local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
393
- )
394
-
395
- # 内容-风格标记器
396
- local_dir = snapshot_download(
397
- repo_id=REPO_ID,
398
- repo_type="model",
399
- cache_dir=CACHE_DIR,
400
- allow_patterns=["tokenizer/vq8192/*"],
401
- )
402
- content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
403
-
404
- # 自回归变换器
405
- local_dir = snapshot_download(
406
- repo_id=REPO_ID,
407
- repo_type="model",
408
- cache_dir=CACHE_DIR,
409
- allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
410
- )
411
- ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
412
-
413
- # 流匹配变换器
414
- local_dir = snapshot_download(
415
- repo_id=REPO_ID,
416
- repo_type="model",
417
- cache_dir=CACHE_DIR,
418
- allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
419
- )
420
- fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
421
-
422
- # 声码器
423
- local_dir = snapshot_download(
424
- repo_id=REPO_ID,
425
- repo_type="model",
426
- cache_dir=CACHE_DIR,
427
- allow_patterns=["acoustic_modeling/Vocoder/*"],
428
- )
429
- vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
430
-
431
- # 确保统计文件存在
432
- possible_stat_file_paths = [
433
- os.path.join(os.getcwd(), "models/vc/vevo/config/hubert_large_l18_mean_std.npz"),
434
- os.path.join(os.getcwd(), "tokenizer/vq32/hubert_large_l18_mean_std.npz")
435
- ]
436
-
437
- # 检查是否有任一路径存在
438
- stat_file_exists = any(os.path.exists(path) for path in possible_stat_file_paths)
439
-
440
- if not stat_file_exists:
441
- print(f"警告: 找不到统计文件,将尝试创建空文件")
442
- try:
443
- import numpy as np
444
- # 在两个位置都创建一个简单的统计文件
445
- for stat_path in possible_stat_file_paths:
446
- os.makedirs(os.path.dirname(stat_path), exist_ok=True)
447
- np.savez(stat_path, mean=np.zeros(1024), std=np.ones(1024))
448
- print(f"已创建占位符统计文件: {stat_path}")
449
- except Exception as e:
450
- print(f"创建统计文件时出错: {str(e)}")
451
-
452
- # 创建推理管道
453
- self.pipelines["voice"] = VevoInferencePipeline(
454
- content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
455
- content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
456
- ar_cfg_path=absolute_config_paths["vq32tovq8192"],
457
- ar_ckpt_path=ar_ckpt_path,
458
- fmt_cfg_path=absolute_config_paths["vq8192tomels"],
459
- fmt_ckpt_path=fmt_ckpt_path,
460
- vocoder_cfg_path=absolute_config_paths["vocoder"],
461
- vocoder_ckpt_path=vocoder_ckpt_path,
462
- device=self.device,
463
- )
464
- except Exception as e:
465
- print(f"初始化语音转换管道时出错: {str(e)}")
466
- # 创建一个占位符管道
467
- self.pipelines["voice"] = VevoInferencePipeline(device=self.device)
468
-
469
- return self.pipelines["voice"]
470
 
471
- def init_timbre_pipeline(self):
472
- """初始化音色转换管道"""
473
- if "timbre" not in self.pipelines:
474
- try:
475
- # 确保配置文件路径是绝对路径
476
- absolute_config_paths = {}
477
- for key, path in self.config_paths.items():
478
- if path and not os.path.isabs(path):
479
- absolute_config_paths[key] = os.path.abspath(path)
480
- else:
481
- absolute_config_paths[key] = path
482
-
483
- # 内容-风格标记器
484
- local_dir = snapshot_download(
485
- repo_id=REPO_ID,
486
- repo_type="model",
487
- cache_dir=CACHE_DIR,
488
- allow_patterns=["tokenizer/vq8192/*"],
489
- )
490
- tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
491
-
492
- # 流匹配变换器
493
- local_dir = snapshot_download(
494
- repo_id=REPO_ID,
495
- repo_type="model",
496
- cache_dir=CACHE_DIR,
497
- allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
498
- )
499
- fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
500
-
501
- # 声码器
502
- local_dir = snapshot_download(
503
- repo_id=REPO_ID,
504
- repo_type="model",
505
- cache_dir=CACHE_DIR,
506
- allow_patterns=["acoustic_modeling/Vocoder/*"],
507
- )
508
- vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
509
-
510
- # 创建推理管道
511
- self.pipelines["timbre"] = VevoInferencePipeline(
512
- content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
513
- fmt_cfg_path=absolute_config_paths["vq8192tomels"],
514
- fmt_ckpt_path=fmt_ckpt_path,
515
- vocoder_cfg_path=absolute_config_paths["vocoder"],
516
- vocoder_ckpt_path=vocoder_ckpt_path,
517
- device=self.device,
518
- )
519
- except Exception as e:
520
- print(f"初始化音色转换管道时出错: {str(e)}")
521
- # 创建一个占位符管道
522
- self.pipelines["timbre"] = VevoInferencePipeline(device=self.device)
523
-
524
- return self.pipelines["timbre"]
525
 
526
- def init_tts_pipeline(self):
527
- """初始化文本转语音管道"""
528
- if "tts" not in self.pipelines:
529
- try:
530
- # 确保配置文件路径是绝对路径
531
- absolute_config_paths = {}
532
- for key, path in self.config_paths.items():
533
- if path and not os.path.isabs(path):
534
- absolute_config_paths[key] = os.path.abspath(path)
535
- else:
536
- absolute_config_paths[key] = path
537
-
538
- # 内容-风格标记器
539
- local_dir = snapshot_download(
540
- repo_id=REPO_ID,
541
- repo_type="model",
542
- cache_dir=CACHE_DIR,
543
- allow_patterns=["tokenizer/vq8192/*"],
544
- )
545
- content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
546
-
547
- # 自回归变换器
548
- local_dir = snapshot_download(
549
- repo_id=REPO_ID,
550
- repo_type="model",
551
- cache_dir=CACHE_DIR,
552
- allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
553
- )
554
- ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
555
-
556
- # 流匹配变换器
557
- local_dir = snapshot_download(
558
- repo_id=REPO_ID,
559
- repo_type="model",
560
- cache_dir=CACHE_DIR,
561
- allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
562
- )
563
- fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
564
-
565
- # 声码器
566
- local_dir = snapshot_download(
567
- repo_id=REPO_ID,
568
- repo_type="model",
569
- cache_dir=CACHE_DIR,
570
- allow_patterns=["acoustic_modeling/Vocoder/*"],
571
- )
572
- vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
573
-
574
- # 创建推理管道
575
- self.pipelines["tts"] = VevoInferencePipeline(
576
- content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
577
- ar_cfg_path=absolute_config_paths["phonetovq8192"],
578
- ar_ckpt_path=ar_ckpt_path,
579
- fmt_cfg_path=absolute_config_paths["vq8192tomels"],
580
- fmt_ckpt_path=fmt_ckpt_path,
581
- vocoder_cfg_path=absolute_config_paths["vocoder"],
582
- vocoder_ckpt_path=vocoder_ckpt_path,
583
- device=self.device,
584
- )
585
- except Exception as e:
586
- print(f"初始化TTS管道时出错: {str(e)}")
587
- # 创建一个占位符管道
588
- self.pipelines["tts"] = VevoInferencePipeline(device=self.device)
589
-
590
- return self.pipelines["tts"]
591
-
592
- def vevo_voice(self, content_audio, reference_audio):
593
- """语音转换功能"""
594
- pipeline = self.init_voice_conversion_pipeline()
595
-
596
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
597
- output_path = output_file.name
598
-
599
- # 执行语音转换
600
- gen_audio = pipeline.inference_ar_and_fm(
601
- src_wav_path=content_audio, # 直接使用路径
602
- src_text=None,
603
- style_ref_wav_path=reference_audio, # 直接使用路径
604
- timbre_ref_wav_path=reference_audio,
605
- )
606
- save_audio(gen_audio, output_path=output_path)
607
-
608
- return output_path
609
 
610
- def vevo_style(self, content_audio, style_audio):
611
- """风格转换功能"""
612
- pipeline = self.init_voice_conversion_pipeline()
613
-
614
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
615
- output_path = output_file.name
616
-
617
- # 执行风格转换
618
- gen_audio = pipeline.inference_ar_and_fm(
619
- src_wav_path=content_audio, # 直接使用路径
620
- src_text=None,
621
- style_ref_wav_path=style_audio, # 直接使用路径
622
- timbre_ref_wav_path=content_audio,
623
- )
624
- save_audio(gen_audio, output_path=output_path)
625
-
626
- return output_path
627
 
628
- def vevo_timbre(self, content_audio, reference_audio):
629
- """音色转换功能"""
630
- pipeline = self.init_timbre_pipeline()
631
-
632
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
633
- output_path = output_file.name
634
-
635
- # 执行音色转换
636
- gen_audio = pipeline.inference_fm(
637
- src_wav_path=content_audio, # 直接使用路径
638
- timbre_ref_wav_path=reference_audio, # 直接使用路径
639
- flow_matching_steps=32,
640
- )
641
- save_audio(gen_audio, output_path=output_path)
642
-
643
- return output_path
644
 
645
- def vevo_tts(self, text, ref_audio, src_language, ref_language, ref_text):
646
- """文本转语音功能"""
647
- pipeline = self.init_tts_pipeline()
648
-
649
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
650
- output_path = output_file.name
651
-
652
- # 执行文本转语音
653
- gen_audio = pipeline.inference_ar_and_fm(
654
- src_wav_path=None,
655
- src_text=text,
656
- style_ref_wav_path=ref_audio, # 直接使用路径
657
- timbre_ref_wav_path=ref_audio,
658
- style_ref_wav_text=ref_text if ref_text else None,
659
- src_text_language=src_language,
660
- style_ref_wav_text_language=ref_language,
661
- )
662
- save_audio(gen_audio, output_path=output_path)
663
-
664
- return output_path
665
 
666
- def create_interface():
667
- app = VevoGradioApp()
 
 
668
 
669
- with gr.Blocks(title="Vevo 语音转换演示") as demo:
670
- gr.Markdown("# Vevo 语音转换模型演示")
671
- gr.Markdown("Vevo是一个强大的语音转换模型,支持语音转换、风格转换、音色转换和文本转语音功能。")
672
-
673
- with gr.Tab("语音转换"):
674
- gr.Markdown("## 语音转换 (VevoVoice)")
675
- gr.Markdown("将内容音频的内容转换为参考音频的风格和音色。")
676
- with gr.Row():
677
- content_audio_voice = gr.Audio(label="内容音频", type="filepath")
678
- reference_audio_voice = gr.Audio(label="参考音频", type="filepath")
679
- voice_btn = gr.Button("转换")
680
- voice_output = gr.Audio(label="转换结果")
681
- voice_btn.click(fn=app.vevo_voice, inputs=[content_audio_voice, reference_audio_voice], outputs=voice_output)
682
-
683
- with gr.Tab("风格转换"):
684
- gr.Markdown("## 风格转换 (VevoStyle)")
685
- gr.Markdown("将内容音频的风格转换为参考音频的风格,保留原始音色。")
686
- with gr.Row():
687
- content_audio_style = gr.Audio(label="内容音频", type="filepath")
688
- style_audio = gr.Audio(label="风格参考音频", type="filepath")
689
- style_btn = gr.Button("转换")
690
- style_output = gr.Audio(label="转换结果")
691
- style_btn.click(fn=app.vevo_style, inputs=[content_audio_style, style_audio], outputs=style_output)
692
-
693
- with gr.Tab("音色转换"):
694
- gr.Markdown("## 音色转换 (VevoTimbre)")
695
- gr.Markdown("将内容音频的音色转换为参考音频的音色,保留内容和风格。")
696
- with gr.Row():
697
- content_audio_timbre = gr.Audio(label="内容音频", type="filepath")
698
- reference_audio_timbre = gr.Audio(label="音色参考音频", type="filepath")
699
- timbre_btn = gr.Button("转换")
700
- timbre_output = gr.Audio(label="转换结果")
701
- timbre_btn.click(fn=app.vevo_timbre, inputs=[content_audio_timbre, reference_audio_timbre], outputs=timbre_output)
702
-
703
- with gr.Tab("文本转语音"):
704
- gr.Markdown("## 文本转语音 (VevoTTS)")
705
- gr.Markdown("将输入文本转换为语音,使用参考音频的风格和音色。")
706
- text_input = gr.Textbox(label="输入文本", lines=3)
707
- with gr.Row():
708
- ref_audio_tts = gr.Audio(label="参考音频", type="filepath")
709
- src_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="源文本语言", value="en")
710
- with gr.Row():
711
- ref_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="参考文本语言", value="en")
712
- ref_text = gr.Textbox(label="参考文本(可选)", lines=2)
713
- tts_btn = gr.Button("生成")
714
- tts_output = gr.Audio(label="生成结果")
715
- tts_btn.click(fn=app.vevo_tts, inputs=[text_input, ref_audio_tts, src_language, ref_language, ref_text], outputs=tts_output)
716
-
717
- gr.Markdown("## 关于")
718
- gr.Markdown("本演示基于 [Vevo模型](https://huggingface.co/amphion/Vevo),由[Amphion](https://github.com/open-mmlab/Amphion)开发。")
 
719
 
720
- return demo
721
 
722
- if __name__ == "__main__":
723
- demo = create_interface()
724
- demo.launch()
 
1
  import os
2
  import sys
3
+ import json
4
  import torch
5
+ import gradio as gr
6
+ import torchaudio
7
+ import numpy as np
 
8
  from huggingface_hub import snapshot_download, hf_hub_download
 
9
  import subprocess
10
 
11
+ # Clone the Amphion repository
12
+ if not os.path.exists("Amphion"):
13
+ subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
14
+ os.chdir("Amphion")
15
+ else:
16
+ if not os.getcwd().endswith("Amphion"):
17
+ os.chdir("Amphion")
18
 
19
+ # Add Amphion to the Python path
20
+ if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
21
+ sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
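Since the script has already changed into the cloned checkout, an equivalent and arguably simpler way to make the repo's models/... and utils/... packages importable is to put the current working directory itself on the path; a minimal sketch under that assumption:

# Sketch: prepend the Amphion checkout (the cwd after os.chdir above) to sys.path
sys.path.insert(0, os.getcwd())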
22
 
23
+ # Make sure the required directories exist
24
+ os.makedirs("wav", exist_ok=True)
25
+ os.makedirs("ckpts/Vevo", exist_ok=True)
26
+
27
+ from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav
28
+
29
+ # Download and set up the config files
30
+ def setup_configs():
31
+ config_path = "models/vc/vevo/config"
32
+ os.makedirs(config_path, exist_ok=True)
33
+
34
+ config_files = [
35
+ "PhoneToVq8192.json",
36
+ "Vocoder.json",
37
+ "Vq32ToVq8192.json",
38
+ "Vq8192ToMels.json",
39
+ "hubert_large_l18_c32.yaml",
40
  ]
41
 
42
+ for file in config_files:
43
+ file_path = f"{config_path}/{file}"
44
+ if not os.path.exists(file_path):
45
+ try:
46
+ file_data = hf_hub_download(
47
+ repo_id="amphion/Vevo",
48
+ filename=f"config/{file}",
49
+ repo_type="model",
50
+ )
51
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
52
+ # Copy the file to the target location
53
+ subprocess.run(["cp", file_data, file_path])
54
+ except Exception as e:
55
+ print(f"下载配置文件 {file} 时出错: {e}")
56
 
57
+ setup_configs()
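The copy step inside setup_configs shells out to the Unix cp command, which assumes a Unix-like environment (as on Hugging Face Spaces). A portable alternative, sketched here as a drop-in for that one line, is Python's own shutil:

import shutil

# Portable replacement for subprocess.run(["cp", file_data, file_path])
shutil.copy(file_data, file_path)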
 
58
 
59
+ # Device configuration
60
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
61
+ print(f"使用设备: {device}")
62
 
63
+ # Initialize the pipeline dictionary
64
+ inference_pipelines = {}
65
+
66
+ def get_pipeline(pipeline_type):
67
+ if pipeline_type in inference_pipelines:
68
+ return inference_pipelines[pipeline_type]
69
 
70
+ # Initialize according to the requested pipeline type
71
+ if pipeline_type == "style" or pipeline_type == "voice":
72
+ # Download the Content Tokenizer
73
+ local_dir = snapshot_download(
74
+ repo_id="amphion/Vevo",
75
+ repo_type="model",
76
+ cache_dir="./ckpts/Vevo",
77
+ allow_patterns=["tokenizer/vq32/*"],
78
+ )
79
+ content_tokenizer_ckpt_path = os.path.join(
80
+ local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
81
+ )
82
 
83
+ # Download the Content-Style Tokenizer
84
+ local_dir = snapshot_download(
85
+ repo_id="amphion/Vevo",
86
+ repo_type="model",
87
+ cache_dir="./ckpts/Vevo",
88
+ allow_patterns=["tokenizer/vq8192/*"],
89
+ )
90
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
91
 
92
+ # Download the Autoregressive Transformer
93
+ local_dir = snapshot_download(
94
+ repo_id="amphion/Vevo",
95
+ repo_type="model",
96
+ cache_dir="./ckpts/Vevo",
97
+ allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
98
+ )
99
+ ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
100
+ ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
101
 
102
+ # Download the Flow Matching Transformer
103
+ local_dir = snapshot_download(
104
+ repo_id="amphion/Vevo",
105
+ repo_type="model",
106
+ cache_dir="./ckpts/Vevo",
107
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
108
+ )
109
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
110
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
111
 
112
+ # Download the Vocoder
113
+ local_dir = snapshot_download(
114
+ repo_id="amphion/Vevo",
115
+ repo_type="model",
116
+ cache_dir="./ckpts/Vevo",
117
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
118
+ )
119
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
120
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
121
 
122
+ # Initialize the pipeline
123
+ inference_pipeline = VevoInferencePipeline(
124
+ content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
125
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
126
+ ar_cfg_path=ar_cfg_path,
127
+ ar_ckpt_path=ar_ckpt_path,
128
+ fmt_cfg_path=fmt_cfg_path,
129
+ fmt_ckpt_path=fmt_ckpt_path,
130
+ vocoder_cfg_path=vocoder_cfg_path,
131
+ vocoder_ckpt_path=vocoder_ckpt_path,
132
+ device=device,
133
+ )
134
 
135
+ elif pipeline_type == "timbre":
136
+ # 下载Content-Style Tokenizer (仅timbre需要)
137
+ local_dir = snapshot_download(
138
+ repo_id="amphion/Vevo",
139
+ repo_type="model",
140
+ cache_dir="./ckpts/Vevo",
141
+ allow_patterns=["tokenizer/vq8192/*"],
142
+ )
143
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
144
 
145
+ # Download the Flow Matching Transformer
146
+ local_dir = snapshot_download(
147
+ repo_id="amphion/Vevo",
148
+ repo_type="model",
149
+ cache_dir="./ckpts/Vevo",
150
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
151
+ )
152
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
153
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
154
 
155
+ # Download the Vocoder
156
+ local_dir = snapshot_download(
157
+ repo_id="amphion/Vevo",
158
+ repo_type="model",
159
+ cache_dir="./ckpts/Vevo",
160
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
161
+ )
162
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
163
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
164
 
165
+ # Initialize the pipeline
166
+ inference_pipeline = VevoInferencePipeline(
167
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
168
+ fmt_cfg_path=fmt_cfg_path,
169
+ fmt_ckpt_path=fmt_ckpt_path,
170
+ vocoder_cfg_path=vocoder_cfg_path,
171
+ vocoder_ckpt_path=vocoder_ckpt_path,
172
+ device=device,
173
+ )
174
 
175
+ elif pipeline_type == "tts":
176
+ # Download the Content-Style Tokenizer
177
+ local_dir = snapshot_download(
178
+ repo_id="amphion/Vevo",
179
+ repo_type="model",
180
+ cache_dir="./ckpts/Vevo",
181
+ allow_patterns=["tokenizer/vq8192/*"],
182
+ )
183
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
184
 
185
+ # Download the Autoregressive Transformer (TTS-specific)
186
+ local_dir = snapshot_download(
187
+ repo_id="amphion/Vevo",
188
+ repo_type="model",
189
+ cache_dir="./ckpts/Vevo",
190
+ allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
191
+ )
192
+ ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
193
+ ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
194
+
195
+ # Download the Flow Matching Transformer
196
+ local_dir = snapshot_download(
197
+ repo_id="amphion/Vevo",
198
+ repo_type="model",
199
+ cache_dir="./ckpts/Vevo",
200
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
201
+ )
202
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
203
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
204
 
205
+ # Download the Vocoder
206
+ local_dir = snapshot_download(
207
+ repo_id="amphion/Vevo",
208
+ repo_type="model",
209
+ cache_dir="./ckpts/Vevo",
210
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
211
+ )
212
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
213
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
214
+
215
+ # Initialize the pipeline
216
+ inference_pipeline = VevoInferencePipeline(
217
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
218
+ ar_cfg_path=ar_cfg_path,
219
+ ar_ckpt_path=ar_ckpt_path,
220
+ fmt_cfg_path=fmt_cfg_path,
221
+ fmt_ckpt_path=fmt_ckpt_path,
222
+ vocoder_cfg_path=vocoder_cfg_path,
223
+ vocoder_ckpt_path=vocoder_ckpt_path,
224
+ device=device,
225
+ )
226
 
227
+ # Cache the pipeline instance
228
+ inference_pipelines[pipeline_type] = inference_pipeline
229
+ return inference_pipeline
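All three branches of get_pipeline repeat the same snapshot_download call with a different allow_patterns entry; a small helper could factor that out. A sketch (the helper name is illustrative, not part of the commit):

def fetch_vevo_subset(pattern):
    # Download only the files matching `pattern` from amphion/Vevo and return the local snapshot dir
    return snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=[pattern],
    )

# Example usage inside a branch:
# fmt_ckpt_path = os.path.join(fetch_vevo_subset("acoustic_modeling/Vq8192ToMels/*"),
#                              "acoustic_modeling/Vq8192ToMels")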
230
+
231
+ # VEVO task functions
232
+ def vevo_style(content_wav, style_wav):
233
+ temp_content_path = "wav/temp_content.wav"
234
+ temp_style_path = "wav/temp_style.wav"
235
+ output_path = "wav/output_vevostyle.wav"
236
 
237
+ # Save the uploaded audio
238
+ torchaudio.save(temp_content_path, content_wav[0], content_wav[1])
239
+ torchaudio.save(temp_style_path, style_wav[0], style_wav[1])
240
 
241
+ # Get the pipeline
242
+ pipeline = get_pipeline("style")
243
 
244
+ # Run inference
245
+ gen_audio = pipeline.inference_ar_and_fm(
246
+ src_wav_path=temp_content_path,
247
+ src_text=None,
248
+ style_ref_wav_path=temp_style_path,
249
+ timbre_ref_wav_path=temp_content_path,
250
+ )
251
 
252
+ # Save the generated audio
253
+ save_audio(gen_audio, output_path=output_path)
254
 
255
+ return output_path
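One caveat about the save step above: gr.Audio(type="numpy") components return a (sample_rate, numpy_array) tuple, while torchaudio.save expects (path, Tensor[channels, frames], sample_rate), so the uploaded tuple likely needs converting before saving. A hedged sketch of such a helper (the function name is illustrative, not part of the commit):

import torch
import torchaudio

def save_gradio_audio(audio, path):
    # audio is assumed to be the (sample_rate, np.ndarray) pair produced by gr.Audio(type="numpy")
    sr, data = audio
    wav = torch.as_tensor(data)
    if wav.dtype != torch.float32:
        wav = wav.float() / 32768.0  # int16 PCM -> float32 in [-1, 1]
    wav = wav.unsqueeze(0) if wav.dim() == 1 else wav.t()  # -> (channels, frames)
    torchaudio.save(path, wav, sr)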
256
+
257
+ def vevo_timbre(content_wav, reference_wav):
258
+ temp_content_path = "wav/temp_content.wav"
259
+ temp_reference_path = "wav/temp_reference.wav"
260
+ output_path = "wav/output_vevotimbre.wav"
261
 
262
+ # Save the uploaded audio
263
+ torchaudio.save(temp_content_path, content_wav[0], content_wav[1])
264
+ torchaudio.save(temp_reference_path, reference_wav[0], reference_wav[1])
265
+
266
+ # Get the pipeline
267
+ pipeline = get_pipeline("timbre")
268
+
269
+ # Run inference
270
+ gen_audio = pipeline.inference_fm(
271
+ src_wav_path=temp_content_path,
272
+ timbre_ref_wav_path=temp_reference_path,
273
+ flow_matching_steps=32,
274
+ )
275
+
276
+ # Save the generated audio
277
+ save_audio(gen_audio, output_path=output_path)
278
+
279
+ return output_path
 
 
280
 
281
+ def vevo_voice(content_wav, reference_wav):
282
+ temp_content_path = "wav/temp_content.wav"
283
+ temp_reference_path = "wav/temp_reference.wav"
284
+ output_path = "wav/output_vevovoice.wav"
285
 
286
+ # Save the uploaded audio
287
+ torchaudio.save(temp_content_path, content_wav[0], content_wav[1])
288
+ torchaudio.save(temp_reference_path, reference_wav[0], reference_wav[1])
289
+
290
+ # Get the pipeline
291
+ pipeline = get_pipeline("voice")
292
+
293
+ # Run inference
294
+ gen_audio = pipeline.inference_ar_and_fm(
295
+ src_wav_path=temp_content_path,
296
+ src_text=None,
297
+ style_ref_wav_path=temp_reference_path,
298
+ timbre_ref_wav_path=temp_reference_path,
299
+ )
300
+
301
+ # Save the generated audio
302
+ save_audio(gen_audio, output_path=output_path)
303
+
304
+ return output_path
305
+
306
+ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
307
+ temp_ref_path = "wav/temp_ref.wav"
308
+ temp_timbre_path = "wav/temp_timbre.wav"
309
+ output_path = "wav/output_vevotts.wav"
310
+
311
+ # Save the uploaded audio
312
+ torchaudio.save(temp_ref_path, ref_wav[0], ref_wav[1])
313
+
314
+ if timbre_ref_wav is not None:
315
+ torchaudio.save(temp_timbre_path, timbre_ref_wav[0], timbre_ref_wav[1])
316
+ else:
317
+ temp_timbre_path = temp_ref_path
318
+
319
+ # Get the pipeline
320
+ pipeline = get_pipeline("tts")
321
+
322
+ # Run inference
323
+ gen_audio = pipeline.inference_ar_and_fm(
324
+ src_wav_path=None,
325
+ src_text=text,
326
+ style_ref_wav_path=temp_ref_path,
327
+ timbre_ref_wav_path=temp_timbre_path,
328
+ style_ref_wav_text=None,
329
+ src_text_language=src_language,
330
+ style_ref_wav_text_language=ref_language,
331
+ )
332
+
333
+ # Save the generated audio
334
+ save_audio(gen_audio, output_path=output_path)
335
+
336
+ return output_path
337
+
338
+ # Build the Gradio interface
339
+ with gr.Blocks(title="VEVO Demo") as demo:
340
+ gr.Markdown("# VEVO: 多功能语音合成模型演示")
341
+ gr.Markdown("## 可控零样本声音模仿与风格转换")
342
+
343
+ with gr.Tab("风格转换 (Style)"):
344
+ gr.Markdown("### Vevo-Style: 保持音色但转换风格(如口音、情感等)")
345
+ with gr.Row():
346
+ with gr.Column():
347
+ style_content = gr.Audio(label="内容音频", type="numpy")
348
+ style_reference = gr.Audio(label="风格音频", type="numpy")
349
+ style_button = gr.Button("生成")
350
+ with gr.Column():
351
+ style_output = gr.Audio(label="生成结果")
352
+ style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
353
+
354
+ with gr.Tab("音色转换 (Timbre)"):
355
+ gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
356
+ with gr.Row():
357
+ with gr.Column():
358
+ timbre_content = gr.Audio(label="内容音频", type="numpy")
359
+ timbre_reference = gr.Audio(label="音色参考音频", type="numpy")
360
+ timbre_button = gr.Button("生成")
361
+ with gr.Column():
362
+ timbre_output = gr.Audio(label="生成结果")
363
+ timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
364
+
365
+ with gr.Tab("声音转换 (Voice)"):
366
+ gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
367
+ with gr.Row():
368
+ with gr.Column():
369
+ voice_content = gr.Audio(label="内容音频", type="numpy")
370
+ voice_reference = gr.Audio(label="声音参考音频", type="numpy")
371
+ voice_button = gr.Button("生成")
372
+ with gr.Column():
373
+ voice_output = gr.Audio(label="生成结果")
374
+ voice_button.click(vevo_voice, inputs=[voice_content, voice_reference], outputs=voice_output)
375
+
376
+ with gr.Tab("文本到语音 (TTS)"):
377
+ gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
378
+ with gr.Row():
379
+ with gr.Column():
380
+ tts_text = gr.Textbox(label="输入文本", placeholder="请输入要合成的文本...", lines=3)
381
+ tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="文本语言", value="en")
382
+ tts_reference = gr.Audio(label="风格参考音频", type="numpy")
383
+ tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="参考音频语言", value="en")
384
+
385
+ with gr.Accordion("高级选项", open=False):
386
+ tts_timbre_reference = gr.Audio(label="音色参考音频(可选)", type="numpy")
387
+
388
+ tts_button = gr.Button("生成")
389
+ with gr.Column():
390
+ tts_output = gr.Audio(label="生成结果")
391
 
392
+ tts_button.click(
393
+ vevo_tts,
394
+ inputs=[tts_text, tts_reference, tts_timbre_reference, tts_src_language, tts_ref_language],
395
+ outputs=tts_output
396
+ )
397
+
398
+ gr.Markdown("""
399
+ ## 关于VEVO
400
+ VEVO是一个多功能语音合成和转换模型,提供四种主要功能:
401
+ 1. **Vevo-Style**: 保持音色但转换风格(如口音、情感等)
402
+ 2. **Vevo-Timbre**: 保持风格但转换音色
403
+ 3. **Vevo-Voice**: 同时转换风格和音色
404
+ 4. **Vevo-TTS**: 风格与音色可控的文本到语音转换
405
+
406
+ 更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
407
+ """)
408
 
409
+ # Launch the app
410
+ demo.launch()
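For long-running GPU inference on a shared Space, enabling Gradio's request queue is a common variant of the launch call above; a minimal sketch (queue size is an illustrative choice, not from the commit):

# Sketch: queue requests so concurrent users don't time out during slow inference
demo.queue(max_size=10).launch()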
 
requirements.txt CHANGED
@@ -1,33 +1,11 @@
1
- gradio>=4.14.0
2
- huggingface_hub>=0.20.0
3
  torch>=2.0.0
4
  torchaudio>=2.0.0
5
- numpy>=1.23.0
6
- librosa>=0.10.0
7
- accelerate>=0.21.0
8
- PySoundFile>=0.9.0
9
- safetensors>=0.4.0
10
  PyYAML>=6.0
11
- whisper>=1.1.10
12
- IPython>=8.0.0
13
- requests>=2.28.0
14
- transformers>=4.41.0
15
- setuptools
16
- onnxruntime
17
- unidecode
18
- scipy>=1.12.0
19
- encodec
20
- phonemizer
21
- g2p_en
22
- jieba
23
- cn2an
24
- pypinyin
25
- LangSegment
26
- pyopenjtalk
27
- pykakasi
28
- json5
29
- black>=24.1.1
30
- ruamel.yaml
31
- tqdm
32
- einops
33
- spaces
 
1
+ gradio>=3.50.2
 
2
  torch>=2.0.0
3
  torchaudio>=2.0.0
4
+ numpy>=1.20.0
5
+ huggingface_hub>=0.14.1
6
+ librosa>=0.9.2
 
 
7
  PyYAML>=6.0
8
+ accelerate>=0.20.3
9
+ safetensors>=0.3.1
10
+ phonemizer>=3.2.0
11
+ git+https://github.com/open-mmlab/Amphion.git