soiz1 committed on
Commit 0d7eb3f (verified)
Parent: 20d6bb2

Update app.py

Files changed (1)
  1. app.py +358 -361
app.py CHANGED
@@ -1,362 +1,359 @@
1
- import gradio as gr
2
- import torch
3
- import torchaudio
4
- import librosa
5
- from modules.commons import build_model, load_checkpoint, recursive_munch
6
- import yaml
7
- from hf_utils import load_custom_model_from_hf
8
- import numpy as np
9
- from pydub import AudioSegment
10
-
11
- # Load model and configuration
12
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
-
14
- dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
15
- "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
16
- "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
17
- config = yaml.safe_load(open(dit_config_path, 'r'))
18
- model_params = recursive_munch(config['model_params'])
19
- model = build_model(model_params, stage='DiT')
20
- hop_length = config['preprocess_params']['spect_params']['hop_length']
21
- sr = config['preprocess_params']['sr']
22
-
23
- # Load checkpoints
24
- model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
25
- load_only_params=True, ignore_modules=[], is_distributed=False)
26
- for key in model:
27
- model[key].eval()
28
- model[key].to(device)
29
- model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
30
-
31
- # Load additional modules
32
- from modules.campplus.DTDNN import CAMPPlus
33
-
34
- campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
35
- campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
36
- campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
37
- campplus_model.eval()
38
- campplus_model.to(device)
39
-
40
- from modules.bigvgan import bigvgan
41
-
42
- bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
43
-
44
- # remove weight norm in the model and set to eval mode
45
- bigvgan_model.remove_weight_norm()
46
- bigvgan_model = bigvgan_model.eval().to(device)
47
-
48
- # whisper
49
- from transformers import AutoFeatureExtractor, WhisperModel
50
-
51
- whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer,
52
- 'whisper_name') else "openai/whisper-small"
53
- whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
54
- del whisper_model.decoder
55
- whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
56
-
57
- # Generate mel spectrograms
58
- mel_fn_args = {
59
- "n_fft": config['preprocess_params']['spect_params']['n_fft'],
60
- "win_size": config['preprocess_params']['spect_params']['win_length'],
61
- "hop_size": config['preprocess_params']['spect_params']['hop_length'],
62
- "num_mels": config['preprocess_params']['spect_params']['n_mels'],
63
- "sampling_rate": sr,
64
- "fmin": 0,
65
- "fmax": None,
66
- "center": False
67
- }
68
- from modules.audio import mel_spectrogram
69
-
70
- to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
71
-
72
- # f0 conditioned model
73
- dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
74
- "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
75
- "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
76
-
77
- config = yaml.safe_load(open(dit_config_path, 'r'))
78
- model_params = recursive_munch(config['model_params'])
79
- model_f0 = build_model(model_params, stage='DiT')
80
- hop_length = config['preprocess_params']['spect_params']['hop_length']
81
- sr = config['preprocess_params']['sr']
82
-
83
- # Load checkpoints
84
- model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
85
- load_only_params=True, ignore_modules=[], is_distributed=False)
86
- for key in model_f0:
87
- model_f0[key].eval()
88
- model_f0[key].to(device)
89
- model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
90
-
91
- # f0 extractor
92
- from modules.rmvpe import RMVPE
93
-
94
- model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
95
- rmvpe = RMVPE(model_path, is_half=False, device=device)
96
-
97
- mel_fn_args_f0 = {
98
- "n_fft": config['preprocess_params']['spect_params']['n_fft'],
99
- "win_size": config['preprocess_params']['spect_params']['win_length'],
100
- "hop_size": config['preprocess_params']['spect_params']['hop_length'],
101
- "num_mels": config['preprocess_params']['spect_params']['n_mels'],
102
- "sampling_rate": sr,
103
- "fmin": 0,
104
- "fmax": None,
105
- "center": False
106
- }
107
- to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
108
- bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
109
-
110
- # remove weight norm in the model and set to eval mode
111
- bigvgan_44k_model.remove_weight_norm()
112
- bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
113
-
114
- def adjust_f0_semitones(f0_sequence, n_semitones):
115
- factor = 2 ** (n_semitones / 12)
116
- return f0_sequence * factor
117
-
118
- def crossfade(chunk1, chunk2, overlap):
119
- fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
120
- fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
121
- if len(chunk2) < overlap:
122
- chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
123
- else:
124
- chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
125
- return chunk2
126
-
127
- # streaming and chunk processing related params
128
- overlap_frame_len = 16
129
- bitrate = "320k"
130
-
131
- @torch.no_grad()
132
- @torch.inference_mode()
133
- def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
134
- inference_module = model if not f0_condition else model_f0
135
- mel_fn = to_mel if not f0_condition else to_mel_f0
136
- bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
137
- sr = 22050 if not f0_condition else 44100
138
- hop_length = 256 if not f0_condition else 512
139
- max_context_window = sr // hop_length * 30
140
- overlap_wave_len = overlap_frame_len * hop_length
141
- # Load audio
142
- source_audio = librosa.load(source, sr=sr)[0]
143
- ref_audio = librosa.load(target, sr=sr)[0]
144
-
145
- # Process audio
146
- source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
147
- ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
148
-
149
- # Resample
150
- ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
151
- converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
152
- # if source audio less than 30 seconds, whisper can handle in one forward
153
- if converted_waves_16k.size(-1) <= 16000 * 30:
154
- alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()],
155
- return_tensors="pt",
156
- return_attention_mask=True,
157
- sampling_rate=16000)
158
- alt_input_features = whisper_model._mask_input_features(
159
- alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
160
- alt_outputs = whisper_model.encoder(
161
- alt_input_features.to(whisper_model.encoder.dtype),
162
- head_mask=None,
163
- output_attentions=False,
164
- output_hidden_states=False,
165
- return_dict=True,
166
- )
167
- S_alt = alt_outputs.last_hidden_state.to(torch.float32)
168
- S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
169
- else:
170
- overlapping_time = 5 # 5 seconds
171
- S_alt_list = []
172
- buffer = None
173
- traversed_time = 0
174
- while traversed_time < converted_waves_16k.size(-1):
175
- if buffer is None: # first chunk
176
- chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
177
- else:
178
- chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
179
- alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()],
180
- return_tensors="pt",
181
- return_attention_mask=True,
182
- sampling_rate=16000)
183
- alt_input_features = whisper_model._mask_input_features(
184
- alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
185
- alt_outputs = whisper_model.encoder(
186
- alt_input_features.to(whisper_model.encoder.dtype),
187
- head_mask=None,
188
- output_attentions=False,
189
- output_hidden_states=False,
190
- return_dict=True,
191
- )
192
- S_alt = alt_outputs.last_hidden_state.to(torch.float32)
193
- S_alt = S_alt[:, :chunk.size(-1) // 320 + 1]
194
- if traversed_time == 0:
195
- S_alt_list.append(S_alt)
196
- else:
197
- S_alt_list.append(S_alt[:, 50 * overlapping_time:])
198
- buffer = chunk[:, -16000 * overlapping_time:]
199
- traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
200
- S_alt = torch.cat(S_alt_list, dim=1)
201
-
202
- ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
203
- ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()],
204
- return_tensors="pt",
205
- return_attention_mask=True)
206
- ori_input_features = whisper_model._mask_input_features(
207
- ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
208
- with torch.no_grad():
209
- ori_outputs = whisper_model.encoder(
210
- ori_input_features.to(whisper_model.encoder.dtype),
211
- head_mask=None,
212
- output_attentions=False,
213
- output_hidden_states=False,
214
- return_dict=True,
215
- )
216
- S_ori = ori_outputs.last_hidden_state.to(torch.float32)
217
- S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
218
-
219
- mel = mel_fn(source_audio.to(device).float())
220
- mel2 = mel_fn(ref_audio.to(device).float())
221
-
222
- target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
223
- target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
224
-
225
- feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
226
- num_mel_bins=80,
227
- dither=0,
228
- sample_frequency=16000)
229
- feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
230
- style2 = campplus_model(feat2.unsqueeze(0))
231
-
232
- if f0_condition:
233
- F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.03)
234
- F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.03)
235
-
236
- F0_ori = torch.from_numpy(F0_ori).to(device)[None]
237
- F0_alt = torch.from_numpy(F0_alt).to(device)[None]
238
-
239
- voiced_F0_ori = F0_ori[F0_ori > 1]
240
- voiced_F0_alt = F0_alt[F0_alt > 1]
241
-
242
- log_f0_alt = torch.log(F0_alt + 1e-5)
243
- voiced_log_f0_ori = torch.log(voiced_F0_ori + 1e-5)
244
- voiced_log_f0_alt = torch.log(voiced_F0_alt + 1e-5)
245
- median_log_f0_ori = torch.median(voiced_log_f0_ori)
246
- median_log_f0_alt = torch.median(voiced_log_f0_alt)
247
-
248
- # shift alt log f0 level to ori log f0 level
249
- shifted_log_f0_alt = log_f0_alt.clone()
250
- if auto_f0_adjust:
251
- shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
252
- shifted_f0_alt = torch.exp(shifted_log_f0_alt)
253
- if pitch_shift != 0:
254
- shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
255
- else:
256
- F0_ori = None
257
- F0_alt = None
258
- shifted_f0_alt = None
259
-
260
- # Length regulation
261
- cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
262
- prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
263
-
264
- max_source_window = max_context_window - mel2.size(2)
265
- # split source condition (cond) into chunks
266
- processed_frames = 0
267
- generated_wave_chunks = []
268
- # generate chunk by chunk and stream the output
269
- while processed_frames < cond.size(1):
270
- chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
271
- is_last_chunk = processed_frames + max_source_window >= cond.size(1)
272
- cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
273
- with torch.autocast(device_type=device.type, dtype=torch.float16):
274
- # Voice Conversion
275
- vc_target = inference_module.cfm.inference(cat_condition,
276
- torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
277
- mel2, style2, None, diffusion_steps,
278
- inference_cfg_rate=inference_cfg_rate)
279
- vc_target = vc_target[:, :, mel2.size(-1):]
280
- vc_wave = bigvgan_fn(vc_target.float())[0]
281
- if processed_frames == 0:
282
- if is_last_chunk:
283
- output_wave = vc_wave[0].cpu().numpy()
284
- generated_wave_chunks.append(output_wave)
285
- output_wave = (output_wave * 32768.0).astype(np.int16)
286
- mp3_bytes = AudioSegment(
287
- output_wave.tobytes(), frame_rate=sr,
288
- sample_width=output_wave.dtype.itemsize, channels=1
289
- ).export(format="mp3", bitrate=bitrate).read()
290
- yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
291
- break
292
- output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
293
- generated_wave_chunks.append(output_wave)
294
- previous_chunk = vc_wave[0, -overlap_wave_len:]
295
- processed_frames += vc_target.size(2) - overlap_frame_len
296
- output_wave = (output_wave * 32768.0).astype(np.int16)
297
- mp3_bytes = AudioSegment(
298
- output_wave.tobytes(), frame_rate=sr,
299
- sample_width=output_wave.dtype.itemsize, channels=1
300
- ).export(format="mp3", bitrate=bitrate).read()
301
- yield mp3_bytes, None
302
- elif is_last_chunk:
303
- output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
304
- generated_wave_chunks.append(output_wave)
305
- processed_frames += vc_target.size(2) - overlap_frame_len
306
- output_wave = (output_wave * 32768.0).astype(np.int16)
307
- mp3_bytes = AudioSegment(
308
- output_wave.tobytes(), frame_rate=sr,
309
- sample_width=output_wave.dtype.itemsize, channels=1
310
- ).export(format="mp3", bitrate=bitrate).read()
311
- yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
312
- break
313
- else:
314
- output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
315
- generated_wave_chunks.append(output_wave)
316
- previous_chunk = vc_wave[0, -overlap_wave_len:]
317
- processed_frames += vc_target.size(2) - overlap_frame_len
318
- output_wave = (output_wave * 32768.0).astype(np.int16)
319
- mp3_bytes = AudioSegment(
320
- output_wave.tobytes(), frame_rate=sr,
321
- sample_width=output_wave.dtype.itemsize, channels=1
322
- ).export(format="mp3", bitrate=bitrate).read()
323
- yield mp3_bytes, None
324
-
325
-
326
- if __name__ == "__main__":
327
- description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
328
- "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
329
- "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
330
- "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
331
- "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")
332
- inputs = [
333
- gr.Audio(type="filepath", label="Source Audio / 源音频"),
334
- gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
335
- gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数", info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
336
- gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
337
- gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
338
- gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
339
- gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
340
- info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
341
- gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
342
- ]
343
-
344
- examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
345
- ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
346
- ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
347
- "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
348
- ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
349
- "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
350
- ]
351
-
352
- outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
353
- gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')]
354
-
355
- gr.Interface(fn=voice_conversion,
356
- description=description,
357
- inputs=inputs,
358
- outputs=outputs,
359
- title="Seed Voice Conversion",
360
- examples=examples,
361
- cache_examples=False,
362
  ).launch()
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ import librosa
5
+ from modules.commons import build_model, load_checkpoint, recursive_munch
6
+ import yaml
7
+ from hf_utils import load_custom_model_from_hf
8
+ import numpy as np
9
+ from pydub import AudioSegment
10
+
11
+ # Load model and configuration
12
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
+
14
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
15
+ "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
16
+ "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
17
+ config = yaml.safe_load(open(dit_config_path, 'r'))
18
+ model_params = recursive_munch(config['model_params'])
19
+ model = build_model(model_params, stage='DiT')
20
+ hop_length = config['preprocess_params']['spect_params']['hop_length']
21
+ sr = config['preprocess_params']['sr']
22
+
23
+ # Load checkpoints
24
+ model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
25
+ load_only_params=True, ignore_modules=[], is_distributed=False)
26
+ for key in model:
27
+ model[key].eval()
28
+ model[key].to(device)
29
+ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
30
+
31
+ # Load additional modules
32
+ from modules.campplus.DTDNN import CAMPPlus
33
+
34
+ campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
35
+ campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
36
+ campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
37
+ campplus_model.eval()
38
+ campplus_model.to(device)
39
+
40
+ from modules.bigvgan import bigvgan
41
+
42
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
43
+
44
+ # remove weight norm in the model and set to eval mode
45
+ bigvgan_model.remove_weight_norm()
46
+ bigvgan_model = bigvgan_model.eval().to(device)
47
+
48
+ # whisper
49
+ from transformers import AutoFeatureExtractor, WhisperModel
50
+
51
+ whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer,
52
+ 'whisper_name') else "openai/whisper-small"
53
+ whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
54
+ del whisper_model.decoder
55
+ whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
56
+
57
+ # Generate mel spectrograms
58
+ mel_fn_args = {
59
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
60
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
61
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
62
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
63
+ "sampling_rate": sr,
64
+ "fmin": 0,
65
+ "fmax": None,
66
+ "center": False
67
+ }
68
+ from modules.audio import mel_spectrogram
69
+
70
+ to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
71
+
72
+ # f0 conditioned model
73
+ dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
74
+ "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
75
+ "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
76
+
77
+ config = yaml.safe_load(open(dit_config_path, 'r'))
78
+ model_params = recursive_munch(config['model_params'])
79
+ model_f0 = build_model(model_params, stage='DiT')
80
+ hop_length = config['preprocess_params']['spect_params']['hop_length']
81
+ sr = config['preprocess_params']['sr']
82
+
83
+ # Load checkpoints
84
+ model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
85
+ load_only_params=True, ignore_modules=[], is_distributed=False)
86
+ for key in model_f0:
87
+ model_f0[key].eval()
88
+ model_f0[key].to(device)
89
+ model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
90
+
91
+ # f0 extractor
92
+ from modules.rmvpe import RMVPE
93
+
94
+ model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
95
+ rmvpe = RMVPE(model_path, is_half=False, device=device)
96
+
97
+ mel_fn_args_f0 = {
98
+ "n_fft": config['preprocess_params']['spect_params']['n_fft'],
99
+ "win_size": config['preprocess_params']['spect_params']['win_length'],
100
+ "hop_size": config['preprocess_params']['spect_params']['hop_length'],
101
+ "num_mels": config['preprocess_params']['spect_params']['n_mels'],
102
+ "sampling_rate": sr,
103
+ "fmin": 0,
104
+ "fmax": None,
105
+ "center": False
106
+ }
107
+ to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
108
+ bigvgan_44k_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x', use_cuda_kernel=False)
109
+
110
+ # remove weight norm in the model and set to eval mode
111
+ bigvgan_44k_model.remove_weight_norm()
112
+ bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
113
+
114
+ def adjust_f0_semitones(f0_sequence, n_semitones):
115
+ factor = 2 ** (n_semitones / 12)
116
+ return f0_sequence * factor
117
+
118
+ def crossfade(chunk1, chunk2, overlap):
119
+ fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
120
+ fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
121
+ if len(chunk2) < overlap:
122
+ chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
123
+ else:
124
+ chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
125
+ return chunk2
126
+
127
+ # streaming and chunk processing related params
128
+ overlap_frame_len = 16
129
+ bitrate = "320k"
130
+
131
+ @torch.no_grad()
132
+ @torch.inference_mode()
133
+ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
134
+ inference_module = model if not f0_condition else model_f0
135
+ mel_fn = to_mel if not f0_condition else to_mel_f0
136
+ bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
137
+ sr = 22050 if not f0_condition else 44100
138
+ hop_length = 256 if not f0_condition else 512
139
+ max_context_window = sr // hop_length * 30
140
+ overlap_wave_len = overlap_frame_len * hop_length
141
+ # Load audio
142
+ source_audio = librosa.load(source, sr=sr)[0]
143
+ ref_audio = librosa.load(target, sr=sr)[0]
144
+
145
+ # Process audio
146
+ source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
147
+ ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)
148
+
149
+ # Resample
150
+ ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
151
+ converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
152
+ # if source audio less than 30 seconds, whisper can handle in one forward
153
+ if converted_waves_16k.size(-1) <= 16000 * 30:
154
+ alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()],
155
+ return_tensors="pt",
156
+ return_attention_mask=True,
157
+ sampling_rate=16000)
158
+ alt_input_features = whisper_model._mask_input_features(
159
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
160
+ alt_outputs = whisper_model.encoder(
161
+ alt_input_features.to(whisper_model.encoder.dtype),
162
+ head_mask=None,
163
+ output_attentions=False,
164
+ output_hidden_states=False,
165
+ return_dict=True,
166
+ )
167
+ S_alt = alt_outputs.last_hidden_state.to(torch.float32)
168
+ S_alt = S_alt[:, :converted_waves_16k.size(-1) // 320 + 1]
169
+ else:
170
+ overlapping_time = 5 # 5 seconds
171
+ S_alt_list = []
172
+ buffer = None
173
+ traversed_time = 0
174
+ while traversed_time < converted_waves_16k.size(-1):
175
+ if buffer is None: # first chunk
176
+ chunk = converted_waves_16k[:, traversed_time:traversed_time + 16000 * 30]
177
+ else:
178
+ chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + 16000 * (30 - overlapping_time)]], dim=-1)
179
+ alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()],
180
+ return_tensors="pt",
181
+ return_attention_mask=True,
182
+ sampling_rate=16000)
183
+ alt_input_features = whisper_model._mask_input_features(
184
+ alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
185
+ alt_outputs = whisper_model.encoder(
186
+ alt_input_features.to(whisper_model.encoder.dtype),
187
+ head_mask=None,
188
+ output_attentions=False,
189
+ output_hidden_states=False,
190
+ return_dict=True,
191
+ )
192
+ S_alt = alt_outputs.last_hidden_state.to(torch.float32)
193
+ S_alt = S_alt[:, :chunk.size(-1) // 320 + 1]
194
+ if traversed_time == 0:
195
+ S_alt_list.append(S_alt)
196
+ else:
197
+ S_alt_list.append(S_alt[:, 50 * overlapping_time:])
198
+ buffer = chunk[:, -16000 * overlapping_time:]
199
+ traversed_time += 30 * 16000 if traversed_time == 0 else chunk.size(-1) - 16000 * overlapping_time
200
+ S_alt = torch.cat(S_alt_list, dim=1)
201
+
202
+ ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
203
+ ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()],
204
+ return_tensors="pt",
205
+ return_attention_mask=True)
206
+ ori_input_features = whisper_model._mask_input_features(
207
+ ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
208
+ with torch.no_grad():
209
+ ori_outputs = whisper_model.encoder(
210
+ ori_input_features.to(whisper_model.encoder.dtype),
211
+ head_mask=None,
212
+ output_attentions=False,
213
+ output_hidden_states=False,
214
+ return_dict=True,
215
+ )
216
+ S_ori = ori_outputs.last_hidden_state.to(torch.float32)
217
+ S_ori = S_ori[:, :ori_waves_16k.size(-1) // 320 + 1]
218
+
219
+ mel = mel_fn(source_audio.to(device).float())
220
+ mel2 = mel_fn(ref_audio.to(device).float())
221
+
222
+ target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
223
+ target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
224
+
225
+ feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k,
226
+ num_mel_bins=80,
227
+ dither=0,
228
+ sample_frequency=16000)
229
+ feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
230
+ style2 = campplus_model(feat2.unsqueeze(0))
231
+
232
+ if f0_condition:
233
+ F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.03)
234
+ F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.03)
235
+
236
+ F0_ori = torch.from_numpy(F0_ori).to(device)[None]
237
+ F0_alt = torch.from_numpy(F0_alt).to(device)[None]
238
+
239
+ voiced_F0_ori = F0_ori[F0_ori > 1]
240
+ voiced_F0_alt = F0_alt[F0_alt > 1]
241
+
242
+ log_f0_alt = torch.log(F0_alt + 1e-5)
243
+ voiced_log_f0_ori = torch.log(voiced_F0_ori + 1e-5)
244
+ voiced_log_f0_alt = torch.log(voiced_F0_alt + 1e-5)
245
+ median_log_f0_ori = torch.median(voiced_log_f0_ori)
246
+ median_log_f0_alt = torch.median(voiced_log_f0_alt)
247
+
248
+ # shift alt log f0 level to ori log f0 level
249
+ shifted_log_f0_alt = log_f0_alt.clone()
250
+ if auto_f0_adjust:
251
+ shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
252
+ shifted_f0_alt = torch.exp(shifted_log_f0_alt)
253
+ if pitch_shift != 0:
254
+ shifted_f0_alt[F0_alt > 1] = adjust_f0_semitones(shifted_f0_alt[F0_alt > 1], pitch_shift)
255
+ else:
256
+ F0_ori = None
257
+ F0_alt = None
258
+ shifted_f0_alt = None
259
+
260
+ # Length regulation
261
+ cond, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=3, f0=shifted_f0_alt)
262
+ prompt_condition, _, codes, commitment_loss, codebook_loss = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=3, f0=F0_ori)
263
+
264
+ max_source_window = max_context_window - mel2.size(2)
265
+ # split source condition (cond) into chunks
266
+ processed_frames = 0
267
+ generated_wave_chunks = []
268
+ # generate chunk by chunk and stream the output
269
+ while processed_frames < cond.size(1):
270
+ chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
271
+ is_last_chunk = processed_frames + max_source_window >= cond.size(1)
272
+ cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
273
+ with torch.autocast(device_type=device.type, dtype=torch.float16):
274
+ # Voice Conversion
275
+ vc_target = inference_module.cfm.inference(cat_condition,
276
+ torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
277
+ mel2, style2, None, diffusion_steps,
278
+ inference_cfg_rate=inference_cfg_rate)
279
+ vc_target = vc_target[:, :, mel2.size(-1):]
280
+ vc_wave = bigvgan_fn(vc_target.float())[0]
281
+ if processed_frames == 0:
282
+ if is_last_chunk:
283
+ output_wave = vc_wave[0].cpu().numpy()
284
+ generated_wave_chunks.append(output_wave)
285
+ output_wave = (output_wave * 32768.0).astype(np.int16)
286
+ mp3_bytes = AudioSegment(
287
+ output_wave.tobytes(), frame_rate=sr,
288
+ sample_width=output_wave.dtype.itemsize, channels=1
289
+ ).export(format="mp3", bitrate=bitrate).read()
290
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
291
+ break
292
+ output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
293
+ generated_wave_chunks.append(output_wave)
294
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
295
+ processed_frames += vc_target.size(2) - overlap_frame_len
296
+ output_wave = (output_wave * 32768.0).astype(np.int16)
297
+ mp3_bytes = AudioSegment(
298
+ output_wave.tobytes(), frame_rate=sr,
299
+ sample_width=output_wave.dtype.itemsize, channels=1
300
+ ).export(format="mp3", bitrate=bitrate).read()
301
+ yield mp3_bytes, None
302
+ elif is_last_chunk:
303
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
304
+ generated_wave_chunks.append(output_wave)
305
+ processed_frames += vc_target.size(2) - overlap_frame_len
306
+ output_wave = (output_wave * 32768.0).astype(np.int16)
307
+ mp3_bytes = AudioSegment(
308
+ output_wave.tobytes(), frame_rate=sr,
309
+ sample_width=output_wave.dtype.itemsize, channels=1
310
+ ).export(format="mp3", bitrate=bitrate).read()
311
+ yield mp3_bytes, (sr, np.concatenate(generated_wave_chunks))
312
+ break
313
+ else:
314
+ output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(), overlap_wave_len)
315
+ generated_wave_chunks.append(output_wave)
316
+ previous_chunk = vc_wave[0, -overlap_wave_len:]
317
+ processed_frames += vc_target.size(2) - overlap_frame_len
318
+ output_wave = (output_wave * 32768.0).astype(np.int16)
319
+ mp3_bytes = AudioSegment(
320
+ output_wave.tobytes(), frame_rate=sr,
321
+ sample_width=output_wave.dtype.itemsize, channels=1
322
+ ).export(format="mp3", bitrate=bitrate).read()
323
+ yield mp3_bytes, None
324
+
325
+
326
+ if __name__ == "__main__":
327
+ description = ("Zero-shot音声変換モデル(学習不要)。ローカルでの利用方法は[GitHubリポジトリ](https://github.com/Plachtaa/seed-vc)をご覧ください。"
328
+ "参考音声が25秒を超える場合、自動的に25秒にクリップされます。"
329
+ "また、��音声と参考音声の合計時間が30秒を超える場合、元音声は分割処理されます。")
330
+ inputs = [
331
+ gr.Audio(type="filepath", label="元音声"),
332
+ gr.Audio(type="filepath", label="参考音声"),
333
+ gr.Slider(minimum=1, maximum=200, value=10, step=1, label="拡散ステップ数", info="デフォルトは10、50~100が最適な品質"),
334
+ gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="長さ調整", info="1.0未満で速度を上げ、1.0以上で速度を遅くします"),
335
+ gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="推論CFG率", info="わずかな影響があります"),
336
+ gr.Checkbox(label="F0条件付きモデルを使用", value=False, info="歌声変換には必須です"),
337
+ gr.Checkbox(label="F0自動調整", value=True, info="F0をおおよそ調整して目標音声に合わせます。F0条件付きモデル使用時にのみ有効です"),
338
+ gr.Slider(label='音程変換', minimum=-24, maximum=24, step=1, value=0, info="半音単位の音程変換。F0条件付きモデル使用時にのみ有効です"),
339
+ ]
340
+
341
+ examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
342
+ ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
343
+ ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
344
+ "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
345
+ ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
346
+ "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
347
+ ]
348
+
349
+ outputs = [gr.Audio(label="ストリーム出力音声", streaming=True, format='mp3'),
350
+ gr.Audio(label="完全出力音声", streaming=False, format='wav')]
351
+
352
+ gr.Interface(fn=voice_conversion,
353
+ description=description,
354
+ inputs=inputs,
355
+ outputs=outputs,
356
+ title="Seed Voice Conversion",
357
+ examples=examples,
358
+ cache_examples=False,
359
  ).launch()