Update app.py
app.py CHANGED
@@ -77,8 +77,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     ref_audio = librosa.load(target, sr=sr)[0]
 
     # Process audio
-    source_audio = torch.tensor(source_audio[:sr * 30]).unsqueeze(0).float()
-    ref_audio = torch.tensor(ref_audio[:sr * 30]).unsqueeze(0).float()
+    source_audio = torch.tensor(source_audio[:sr * 30]).unsqueeze(0).float().to(device)
+    ref_audio = torch.tensor(ref_audio[:sr * 30]).unsqueeze(0).float().to(device)
 
     # Resample
     source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
@@ -88,8 +88,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     S_alt = cosyvoice_frontend.extract_speech_token(source_waves_16k)[0]
     S_ori = cosyvoice_frontend.extract_speech_token(ref_waves_16k)[0]
 
-    mel = to_mel(source_audio.float())
-    mel2 = to_mel(ref_audio.float())
+    mel = to_mel(source_audio.to(device).float())
+    mel2 = to_mel(ref_audio.to(device).float())
 
     target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
     target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
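The change itself is small: the loaded source and reference waveforms are moved onto the inference device right after loading (and again before mel extraction), so they live on the same device as the models that consume them. Below is a minimal sketch of that device-placement pattern, not the repository's actual code: `device`, `sr`, the 30-second trim, and the helper name `load_clip` are assumptions drawn from the diff for illustration only.

```python
# Sketch of the device-placement pattern from the diff (assumptions noted below).
import librosa
import torch
import torchaudio

# Assumed: app.py defines a module-level device and sampling rate like these.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sr = 22050  # illustrative sampling rate, not taken from the repo

def load_clip(path: str, max_seconds: int = 30) -> torch.Tensor:
    """Load audio, trim to `max_seconds`, and move it to the inference device."""
    wav = librosa.load(path, sr=sr)[0]
    # Trimming before the tensor conversion bounds memory use; .to(device)
    # places the tensor on the same device as the downstream models, which is
    # the fix this commit applies to source_audio and ref_audio.
    return torch.tensor(wav[: sr * max_seconds]).unsqueeze(0).float().to(device)

# Downstream ops such as resampling then operate on the device tensor directly:
# wav_16k = torchaudio.functional.resample(load_clip("example.wav"), sr, 16000)
```

With the tensors already on `device`, the extra `.to(device)` calls before `to_mel(...)` are harmless no-ops that guard against a CPU/GPU mismatch when the mel transform runs on the GPU.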