jerryhai committed
Commit 90f7c1e
Parent(s): caf7184

Track binary files with Git LFS

This view is limited to 50 files because it contains too many changes.
- .ipynb_checkpoints/requirements-checkpoint.txt +17 -0
- .ipynb_checkpoints/score_based_apc-checkpoint.py +159 -0
- .ipynb_checkpoints/template_based_apc-checkpoint.py +89 -0
- README.md +86 -3
- examples/off-key.wav +0 -0
- examples/reference.wav +0 -0
- examples/score_midi.midi +0 -0
- examples/score_midi.npy +3 -0
- examples/score_vocal.wav +0 -0
- output_score.wav +0 -0
- output_template.wav +0 -0
- pitch_controller/README.md +1 -0
- pitch_controller/__pycache__/utils.cpython-310.pyc +0 -0
- pitch_controller/config/DiffWorld_24k.yaml +24 -0
- pitch_controller/data/example/f0/p225_001.wav.npy +3 -0
- pitch_controller/data/example/mel/p225_001.wav.npy +3 -0
- pitch_controller/data/example/wav/p225_001.wav +0 -0
- pitch_controller/data/example/world/p225_001.wav.npy +3 -0
- pitch_controller/data/prepare_f0.py +66 -0
- pitch_controller/data/prepare_mel.py +72 -0
- pitch_controller/data/prepare_world.py +85 -0
- pitch_controller/dataset/__init__.py +1 -0
- pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
- pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc +0 -0
- pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc +0 -0
- pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc +0 -0
- pitch_controller/dataset/__pycache__/diff.cpython-310.pyc +0 -0
- pitch_controller/dataset/__pycache__/diff.cpython-39.pyc +0 -0
- pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc +0 -0
- pitch_controller/dataset/diff_lpc.py +271 -0
- pitch_controller/dataset/diff_lpc_content.py +231 -0
- pitch_controller/load_vocoder.py +51 -0
- pitch_controller/models/__pycache__/base.cpython-310.pyc +0 -0
- pitch_controller/models/__pycache__/base.cpython-39.pyc +0 -0
- pitch_controller/models/__pycache__/modules.cpython-310.pyc +0 -0
- pitch_controller/models/__pycache__/modules.cpython-39.pyc +0 -0
- pitch_controller/models/__pycache__/pitch.cpython-39.pyc +0 -0
- pitch_controller/models/__pycache__/unet.cpython-310.pyc +0 -0
- pitch_controller/models/__pycache__/unet.cpython-39.pyc +0 -0
- pitch_controller/models/__pycache__/update_unet.cpython-310.pyc +0 -0
- pitch_controller/models/__pycache__/utils.cpython-310.pyc +0 -0
- pitch_controller/models/__pycache__/utils.cpython-39.pyc +0 -0
- pitch_controller/models/base.py +30 -0
- pitch_controller/models/modules.py +237 -0
- pitch_controller/models/unet.py +153 -0
- pitch_controller/models/utils.py +110 -0
- pitch_controller/modules/BigVGAN/LICENSE +21 -0
- pitch_controller/modules/BigVGAN/README.md +95 -0
- pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc +0 -0
- pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc +0 -0
.ipynb_checkpoints/requirements-checkpoint.txt
ADDED
@@ -0,0 +1,17 @@
+diffusers
+einops
+fastdtw
+librosa
+matplotlib
+music21
+numpy
+pandas
+pretty_midi
+pysptk
+pyworld
+scipy
+soundfile
+tgt
+torch
+torchaudio
+tqdm
.ipynb_checkpoints/score_based_apc-checkpoint.py
ADDED
@@ -0,0 +1,159 @@
+import os.path
+
+import numpy as np
+import pandas as pd
+import torch
+import yaml
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+
+from diffusers import DDIMScheduler
+from pitch_controller.models.unet import UNetPitcher
+from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
+from pitch_controller.modules.BigVGAN.inference import load_model
+from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0
+from pitch_predictor.models.transformer import PitchFormer
+import pretty_midi
+
+
+def prepare_midi_wav(wav_id, midi_id, sr=24000):
+    midi = pretty_midi.PrettyMIDI(midi_id)
+    roll = midi.get_piano_roll()
+    roll = np.pad(roll, ((0, 0), (0, 1000)), constant_values=0)
+    roll[roll > 0] = 100
+
+    onset = midi.get_onsets()
+    before_onset = list(np.round(onset * 100 - 1).astype(int))
+    roll[:, before_onset] = 0
+
+    wav, sr = librosa.load(wav_id, sr=sr)
+
+    start = 0
+    end = round(100 * len(wav) / sr) / 100
+    # save audio
+    wav_seg = wav[round(start * sr):round(end * sr)]
+    cur_roll = roll[:, round(100 * start):round(100 * end)]
+    return wav_seg, cur_roll
+
+
+def algin_mapping(content, target_len):
+    # align content with mel
+    src_len = content.shape[-1]
+    target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device)
+    temp = torch.arange(src_len+1) * target_len / src_len
+
+    for i in range(target_len):
+        cur_idx = torch.argmin(torch.abs(temp-i))
+        target[:, i] = content[:, cur_idx]
+    return target
+
+
+def midi_to_hz(midi):
+    idx = torch.zeros(midi.shape[-1])
+    for frame in range(midi.shape[-1]):
+        midi_frame = midi[:, frame]
+        non_zero = midi_frame.nonzero()
+        if len(non_zero) != 0:
+            hz = librosa.midi_to_hz(non_zero[0])
+            idx[frame] = torch.tensor(hz)
+    return idx
+
+
+@torch.no_grad()
+def score_pitcher(source, pitch_ref, model, hifigan, pitcher, steps=50, shift_semi=0, mask_with_source=False):
+    wav, midi = prepare_midi_wav(source, pitch_ref, sr=sr)
+
+    source_mel = get_world_mel(None, sr=sr, wav=wav)
+
+    midi = torch.tensor(midi, dtype=torch.float32)
+    midi = algin_mapping(midi, source_mel.shape[-1])
+    midi = midi_to_hz(midi)
+
+    f0_ori = np.nan_to_num(get_f0(source))
+
+    source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
+    f0_ori = torch.from_numpy(f0_ori).float().unsqueeze(0).to(device)
+    midi = midi.unsqueeze(0).to(device)
+
+    f0_pred = pitcher(sp=source_mel, midi=midi)
+    if mask_with_source:
+        # mask unvoiced frames based on original pitch estimation
+        f0_pred[f0_ori == 0] = 0
+    f0_pred = f0_pred.cpu().numpy()[0]
+    # limit range
+    f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0
+    f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6')
+
+    f0_pred = f0_pred * (2 ** (shift_semi / 12))
+
+    f0_pred = log_f0(f0_pred, {'f0_bin': 345,
+                               'f0_min': librosa.note_to_hz('C2'),
+                               'f0_max': librosa.note_to_hz('C#6')})
+    f0_pred = torch.from_numpy(f0_pred).float().unsqueeze(0).to(device)
+
+    noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
+    generator = torch.Generator(device=device).manual_seed(2024)
+
+    noise_scheduler.set_timesteps(steps)
+    noise = torch.randn(source_mel.shape, generator=generator, device=device)
+    pred = noise
+    source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)
+
+    for t in tqdm(noise_scheduler.timesteps):
+        pred = noise_scheduler.scale_model_input(pred, t)
+        model_output = model(x=pred, mean=source_x, f0=f0_pred, t=t, ref=None, embed=None)
+        pred = noise_scheduler.step(model_output=model_output,
+                                    timestep=t,
+                                    sample=pred,
+                                    eta=1, generator=generator).prev_sample
+
+    pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)
+
+    pred_audio = hifigan(pred)
+    pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)
+
+    return pred_audio
+
+
+if __name__ == '__main__':
+    min_mel = np.log(1e-5)
+    max_mel = 2.5
+    sr = 24000
+
+    use_gpu = torch.cuda.is_available()
+    device = 'cuda' if use_gpu else 'cpu'
+
+    # load diffusion model
+    config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader)
+    mel_cfg = config['logmel']
+    ddpm_cfg = config['ddpm']
+    unet_cfg = config['unet']
+    model = UNetPitcher(**unet_cfg)
+    unet_path = 'ckpts/world_fixed_40.pt'
+
+    state_dict = torch.load(unet_path)
+    for key in list(state_dict.keys()):
+        state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
+    model.load_state_dict(state_dict)
+    if use_gpu:
+        model.cuda()
+    model.eval()
+
+    # load vocoder
+    hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
+    hifigan, cfg = load_model(hifi_path, device=device)
+    hifigan.eval()
+
+    # load pitch predictor
+    pitcher = PitchFormer(100, 512).to(device)
+    ckpt = torch.load('ckpts/ckpt_transformer_pitch/transformer_pitch_360.pt')
+    pitcher.load_state_dict(ckpt)
+    pitcher.eval()
+
+    pred_audio = score_pitcher('examples/score_vocal.wav', 'examples/score_midi.midi', model, hifigan, pitcher, steps=50)
+    sf.write('output_score.wav', pred_audio, samplerate=sr)
+
+
+
+
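Note: the score-conditioning path above hinges on `algin_mapping`, which stretches the 100 fps piano roll onto the mel frame grid by nearest-index lookup. A small self-contained check (function body copied from the diff; the example tensor is made up):

```python
import torch

def algin_mapping(content, target_len):
    # copy each of the target_len output frames from the nearest source frame
    src_len = content.shape[-1]
    target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device)
    temp = torch.arange(src_len + 1) * target_len / src_len
    for i in range(target_len):
        cur_idx = torch.argmin(torch.abs(temp - i))
        target[:, i] = content[:, cur_idx]
    return target

roll = torch.tensor([[1., 2., 3., 4.]])  # 4 hypothetical piano-roll frames
print(algin_mapping(roll, 8))            # tensor([[1., 1., 2., 2., 3., 3., 4., 4.]])
```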
.ipynb_checkpoints/template_based_apc-checkpoint.py
ADDED
@@ -0,0 +1,89 @@
+import os.path
+
+import numpy as np
+import pandas as pd
+import torch
+import yaml
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+
+from diffusers import DDIMScheduler
+from pitch_controller.models.unet import UNetPitcher
+from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
+from pitch_controller.modules.BigVGAN.inference import load_model
+from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0
+
+
+@torch.no_grad()
+def template_pitcher(source, pitch_ref, model, hifigan, steps=50, shift_semi=0):
+
+    source_mel = get_world_mel(source, sr=sr)
+
+    f0_ref = get_matched_f0(source, pitch_ref, 'world')
+    f0_ref = f0_ref * 2 ** (shift_semi / 12)
+
+    f0_ref = log_f0(f0_ref, {'f0_bin': 345,
+                             'f0_min': librosa.note_to_hz('C2'),
+                             'f0_max': librosa.note_to_hz('C#6')})
+
+    source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
+    f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device)
+
+    noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
+    generator = torch.Generator(device=device).manual_seed(2024)
+
+    noise_scheduler.set_timesteps(steps)
+    noise = torch.randn(source_mel.shape, generator=generator, device=device)
+    pred = noise
+    source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)
+
+    for t in tqdm(noise_scheduler.timesteps):
+        pred = noise_scheduler.scale_model_input(pred, t)
+        model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None)
+        pred = noise_scheduler.step(model_output=model_output,
+                                    timestep=t,
+                                    sample=pred,
+                                    eta=1, generator=generator).prev_sample
+
+    pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)
+
+    pred_audio = hifigan(pred)
+    pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)
+
+    return pred_audio
+
+
+if __name__ == '__main__':
+    min_mel = np.log(1e-5)
+    max_mel = 2.5
+    sr = 24000
+
+    use_gpu = torch.cuda.is_available()
+    device = 'cuda' if use_gpu else 'cpu'
+
+    # load diffusion model
+    config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader)
+    mel_cfg = config['logmel']
+    ddpm_cfg = config['ddpm']
+    unet_cfg = config['unet']
+    model = UNetPitcher(**unet_cfg)
+    unet_path = 'ckpts/world_fixed_40.pt'
+
+    state_dict = torch.load(unet_path)
+    for key in list(state_dict.keys()):
+        state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
+    model.load_state_dict(state_dict)
+    if use_gpu:
+        model.cuda()
+    model.eval()
+
+    # load vocoder
+    hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
+    hifigan, cfg = load_model(hifi_path, device=device)
+    hifigan.eval()
+
+    pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav', model, hifigan, steps=50, shift_semi=0)
+    sf.write('output_template.wav', pred_audio, samplerate=sr)
+
+
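Both inference scripts share the same denoising loop. The sketch below isolates that loop with a stand-in network so the 🤗 Diffusers scheduler mechanics can be read (and run) on their own; the tensor shape and dummy model are illustrative, while the scheduler calls mirror the diff:

```python
import torch
from diffusers import DDIMScheduler

device = 'cpu'

def dummy_model(x, t):
    # stand-in for UNetPitcher(x=..., mean=..., f0=..., t=t, ref=None, embed=None)
    return torch.zeros_like(x)

scheduler = DDIMScheduler(num_train_timesteps=1000)  # trained with 1000 steps
scheduler.set_timesteps(50)                          # sample with steps=50

generator = torch.Generator(device=device).manual_seed(2024)
pred = torch.randn((1, 100, 256), generator=generator, device=device)  # start from noise

for t in scheduler.timesteps:
    pred = scheduler.scale_model_input(pred, t)
    model_output = dummy_model(pred, t)
    pred = scheduler.step(model_output=model_output, timestep=t,
                          sample=pred, eta=1, generator=generator).prev_sample
```

With `eta=1` each step re-injects noise (DDPM-like sampling), which is why both scripts pass a seeded generator to keep outputs reproducible.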
README.md
CHANGED
@@ -1,3 +1,86 @@
-
-
-
+<img src="img\cover.png">
+
+# Diff-Pitcher (PyTorch)
+
+Official PyTorch implementation of [Diff-Pitcher: Diffusion-based Singing Voice Pitch Correction](https://engineering.jhu.edu/lcap/data/uploads/pdfs/waspaa2023_hai.pdf)
+
+--------------------
+
+Thank you all for your interest in this research project. I am currently optimizing the model's performance and computation efficiency. I plan to release a user-friendly version, either a GUI or a VST, in the first half of this year, and will update the open-source license.
+
+If you are familiar with PyTorch, you can follow [Code Examples](#examples) to use Diff-Pitcher.
+
+--------------------
+
+Diff-Pitcher
+
+- [Demo Page](#demo)
+- [Todo List](#todo)
+- [Code Examples](#examples)
+- [References](#references)
+- [Acknowledgement](#acknowledgement)
+
+## Demo
+
+🎵 Listen to [examples](https://jhu-lcap.github.io/Diff-Pitcher/)
+
+## Todo
+- [x] Update codes and demo
+- [x] Support 🤗 [Diffusers](https://github.com/huggingface/diffusers)
+- [x] Upload checkpoints
+- [x] Pipeline tutorial
+- [ ] Merge to [Your-Stable-Audio](https://github.com/haidog-yaqub/Your-Stable-Audio)
+- [ ] Audio Plugin Support
+## Examples
+- Download checkpoints: 🎒[ckpts](https://github.com/haidog-yaqub/DiffPitcher/tree/main/ckpts)
+- Prepare environment: [requirements.txt](requirements.txt)
+- Feel free to try:
+  - template-based automatic pitch correction: [template_based_apc.py](template_based_apc.py)
+  - score-based automatic pitch correction: [score_based_apc.py](score_based_apc.py)
+
+
+## References
+
+If you find the code useful for your research, please consider citing:
+
+```bibtex
+@inproceedings{hai2023diff,
+  title={Diff-Pitcher: Diffusion-Based Singing Voice Pitch Correction},
+  author={Hai, Jiarui and Elhilali, Mounya},
+  booktitle={2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
+  pages={1--5},
+  year={2023},
+  organization={IEEE}
+}
+```
+
+This repo is inspired by:
+
+```bibtex
+@article{popov2021diffusion,
+  title={Diffusion-based voice conversion with fast maximum likelihood sampling scheme},
+  author={Popov, Vadim and Vovk, Ivan and Gogoryan, Vladimir and Sadekova, Tasnima and Kudinov, Mikhail and Wei, Jiansheng},
+  journal={arXiv preprint arXiv:2109.13821},
+  year={2021}
+}
+```
+```bibtex
+@inproceedings{liu2022diffsinger,
+  title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
+  author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Zhao, Zhou},
+  booktitle={Proceedings of the AAAI conference on artificial intelligence},
+  volume={36},
+  number={10},
+  pages={11020--11028},
+  year={2022}
+}
+```
+
+## Acknowledgement
+
+[Welcome to LCAP! < LCAP (jhu.edu)](https://engineering.jhu.edu/lcap/)
+
+We borrow code from the following repos:
+
+- `Diffusion Schedulers` are based on 🤗 [Diffusers](https://github.com/huggingface/diffusers)
+- `2D UNet` is based on [DiffVC](https://github.com/huawei-noah/Speech-Backbones/tree/main/DiffVC)
examples/off-key.wav
ADDED
Binary file (816 kB)

examples/reference.wav
ADDED
Binary file (816 kB)

examples/score_midi.midi
ADDED
Binary file (121 Bytes)
examples/score_midi.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7baacba4afb8813d057e420cd63853657401403b6c798f6cb7f06673e7dcea5a
+size 559232
examples/score_vocal.wav
ADDED
Binary file (262 kB)

output_score.wav
ADDED
Binary file (262 kB)

output_template.wav
ADDED
Binary file (225 kB)
pitch_controller/README.md
ADDED
@@ -0,0 +1 @@
+# Diffusion-based Pitch Controller
pitch_controller/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.94 kB)
pitch_controller/config/DiffWorld_24k.yaml
ADDED
@@ -0,0 +1,24 @@
+version: 1.0
+
+logmel:
+  n_mels: 100
+  sampling_rate: 24000
+  n_fft: 1024
+  hop_size: 256
+  max: 2.5
+  min: -12
+
+unet:
+  dim_base: 256
+  use_embed: False
+  dim_embed: None
+  use_ref_t: False
+  dim_cond: 128
+  dim_mults: [1, 2, 4]
+
+ddpm:
+  num_train_steps: 1000
+  inference_steps: 100
+  eta: 0.8
+
+
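The `unet` block of this config is unpacked directly into the model constructor by both inference scripts; a minimal load sketch using the same calls that appear in the scripts above:

```python
import yaml
from pitch_controller.models.unet import UNetPitcher

config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader)
model = UNetPitcher(**config['unet'])  # dim_base=256, dim_cond=128, dim_mults=[1, 2, 4], ...
```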
pitch_controller/data/example/f0/p225_001.wav.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df28ae08ef686e7c7e523fdde25b62fbd05725cdacc043cde407a898182272f
+size 1672
pitch_controller/data/example/mel/p225_001.wav.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bf3c0e6956f57acdd82f5d91f6390ce148d89066faedbdd6f6ac8c48d1d2c76
+size 77328
pitch_controller/data/example/wav/p225_001.wav
ADDED
Binary file (197 kB)
pitch_controller/data/example/world/p225_001.wav.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e00d5eb7fa9df26321df3f3df06e2ff44c3b3732cc5179ef135e41ffeb3a3b82
+size 77328
pitch_controller/data/prepare_f0.py
ADDED
@@ -0,0 +1,66 @@
+# import amfm_decompy.basic_tools as basic
+# import amfm_decompy.pYAAPT as pYAAPT
+from multiprocessing import Process
+import os
+import numpy as np
+import pandas as pd
+import librosa
+from librosa.core import load
+from tqdm import tqdm
+
+
+def get_f0(wav_path):
+    wav, _ = load(wav_path, sr=24000)
+    wav = wav[:(wav.shape[0] // 256) * 256]
+    wav = np.pad(wav, 384, mode='reflect')
+    f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False,
+                            fmin=librosa.note_to_hz('C2'),
+                            fmax=librosa.note_to_hz('C6'))
+    return np.nan_to_num(f0)
+
+
+def chunks(arr, m):
+    result = [[] for i in range(m)]
+    for i in range(len(arr)):
+        result[i%m].append(arr[i])
+    return result
+
+
+def extract_f0(subset):
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['subset'] == 'train']
+    # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/']
+
+    for i in tqdm(subset):
+        line = meta.iloc[i]
+        audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+        f = line['file_name']
+
+        f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_data_f0/')
+
+        try:
+            np.load(os.path.join(f0_dir, f+'.npy'))
+        except:
+            print(line)
+            f0 = get_f0(os.path.join(audio_dir, f))
+            if os.path.exists(f0_dir) is False:
+                os.makedirs(f0_dir, exist_ok=True)
+            np.save(os.path.join(f0_dir, f + '.npy'), f0)
+
+        # if os.path.exists(os.path.join(f0_dir, f+'.npy')) is False:
+        #     f0 = get_yaapt_f0(os.path.join(audio_dir, f))
+
+
+if __name__ == '__main__':
+    cores = 8
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['subset']=='train']
+    # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/']
+
+    idx_list = [i for i in range(len(meta))]
+
+    subsets = chunks(idx_list, cores)
+
+    for subset in subsets:
+        t = Process(target=extract_f0, args=(subset,))
+        t.start()
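The trim-then-reflect-pad arithmetic in `get_f0` (and in the mel extractors below) is what keeps every feature stream frame-aligned: trimming to a multiple of the 256-sample hop and padding by (1024 − 256) / 2 = 384 samples per side yields exactly len(wav) // 256 frames with center=False. A quick check of that claim:

```python
import numpy as np

hop, win = 256, 1024
wav = np.zeros(93 * hop, dtype=np.float32)   # any length trimmed to a hop multiple
padded = np.pad(wav, (win - hop) // 2, mode='reflect')

n_frames = 1 + (len(padded) - win) // hop    # frame count with center=False
assert n_frames == len(wav) // hop           # 93 == 93: one frame per hop
```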
pitch_controller/data/prepare_mel.py
ADDED
@@ -0,0 +1,72 @@
+import os
+import numpy as np
+
+import librosa
+from librosa.core import load
+from librosa.filters import mel as librosa_mel_fn
+mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000)
+
+from tqdm import tqdm
+import pandas as pd
+
+from multiprocessing import Process
+
+
+# def get_f0(wav_path):
+#     wav, _ = load(wav_path, sr=22050)
+#     wav = wav[:(wav.shape[0] // 256) * 256]
+#     wav = np.pad(wav, 384, mode='reflect')
+#     f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False,
+#                             fmin=librosa.note_to_hz('C2'),
+#                             fmax=librosa.note_to_hz('C6'))
+#     return np.nan_to_num(f0)
+
+def get_mel(wav_path):
+    wav, _ = load(wav_path, sr=24000)
+    wav = wav[:(wav.shape[0] // 256)*256]
+    wav = np.pad(wav, 384, mode='reflect')
+    stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False)
+    stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
+    mel_spectrogram = np.matmul(mel_basis, stftm)
+    log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
+    return log_mel_spectrogram
+
+
+def chunks(arr, m):
+    result = [[] for i in range(m)]
+    for i in range(len(arr)):
+        result[i%m].append(arr[i])
+    return result
+
+
+def extract_mel(subset):
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['folder'] == 'eval/vocal/']
+
+    for i in tqdm(subset):
+        line = meta.iloc[i]
+        audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+        f = line['file_name']
+
+        mel_dir = audio_dir.replace('vocal', 'mel').replace('raw_data/', '24k_data/')
+
+        if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False:
+            mel = get_mel(os.path.join(audio_dir, f))
+            if os.path.exists(mel_dir) is False:
+                os.makedirs(mel_dir)
+            np.save(os.path.join(mel_dir, f+'.npy'), mel)
+
+
+if __name__ == '__main__':
+    cores = 8
+
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['folder'] == 'eval/vocal/']
+
+    idx_list = [i for i in range(len(meta))]
+
+    subsets = chunks(idx_list, cores)
+
+    for subset in subsets:
+        t = Process(target=extract_mel, args=(subset,))
+        t.start()
pitch_controller/data/prepare_world.py
ADDED
@@ -0,0 +1,85 @@
+from multiprocessing import Process
+import os
+import numpy as np
+
+import librosa
+from librosa.core import load
+from librosa.filters import mel as librosa_mel_fn
+mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000)
+
+from tqdm import tqdm
+import pandas as pd
+import pyworld as pw
+
+
+def get_world_mel(wav_path, sr=24000):
+    wav, _ = librosa.load(wav_path, sr=sr)
+    wav = (wav * 32767).astype(np.int16)
+    wav = (wav / 32767).astype(np.float64)
+    # wav = wav.astype(np.float64)
+    wav = wav[:(wav.shape[0] // 256) * 256]
+
+    _f0, t = pw.dio(wav, sr, frame_period=256/sr*1000)
+    f0 = pw.stonemask(wav, _f0, t, sr)
+    sp = pw.cheaptrick(wav, f0, t, sr)
+    ap = pw.d4c(wav, f0, t, sr)
+    wav_hat = pw.synthesize(f0 * 0, sp, ap, sr, frame_period=256/sr*1000)
+
+    # pyworld output does not pad left
+    wav_hat = wav_hat[:len(wav)]
+    # wav_hat = wav_hat[256//2: len(wav)+256//2]
+    assert len(wav_hat) == len(wav)
+    wav = wav_hat.astype(np.float32)
+    wav = np.pad(wav, 384, mode='reflect')
+    stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False)
+    stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
+    mel_spectrogram = np.matmul(mel_basis, stftm)
+    log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
+
+    return log_mel_spectrogram, f0
+
+
+def chunks(arr, m):
+    result = [[] for i in range(m)]
+    for i in range(len(arr)):
+        result[i%m].append(arr[i])
+    return result
+
+
+def extract_pw(subset, save_f0=False):
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['subset'] == 'train']
+
+    for i in tqdm(subset):
+        line = meta.iloc[i]
+        audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+        f = line['file_name']
+
+        mel_dir = audio_dir.replace('vocal', 'world').replace('raw_data/', '24k_data/')
+        f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_f0/')
+
+        if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False:
+            mel, f0 = get_world_mel(os.path.join(audio_dir, f))  # get_world_mel returns (log-mel, f0)
+
+            if os.path.exists(mel_dir) is False:
+                os.makedirs(mel_dir)
+            np.save(os.path.join(mel_dir, f+'.npy'), mel)
+
+            if save_f0 is True:
+                if os.path.exists(f0_dir) is False:
+                    os.makedirs(f0_dir)
+                np.save(os.path.join(f0_dir, f + '.npy'), f0)
+
+
+if __name__ == '__main__':
+    cores = 8
+    meta = pd.read_csv('../raw_data/meta_fix.csv')
+    meta = meta[meta['subset'] == 'train']
+
+    idx_list = [i for i in range(len(meta))]
+
+    subsets = chunks(idx_list, cores)
+
+    for subset in subsets:
+        t = Process(target=extract_pw, args=(subset,))
+        t.start()
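The key trick in `get_world_mel` above is the WORLD analysis/synthesis round trip with `f0 * 0`: with every frame marked unvoiced, WORLD re-synthesizes the utterance from the spectral envelope and aperiodicity alone, so the resulting mel keeps the phonetic content but carries no pitch. A stripped-down illustration (the input path is a placeholder; the pyworld calls match the diff):

```python
import numpy as np
import librosa
import pyworld as pw

wav, sr = librosa.load('some_vocal.wav', sr=24000)        # placeholder input file
wav = wav.astype(np.float64)[: (len(wav) // 256) * 256]   # trim to a hop multiple

_f0, t = pw.dio(wav, sr, frame_period=256 / sr * 1000)    # coarse f0, one frame per hop
f0 = pw.stonemask(wav, _f0, t, sr)                        # refined f0
sp = pw.cheaptrick(wav, f0, t, sr)                        # spectral envelope
ap = pw.d4c(wav, f0, t, sr)                               # aperiodicity
flat = pw.synthesize(f0 * 0, sp, ap, sr, frame_period=256 / sr * 1000)  # pitch removed
```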
pitch_controller/dataset/__init__.py
ADDED
@@ -0,0 +1 @@
+from .diff_lpc import VCDecLPCDataset, VCDecLPCBatchCollate, VCDecLPCTest
pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (237 Bytes)

pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (311 Bytes)

pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc
ADDED
Binary file (2.85 kB)

pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc
ADDED
Binary file (2.84 kB)

pitch_controller/dataset/__pycache__/diff.cpython-310.pyc
ADDED
Binary file (5.79 kB)

pitch_controller/dataset/__pycache__/diff.cpython-39.pyc
ADDED
Binary file (5.83 kB)

pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc
ADDED
Binary file (7.03 kB)
pitch_controller/dataset/diff_lpc.py
ADDED
@@ -0,0 +1,271 @@
+import os
+import random
+import numpy as np
+import torch
+import tgt
+import pandas as pd
+
+from torch.utils.data import Dataset
+import librosa
+
+
+def f0_to_coarse(f0, hparams):
+    f0_bin = hparams['f0_bin']
+    f0_max = hparams['f0_max']
+    f0_min = hparams['f0_min']
+    is_torch = isinstance(f0, torch.Tensor)
+    # to mel scale
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
+
+    unvoiced = (f0_mel == 0)
+
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+    f0_mel[unvoiced] = 0
+
+    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
+    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+    return f0_coarse
+
+
+def log_f0(f0, hparams):
+    f0_bin = hparams['f0_bin']
+    f0_max = hparams['f0_max']
+    f0_min = hparams['f0_min']
+
+    f0_mel = np.zeros_like(f0)
+    f0_mel[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1
+    f0_mel_min = 12*np.log2(f0_min/f0_min) + 1
+    f0_mel_max = 12*np.log2(f0_max/f0_min) + 1
+
+    unvoiced = (f0_mel == 0)
+
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+    f0_mel[unvoiced] = 0
+
+    f0_coarse = np.rint(f0_mel).astype(int)
+    assert f0_coarse.max() <= (f0_bin-1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+    return f0_coarse
+
+
+# training "average voice" encoder
+class VCDecLPCDataset(Dataset):
+    def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False,
+                 f0_type='bins'):
+        self.path = data_dir
+        meta = pd.read_csv(data_dir + 'meta_fix.csv')
+        self.meta = meta[meta['subset'] == subset]
+        self.content_dir = content_dir
+        self.extract_emb = extract_emb
+        self.f0_type = f0_type
+
+    def get_vc_data(self, audio_path, mel_id):
+        mel_dir = audio_path.replace('vocal', 'mel')
+        embed_dir = audio_path.replace('vocal', 'embed')
+        pitch_dir = audio_path.replace('vocal', 'f0')
+        content_dir = audio_path.replace('vocal', self.content_dir)
+
+        mel = os.path.join(mel_dir, mel_id + '.npy')
+        embed = os.path.join(embed_dir, mel_id + '.npy')
+        pitch = os.path.join(pitch_dir, mel_id + '.npy')
+        content = os.path.join(content_dir, mel_id + '.npy')
+
+        mel = np.load(mel)
+        if self.extract_emb:
+            embed = np.load(embed)
+        else:
+            embed = np.zeros(1)
+
+        pitch = np.load(pitch)
+        content = np.load(content)
+
+        pitch = np.nan_to_num(pitch)
+        if self.f0_type == 'bins':
+            pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                         'f0_min': librosa.note_to_hz('C2'),
+                                         'f0_max': librosa.note_to_hz('C6')})
+        elif self.f0_type == 'log':
+            pitch = log_f0(pitch, {'f0_bin': 345,
+                                   'f0_min': librosa.note_to_hz('C2'),
+                                   'f0_max': librosa.note_to_hz('C#6')})
+
+        mel = torch.from_numpy(mel).float()
+        embed = torch.from_numpy(embed).float()
+        pitch = torch.from_numpy(pitch).float()
+        content = torch.from_numpy(content).float()
+
+        return (mel, embed, pitch, content)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+        mel_id = row['file_name']
+        audio_path = self.path + row['folder'] + row['subfolder']
+        mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
+        item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}
+        return item
+
+    def __len__(self):
+        return len(self.meta)
+
+
+class VCDecLPCBatchCollate(object):
+    def __init__(self, train_frames, eps=1e-5):
+        self.train_frames = train_frames
+        self.eps = eps
+
+    def __call__(self, batch):
+        train_frames = self.train_frames
+        eps = self.eps
+
+        B = len(batch)
+        embed = torch.stack([item['embed'] for item in batch], 0)
+
+        n_mels = batch[0]['mel'].shape[0]
+        content_dim = batch[0]['content'].shape[0]
+
+        # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain
+        mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
+        mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
+
+        # ! need to deal with empty frames here
+        contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * np.log(eps)
+
+        f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)
+        max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
+                      for item in batch]
+
+        starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+        starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+        mel_lengths = []
+        for i, item in enumerate(batch):
+            mel = item['mel']
+            f0 = item['f0']
+            content = item['content']
+
+            if mel.shape[-1] < train_frames:
+                mel_length = mel.shape[-1]
+            else:
+                mel_length = train_frames
+
+            mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
+            f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
+            contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]
+
+            mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
+            mel_lengths.append(mel_length)
+
+        mel_lengths = torch.LongTensor(mel_lengths)
+
+        return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
+                'embed': embed,
+                'f0_1': f0s1,
+                'content1': contents1}
+
+
+class VCDecLPCTest(Dataset):
+    def __init__(self, data_dir, subset='test', eps=1e-5, test_frames=256, content_dir='lpc_mel_512', extract_emb=False,
+                 f0_type='bins'):
+        self.path = data_dir
+        meta = pd.read_csv(data_dir + 'meta_test.csv')
+        self.meta = meta[meta['subset'] == subset]
+        self.content_dir = content_dir
+        self.extract_emb = extract_emb
+        self.eps = eps
+        self.test_frames = test_frames
+        self.f0_type = f0_type
+
+    def get_vc_data(self, audio_path, mel_id, pitch_shift):
+        mel_dir = audio_path.replace('vocal', 'mel')
+        embed_dir = audio_path.replace('vocal', 'embed')
+        pitch_dir = audio_path.replace('vocal', 'f0')
+        content_dir = audio_path.replace('vocal', self.content_dir)
+
+        mel = os.path.join(mel_dir, mel_id + '.npy')
+        embed = os.path.join(embed_dir, mel_id + '.npy')
+        pitch = os.path.join(pitch_dir, mel_id + '.npy')
+        content = os.path.join(content_dir, mel_id + '.npy')
+
+        mel = np.load(mel)
+        if self.extract_emb:
+            embed = np.load(embed)
+        else:
+            embed = np.zeros(1)
+
+        pitch = np.load(pitch)
+        content = np.load(content)
+
+        pitch = np.nan_to_num(pitch)
+        pitch = pitch*pitch_shift
+
+        if self.f0_type == 'bins':
+            pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                         'f0_min': librosa.note_to_hz('C2'),
+                                         'f0_max': librosa.note_to_hz('C6')})
+        elif self.f0_type == 'log':
+            pitch = log_f0(pitch, {'f0_bin': 345,
+                                   'f0_min': librosa.note_to_hz('C2'),
+                                   'f0_max': librosa.note_to_hz('C#6')})
+
+        mel = torch.from_numpy(mel).float()
+        embed = torch.from_numpy(embed).float()
+        pitch = torch.from_numpy(pitch).float()
+        content = torch.from_numpy(content).float()
+
+        return (mel, embed, pitch, content)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+
+        mel_id = row['content_file_name']
+        audio_path = self.path + row['content_folder'] + row['content_subfolder']
+        pitch_shift = row['pitch_shift']
+        mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+        mel_id = row['timbre_file_name']
+        audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
+        mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+        n_mels = mel1.shape[0]
+        content_dim = content.shape[0]
+
+        mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+        mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+        lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+
+        f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)
+
+        if mel1.shape[-1] < self.test_frames:
+            mel_length = mel1.shape[-1]
+        else:
+            mel_length = self.test_frames
+        mels1[:, :mel_length] = mel1[:, :mel_length]
+        f0s1[:mel_length] = f0[:mel_length]
+        lpcs1[:, :mel_length] = content[:, :mel_length]
+
+        if mel2.shape[-1] < self.test_frames:
+            mel_length = mel2.shape[-1]
+        else:
+            mel_length = self.test_frames
+        mels2[:, :mel_length] = mel2[:, :mel_length]
+
+        return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}
+
+    def __len__(self):
+        return len(self.meta)
+
+
+if __name__ == '__main__':
+    f0 = np.array([110.0, 220.0, librosa.note_to_hz('C2'), 0, librosa.note_to_hz('E3'), librosa.note_to_hz('C6')])
+    # 50 midi notes = (50-1)
+    pitch = log_f0(f0, {'f0_bin': 345,
+                        'f0_min': librosa.note_to_hz('C2'),
+                        'f0_max': librosa.note_to_hz('C#6')})
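A numeric check of the 345-bin `log_f0` quantiser above may help: with f0_min = C2 and f0_max = C#6 (49 semitones apart), the scaling works out to exactly (345 − 2) / 49 = 7 bins per semitone, with bin 0 reserved for unvoiced frames. The example values below are chosen for illustration and assume `log_f0` from the module above is in scope:

```python
import numpy as np
import librosa

f0 = np.array([0.0,                        # unvoiced frame
               librosa.note_to_hz('C2'),   # f0_min
               librosa.note_to_hz('C#2'),  # one semitone up
               librosa.note_to_hz('A4')])  # 33 semitones above C2
pitch = log_f0(f0, {'f0_bin': 345,
                    'f0_min': librosa.note_to_hz('C2'),
                    'f0_max': librosa.note_to_hz('C#6')})
print(pitch)  # [  0   1   8 232]  ->  0, 1 + 0*7, 1 + 1*7, 1 + 33*7
```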
pitch_controller/dataset/diff_lpc_content.py
ADDED
@@ -0,0 +1,231 @@
+import os
+import random
+import numpy as np
+import torch
+import tgt
+import pandas as pd
+
+from torch.utils.data import Dataset
+import librosa
+
+
+def f0_to_coarse(f0, hparams):
+    f0_bin = hparams['f0_bin']
+    f0_max = hparams['f0_max']
+    f0_min = hparams['f0_min']
+    is_torch = isinstance(f0, torch.Tensor)
+    # to mel scale
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
+
+    unvoiced = (f0_mel == 0)
+
+    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+    f0_mel[unvoiced] = 0
+
+    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
+    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+    return f0_coarse
+
+
+# training "average voice" encoder
+class VCDecLPCDataset(Dataset):
+    def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False):
+        self.path = data_dir
+        meta = pd.read_csv(data_dir + 'meta_fix.csv')
+        self.meta = meta[meta['subset'] == subset]
+        self.content_dir = content_dir
+        self.extract_emb = extract_emb
+
+    def get_vc_data(self, audio_path, mel_id):
+        mel_dir = audio_path.replace('vocal', 'mel')
+        embed_dir = audio_path.replace('vocal', 'embed')
+        pitch_dir = audio_path.replace('vocal', 'f0')
+        content_dir = audio_path.replace('vocal', self.content_dir)
+
+        mel = os.path.join(mel_dir, mel_id + '.npy')
+        embed = os.path.join(embed_dir, mel_id + '.npy')
+        pitch = os.path.join(pitch_dir, mel_id + '.npy')
+        content = os.path.join(content_dir, mel_id + '.npy')
+
+        mel = np.load(mel)
+        if self.extract_emb:
+            embed = np.load(embed)
+        else:
+            embed = np.zeros(1)
+
+        pitch = np.load(pitch)
+        content = np.load(content)
+
+        pitch = np.nan_to_num(pitch)
+        pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                     'f0_min': librosa.note_to_hz('C2'),
+                                     'f0_max': librosa.note_to_hz('C6')})
+
+        mel = torch.from_numpy(mel).float()
+        embed = torch.from_numpy(embed).float()
+        pitch = torch.from_numpy(pitch).float()
+        content = torch.from_numpy(content).float()
+
+        return (mel, embed, pitch, content)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+        mel_id = row['file_name']
+        audio_path = self.path + row['folder'] + row['subfolder']
+        mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
+        item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}
+        return item
+
+    def __len__(self):
+        return len(self.meta)
+
+
+class VCDecLPCBatchCollate(object):
+    def __init__(self, train_frames, eps=np.log(1e-5), content_eps=np.log(1e-12)):
+        self.train_frames = train_frames
+        self.eps = eps
+        self.content_eps = content_eps
+
+    def __call__(self, batch):
+        train_frames = self.train_frames
+        eps = self.eps
+        content_eps = self.content_eps
+
+        B = len(batch)
+        embed = torch.stack([item['embed'] for item in batch], 0)
+
+        n_mels = batch[0]['mel'].shape[0]
+        content_dim = batch[0]['content'].shape[0]
+
+        # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain
+        mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
+        mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
+
+        # using a different eps
+        contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * content_eps
+
+        f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)
+        max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
+                      for item in batch]
+
+        starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+        starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+        mel_lengths = []
+        for i, item in enumerate(batch):
+            mel = item['mel']
+            f0 = item['f0']
+            content = item['content']
+
+            if mel.shape[-1] < train_frames:
+                mel_length = mel.shape[-1]
+            else:
+                mel_length = train_frames
+
+            mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
+            f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
+            contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]
+
+            mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
+            mel_lengths.append(mel_length)
+
+        mel_lengths = torch.LongTensor(mel_lengths)
+
+        return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
+                'embed': embed,
+                'f0_1': f0s1,
+                'content1': contents1}
+
+
+class VCDecLPCTest(Dataset):
+    def __init__(self, data_dir, subset='test', eps=np.log(1e-5), content_eps=np.log(1e-12), test_frames=256, content_dir='lpc_mel_512', extract_emb=False):
+        self.path = data_dir
+        meta = pd.read_csv(data_dir + 'meta_test.csv')
+        self.meta = meta[meta['subset'] == subset]
+        self.content_dir = content_dir
+        self.extract_emb = extract_emb
+        self.eps = eps
+        self.content_eps = content_eps
+        self.test_frames = test_frames
+
+    def get_vc_data(self, audio_path, mel_id, pitch_shift):
+        mel_dir = audio_path.replace('vocal', 'mel')
+        embed_dir = audio_path.replace('vocal', 'embed')
+        pitch_dir = audio_path.replace('vocal', 'f0')
+        content_dir = audio_path.replace('vocal', self.content_dir)
+
+        mel = os.path.join(mel_dir, mel_id + '.npy')
+        embed = os.path.join(embed_dir, mel_id + '.npy')
+        pitch = os.path.join(pitch_dir, mel_id + '.npy')
+        content = os.path.join(content_dir, mel_id + '.npy')
+
+        mel = np.load(mel)
+        if self.extract_emb:
+            embed = np.load(embed)
+        else:
+            embed = np.zeros(1)
+
+        pitch = np.load(pitch)
+        content = np.load(content)
+
+        pitch = np.nan_to_num(pitch)
+        pitch = pitch*pitch_shift
+        pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                     'f0_min': librosa.note_to_hz('C2'),
+                                     'f0_max': librosa.note_to_hz('C6')})
+
+        mel = torch.from_numpy(mel).float()
+        embed = torch.from_numpy(embed).float()
+        pitch = torch.from_numpy(pitch).float()
+        content = torch.from_numpy(content).float()
+
+        return (mel, embed, pitch, content)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+
+        mel_id = row['content_file_name']
+        audio_path = self.path + row['content_folder'] + row['content_subfolder']
+        pitch_shift = row['pitch_shift']
+        mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+        mel_id = row['timbre_file_name']
+        audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
+        mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+        n_mels = mel1.shape[0]
+        content_dim = content.shape[0]
+
+        mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
+        mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
+        # content
+        lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * self.content_eps
+
+        f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)
+
+        if mel1.shape[-1] < self.test_frames:
+            mel_length = mel1.shape[-1]
+        else:
+            mel_length = self.test_frames
+        mels1[:, :mel_length] = mel1[:, :mel_length]
+        f0s1[:mel_length] = f0[:mel_length]
+        lpcs1[:, :mel_length] = content[:, :mel_length]
+
+        if mel2.shape[-1] < self.test_frames:
+            mel_length = mel2.shape[-1]
+        else:
+            mel_length = self.test_frames
+        mels2[:, :mel_length] = mel2[:, :mel_length]
+
+        return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}
+
+    def __len__(self):
+        return len(self.meta)
+
+
+
pitch_controller/load_vocoder.py
ADDED
@@ -0,0 +1,51 @@
+# from nsf_hifigan.models import load_model
+from modules.BigVGAN.inference import load_model
+import librosa
+
+import torch
+import torch.nn.functional as F
+import torchaudio
+import torchaudio.transforms as transforms
+
+import numpy as np
+import soundfile as sf
+
+
+class LogMelSpectrogram(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.melspctrogram = transforms.MelSpectrogram(
+            sample_rate=22050,
+            n_fft=1024,
+            win_length=1024,
+            hop_length=256,
+            center=False,
+            power=1.0,
+            norm="slaney",
+            n_mels=80,
+            mel_scale="slaney",
+            f_max=8000,
+            f_min=0,
+        )
+
+    def forward(self, wav):
+        wav = F.pad(wav, ((1024 - 256) // 2, (1024 - 256) // 2), "reflect")
+        mel = self.melspctrogram(wav)
+        logmel = torch.log(torch.clamp(mel, min=1e-5))
+        return logmel
+
+
+hifigan, cfg = load_model('modules/BigVGAN/ckpt/bigvgan_22khz_80band/g_05000000', device='cuda')
+M = LogMelSpectrogram()
+
+source, sr = torchaudio.load("music.mp3")
+source = torchaudio.functional.resample(source, sr, 22050)
+source = source.unsqueeze(0)
+mel = M(source).squeeze(0)
+
+# f0, f0_bin = get_pitch("116_1_pred.wav")
+# f0 = torch.tensor(f0).unsqueeze(0)
+with torch.no_grad():
+    y_hat = hifigan(mel.cuda()).cpu().numpy().squeeze(1)
+
+sf.write('test.wav', y_hat[0], samplerate=22050)
pitch_controller/models/__pycache__/base.cpython-310.pyc
ADDED
Binary file (1.17 kB)

pitch_controller/models/__pycache__/base.cpython-39.pyc
ADDED
Binary file (1.14 kB)

pitch_controller/models/__pycache__/modules.cpython-310.pyc
ADDED
Binary file (8.26 kB)

pitch_controller/models/__pycache__/modules.cpython-39.pyc
ADDED
Binary file (8.45 kB)

pitch_controller/models/__pycache__/pitch.cpython-39.pyc
ADDED
Binary file (1.1 kB)

pitch_controller/models/__pycache__/unet.cpython-310.pyc
ADDED
Binary file (3.56 kB)

pitch_controller/models/__pycache__/unet.cpython-39.pyc
ADDED
Binary file (3.48 kB)

pitch_controller/models/__pycache__/update_unet.cpython-310.pyc
ADDED
Binary file (3.69 kB)

pitch_controller/models/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (3.99 kB)

pitch_controller/models/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (3.98 kB)
pitch_controller/models/base.py
ADDED
@@ -0,0 +1,30 @@
+# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# MIT License for more details.
+
+import numpy as np
+import torch
+
+
+class BaseModule(torch.nn.Module):
+    def __init__(self):
+        super(BaseModule, self).__init__()
+
+    @property
+    def nparams(self):
+        num_params = 0
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                num_params += np.prod(param.detach().cpu().numpy().shape)
+        return num_params
+
+    def relocate_input(self, x: list):
+        device = next(self.parameters()).device
+        for i in range(len(x)):
+            if isinstance(x[i], torch.Tensor) and x[i].device != device:
+                x[i] = x[i].to(device)
+        return x
pitch_controller/models/modules.py
ADDED
@@ -0,0 +1,237 @@
# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.

import math
import torch
from einops import rearrange

from .base import BaseModule


class Mish(BaseModule):
    def forward(self, x):
        return x * torch.tanh(torch.nn.functional.softplus(x))


class Upsample(BaseModule):
    def __init__(self, dim):
        super(Upsample, self).__init__()
        self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1)

    def forward(self, x):
        return self.conv(x)


class Downsample(BaseModule):
    def __init__(self, dim):
        super(Downsample, self).__init__()
        self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1)

    def forward(self, x):
        return self.conv(x)


class Rezero(BaseModule):
    def __init__(self, fn):
        super(Rezero, self).__init__()
        self.fn = fn
        self.g = torch.nn.Parameter(torch.zeros(1))

    def forward(self, x):
        return self.fn(x) * self.g


class Block(BaseModule):
    def __init__(self, dim, dim_out, groups=8):
        super(Block, self).__init__()
        self.block = torch.nn.Sequential(
            torch.nn.Conv2d(dim, dim_out, 3, padding=1),
            torch.nn.GroupNorm(groups, dim_out),
            Mish())

    def forward(self, x):
        output = self.block(x)
        return output


class ResnetBlock(BaseModule):
    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
        super(ResnetBlock, self).__init__()
        self.mlp = torch.nn.Sequential(Mish(),
                                       torch.nn.Linear(time_emb_dim, dim_out))

        self.block1 = Block(dim, dim_out, groups=groups)
        self.block2 = Block(dim_out, dim_out, groups=groups)
        if dim != dim_out:
            self.res_conv = torch.nn.Conv2d(dim, dim_out, 1)
        else:
            self.res_conv = torch.nn.Identity()

    def forward(self, x, time_emb):
        h = self.block1(x)
        h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
        h = self.block2(h)
        output = h + self.res_conv(x)
        return output


class LinearAttention(BaseModule):
    def __init__(self, dim, heads=4, dim_head=32, q_norm=True):
        super(LinearAttention, self).__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1)
        self.q_norm = q_norm

    def forward(self, x):
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)',
                            heads=self.heads, qkv=3)
        k = k.softmax(dim=-1)
        if self.q_norm:
            q = q.softmax(dim=-2)

        context = torch.einsum('bhdn,bhen->bhde', k, v)
        out = torch.einsum('bhde,bhdn->bhen', context, q)
        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w',
                        heads=self.heads, h=h, w=w)
        return self.to_out(out)


class Residual(BaseModule):
    def __init__(self, fn):
        super(Residual, self).__init__()
        self.fn = fn

    def forward(self, x, *args, **kwargs):
        output = self.fn(x, *args, **kwargs) + x
        return output


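These are standard diffusion-UNet building blocks: `ResnetBlock` adds the time embedding to the feature map channel-wise, and wrapping `LinearAttention` in `Rezero` and `Residual` makes the attention branch start out as the identity (its gain `g` is initialized to zero). A shape sanity check with arbitrary dimensions, added for illustration; the file continues below.

```python
import torch
from pitch_controller.models.modules import ResnetBlock, LinearAttention, Residual, Rezero

x = torch.randn(2, 32, 80, 128)     # [batch, channels, mel bins, frames]
t_emb = torch.randn(2, 64)          # time embedding; dim matches time_emb_dim

block = ResnetBlock(dim=32, dim_out=48, time_emb_dim=64)
attn = Residual(Rezero(LinearAttention(48)))

h = block(x, t_emb)                 # -> [2, 48, 80, 128]
h = attn(h)                         # same shape; acts as identity while g == 0
print(h.shape)
```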
def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """
    This matches the implementation in Denoising Diffusion Probabilistic Models:
    create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
    :param embedding_dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    exponent = -math.log(max_period) * torch.arange(
        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
    )
    exponent = exponent / (half_dim - downscale_freq_shift)

    emb = torch.exp(exponent)
    emb = timesteps[:, None].float() * emb[None, :]

    # scale embeddings
    emb = scale * emb

    # concat sine and cosine embeddings
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)

    # flip sine and cosine embeddings
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)

    # zero pad
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb


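`get_timestep_embedding` builds the classic DDPM sinusoidal embedding: half the vector holds sines and half cosines of the timestep at geometrically spaced frequencies between 1 and roughly 1/max_period. A quick check, added for illustration:

```python
import torch
from pitch_controller.models.modules import get_timestep_embedding

t = torch.tensor([0, 1, 500])
emb = get_timestep_embedding(t, embedding_dim=128, downscale_freq_shift=0)
print(emb.shape)                        # torch.Size([3, 128]): 64 sines, then 64 cosines
print(emb[0, :64].abs().max().item())   # 0.0 -- sin(0) at every frequency
print(emb[0, 64:].min().item())         # 1.0 -- cos(0) at every frequency
```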
class Timesteps(BaseModule):
    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, timesteps):
        t_emb = get_timestep_embedding(
            timesteps,
            self.num_channels,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
        )
        return t_emb


class PitchPosEmb(BaseModule):
    def __init__(self, dim, flip_sin_to_cos=False, downscale_freq_shift=0):
        super(PitchPosEmb, self).__init__()
        self.dim = dim
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, x):
        # B * L
        b, l = x.shape
        x = rearrange(x, 'b l -> (b l)')
        emb = get_timestep_embedding(
            x,
            self.dim,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
        )
        emb = rearrange(emb, '(b l) d -> b d l', b=b, l=l)
        return emb


class TimbreBlock(BaseModule):
    def __init__(self, out_dim):
        super(TimbreBlock, self).__init__()
        base_dim = out_dim // 4

        self.block11 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 2 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(2 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.block12 = torch.nn.Sequential(
            torch.nn.Conv2d(base_dim, 2 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(2 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.block21 = torch.nn.Sequential(
            torch.nn.Conv2d(base_dim, 4 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(4 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.block22 = torch.nn.Sequential(
            torch.nn.Conv2d(2 * base_dim, 4 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(4 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.block31 = torch.nn.Sequential(
            torch.nn.Conv2d(2 * base_dim, 8 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(8 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.block32 = torch.nn.Sequential(
            torch.nn.Conv2d(4 * base_dim, 8 * base_dim, 3, 1, 1),
            torch.nn.InstanceNorm2d(8 * base_dim, affine=True),
            torch.nn.GLU(dim=1))
        self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1)

    def forward(self, x):
        y = self.block11(x)
        y = self.block12(y)
        y = self.block21(y)
        y = self.block22(y)
        y = self.block31(y)
        y = self.block32(y)
        y = self.final_conv(y)

        return y.sum((2, 3)) / (y.shape[2] * y.shape[3])
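`PitchPosEmb` applies the same sinusoidal embedding to every frame of an F0 sequence, and `TimbreBlock` squeezes a reference mel spectrogram into a fixed-size timbre vector by global average pooling after the GLU conv stack. A shape sketch with arbitrary dimensions, added for illustration:

```python
import torch
from pitch_controller.models.modules import PitchPosEmb, TimbreBlock

f0 = torch.randint(0, 80, (2, 128)).float()   # per-frame pitch, e.g. as mel-bin indices
pos = PitchPosEmb(dim=64)
print(pos(f0).shape)                          # torch.Size([2, 64, 128])

ref = torch.randn(2, 1, 80, 128)              # reference mel with a channel axis
timbre = TimbreBlock(out_dim=64)
print(timbre(ref).shape)                      # torch.Size([2, 64])
```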
pitch_controller/models/unet.py
ADDED
@@ -0,0 +1,153 @@
import math
import torch

from .base import BaseModule
from .modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock
from .modules import LinearAttention, Residual, Timesteps, TimbreBlock, PitchPosEmb

from einops import rearrange


class UNetPitcher(BaseModule):
    def __init__(self,
                 dim_base,
                 dim_cond,
                 use_ref_t,
                 use_embed,
                 dim_embed=256,
                 dim_mults=(1, 2, 4),
                 pitch_type='bins'):

        super(UNetPitcher, self).__init__()
        self.use_ref_t = use_ref_t
        self.use_embed = use_embed
        self.pitch_type = pitch_type

        dim_in = 2

        # time embedding
        self.time_pos_emb = Timesteps(num_channels=dim_base,
                                      flip_sin_to_cos=True,
                                      downscale_freq_shift=0)

        self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4),
                                       Mish(), torch.nn.Linear(dim_base * 4, dim_base))

        # speaker embedding
        timbre_total = 0
        if use_ref_t:
            self.ref_block = TimbreBlock(out_dim=dim_cond)
            timbre_total += dim_cond
        if use_embed:
            timbre_total += dim_embed

        if timbre_total != 0:
            self.timbre_block = torch.nn.Sequential(
                torch.nn.Linear(timbre_total, 4 * dim_cond),
                Mish(),
                torch.nn.Linear(4 * dim_cond, dim_cond))

        if use_embed or use_ref_t:
            dim_in += dim_cond

        self.pitch_pos_emb = PitchPosEmb(dim_cond)
        self.pitch_mlp = torch.nn.Sequential(
            torch.nn.Conv1d(dim_cond, dim_cond * 4, 1, stride=1),
            Mish(),
            torch.nn.Conv1d(dim_cond * 4, dim_cond, 1, stride=1))
        dim_in += dim_cond

        # pitch embedding
        # if self.pitch_type == 'bins':
        #     print('using mel bins for f0')
        # elif self.pitch_type == 'log':
        #     print('using log bins f0')

        dims = [dim_in, *map(lambda m: dim_base * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))
        # blocks
        self.downs = torch.nn.ModuleList([])
        self.ups = torch.nn.ModuleList([])
        num_resolutions = len(in_out)

        for ind, (dim_in, dim_out) in enumerate(in_out):
            is_last = ind >= (num_resolutions - 1)
            self.downs.append(torch.nn.ModuleList([
                ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base),
                ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base),
                Residual(Rezero(LinearAttention(dim_out))),
                Downsample(dim_out) if not is_last else torch.nn.Identity()]))

        mid_dim = dims[-1]
        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base)
        self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base)

        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
            self.ups.append(torch.nn.ModuleList([
                ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base),
                ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base),
                Residual(Rezero(LinearAttention(dim_in))),
                Upsample(dim_in)]))
        self.final_block = Block(dim_base, dim_base)
        self.final_conv = torch.nn.Conv2d(dim_base, 1, 1)

    def forward(self, x, mean, f0, t, ref=None, embed=None):
        if not torch.is_tensor(t):
            t = torch.tensor([t], dtype=torch.long, device=x.device)
        if len(t.shape) == 0:
            t = t * torch.ones(x.shape[0], dtype=t.dtype, device=x.device)

        t = self.time_pos_emb(t)
        t = self.mlp(t)

        x = torch.stack([x, mean], 1)

        f0 = self.pitch_pos_emb(f0)
        f0 = self.pitch_mlp(f0)
        f0 = f0.unsqueeze(2)
        f0 = torch.cat(x.shape[2] * [f0], 2)

        timbre = None
        if self.use_ref_t:
            ref = torch.stack([ref], 1)
            timbre = self.ref_block(ref)
        if self.use_embed:
            if timbre is not None:
                timbre = torch.cat([timbre, embed], 1)
            else:
                timbre = embed
        if timbre is None:
            # raise Exception("at least use one timbre condition")
            condition = f0
        else:
            timbre = self.timbre_block(timbre).unsqueeze(-1).unsqueeze(-1)
            timbre = torch.cat(x.shape[2] * [timbre], 2)
            timbre = torch.cat(x.shape[3] * [timbre], 3)
            condition = torch.cat([f0, timbre], 1)

        x = torch.cat([x, condition], 1)

        hiddens = []
        for resnet1, resnet2, attn, downsample in self.downs:
            x = resnet1(x, t)
            x = resnet2(x, t)
            x = attn(x)
            hiddens.append(x)
            x = downsample(x)

        x = self.mid_block1(x, t)
        x = self.mid_attn(x)
        x = self.mid_block2(x, t)

        for resnet1, resnet2, attn, upsample in self.ups:
            x = torch.cat((x, hiddens.pop()), dim=1)
            x = resnet1(x, t)
            x = resnet2(x, t)
            x = attn(x)
            x = upsample(x)

        x = self.final_block(x)
        output = self.final_conv(x)

        return output.squeeze(1)
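`UNetPitcher` stacks the noisy mel with the conditioning mean, concatenates the broadcast pitch (and optional timbre) embeddings as extra channels, and runs a standard down/mid/up UNet over the result. A dummy forward pass, added for illustration; the dimensions are arbitrary rather than the commit's training configuration, and with two downsamplings the mel-bin and frame counts must be divisible by 4:

```python
import torch
from pitch_controller.models.unet import UNetPitcher

model = UNetPitcher(dim_base=64, dim_cond=64, use_ref_t=False, use_embed=False)

x = torch.randn(2, 80, 128)                    # noisy mel
mean = torch.randn(2, 80, 128)                 # conditioning mel
f0 = torch.randint(0, 80, (2, 128)).float()    # frame-level pitch, as bin indices
t = torch.rand(2)                              # diffusion time per batch element

out = model(x, mean, f0, t)
print(out.shape)                               # torch.Size([2, 80, 128])
```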
pitch_controller/models/utils.py
ADDED
@@ -0,0 +1,110 @@
# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.

import torch
import torchaudio
import numpy as np
from librosa.filters import mel as librosa_mel_fn

from .base import BaseModule


def mse_loss(x, y, mask, n_feats):
    loss = torch.sum(((x - y)**2) * mask)
    return loss / (torch.sum(mask) * n_feats)


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(int(max_length), dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def fix_len_compatibility(length, num_downsamplings_in_unet=2):
    while True:
        if length % (2**num_downsamplings_in_unet) == 0:
            return length
        length += 1


class PseudoInversion(BaseModule):
    def __init__(self, n_mels, sampling_rate, n_fft):
        super(PseudoInversion, self).__init__()
        self.n_mels = n_mels
        self.sampling_rate = sampling_rate
        self.n_fft = n_fft
        mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=0, fmax=8000)
        mel_basis_inverse = np.linalg.pinv(mel_basis)
        mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float()
        self.register_buffer("mel_basis_inverse", mel_basis_inverse)

    def forward(self, log_mel_spectrogram):
        mel_spectrogram = torch.exp(log_mel_spectrogram)
        stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram)
        return stftm


class InitialReconstruction(BaseModule):
    def __init__(self, n_fft, hop_size):
        super(InitialReconstruction, self).__init__()
        self.n_fft = n_fft
        self.hop_size = hop_size
        window = torch.hann_window(n_fft).float()
        self.register_buffer("window", window)

    def forward(self, stftm):
        real_part = torch.ones_like(stftm, device=stftm.device)
        imag_part = torch.zeros_like(stftm, device=stftm.device)
        stft = torch.stack([real_part, imag_part], -1) * stftm.unsqueeze(-1)
        istft = torch.istft(stft, n_fft=self.n_fft,
                            hop_length=self.hop_size, win_length=self.n_fft,
                            window=self.window, center=True)
        return istft.unsqueeze(1)


# Fast Griffin-Lim algorithm as a PyTorch module
class FastGL(BaseModule):
    def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99):
        super(FastGL, self).__init__()
        self.n_mels = n_mels
        self.sampling_rate = sampling_rate
        self.n_fft = n_fft
        self.hop_size = hop_size
        self.momentum = momentum
        self.pi = PseudoInversion(n_mels, sampling_rate, n_fft)
        self.ir = InitialReconstruction(n_fft, hop_size)
        window = torch.hann_window(n_fft).float()
        self.register_buffer("window", window)

    @torch.no_grad()
    def forward(self, s, n_iters=32):
        c = self.pi(s)
        x = self.ir(c)
        x = x.squeeze(1)
        c = c.unsqueeze(-1)
        prev_angles = torch.zeros_like(c, device=c.device)
        for _ in range(n_iters):
            s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size,
                           win_length=self.n_fft, window=self.window,
                           center=True)
            real_part, imag_part = s.unbind(-1)
            stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8))
            angles = s / stftm.unsqueeze(-1)
            s = c * (angles + self.momentum * (angles - prev_angles))
            x = torch.istft(s, n_fft=self.n_fft, hop_length=self.hop_size,
                            win_length=self.n_fft, window=self.window,
                            center=True)
            prev_angles = angles
        return x.unsqueeze(1)
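`FastGL` gives a vocoder-free way to audition a log-mel spectrogram: `PseudoInversion` maps it back to an STFT magnitude via the pseudo-inverse of the mel filterbank, and momentum-accelerated Griffin-Lim iterations recover the phase. A usage sketch, added for illustration; the parameters are assumptions matching the 80-band setup used elsewhere in this repo, and the real-valued `[..., 2]` STFT layout above requires a PyTorch version from before `torch.stft` began demanding `return_complex=True`:

```python
import torch
from pitch_controller.models.utils import FastGL

gl = FastGL(n_mels=80, sampling_rate=22050, n_fft=1024, hop_size=256)

log_mel = torch.randn(1, 80, 128)   # [batch, n_mels, frames], log scale
wav = gl(log_mel, n_iters=32)       # -> [batch, 1, samples]
print(wav.shape)
```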
pitch_controller/modules/BigVGAN/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 NVIDIA CORPORATION.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
pitch_controller/modules/BigVGAN/README.md
ADDED
@@ -0,0 +1,95 @@
## BigVGAN: A Universal Neural Vocoder with Large-Scale Training
#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon

<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>


### [Paper](https://arxiv.org/abs/2206.04658)
### [Audio demo](https://bigvgan-demo.github.io/)

## Installation
Clone the repository and install the dependencies.
```shell
# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries
git clone https://github.com/NVIDIA/BigVGAN
pip install -r requirements.txt
```

Create symbolic links to the root of the dataset. The codebase uses filelists with paths relative to the dataset root. Below are example commands for the LibriTTS dataset.
```shell
cd LibriTTS && \
ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \
ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \
ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \
ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \
ln -s /path/to/your/LibriTTS/dev-other dev-other && \
ln -s /path/to/your/LibriTTS/test-clean test-clean && \
ln -s /path/to/your/LibriTTS/test-other test-other && \
cd ..
```

## Training
Train the BigVGAN model. Below is an example command for training BigVGAN on the LibriTTS dataset at 24 kHz with a full 100-band mel spectrogram as input.
```shell
python train.py \
--config configs/bigvgan_24khz_100band.json \
--input_wavs_dir LibriTTS \
--input_training_file LibriTTS/train-full.txt \
--input_validation_file LibriTTS/val-full.txt \
--list_input_unseen_wavs_dir LibriTTS LibriTTS \
--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \
--checkpoint_path exp/bigvgan
```

## Synthesis
Synthesize from a trained BigVGAN model. Below is an example command for generating audio with the model.
It computes mel spectrograms from the wav files in `--input_wavs_dir` and saves the generated audio to `--output_dir`.
```shell
python inference.py \
--checkpoint_file exp/bigvgan/g_05000000 \
--input_wavs_dir /path/to/your/input_wav \
--output_dir /path/to/your/output_wav
```

`inference_e2e.py` supports synthesis directly from mel spectrograms saved in `.npy` format, with shape `[1, channel, frame]` or `[channel, frame]`.
It loads the mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`.

Make sure that the STFT hyperparameters used to compute the mel spectrograms match those of the model, as defined in the `config.json` of the corresponding checkpoint.
```shell
python inference_e2e.py \
--checkpoint_file exp/bigvgan/g_05000000 \
--input_mels_dir /path/to/your/input_mel \
--output_dir /path/to/your/output_wav
```

## Pretrained Models
We provide [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq).
You can download the generator (e.g., g_05000000) and discriminator (e.g., do_05000000) checkpoints from the listed folders.

|Folder Name|Sampling Rate|Mel bands|fmax (Hz)|Params|Dataset|Fine-Tuned|
|------|---|---|---|---|------|---|
|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No|
|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No|
|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No|
|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No|

The results in the paper are based on the 24 kHz BigVGAN models trained on LibriTTS.
We also provide 22 kHz BigVGAN models with a band-limited setup (i.e., fmax=8000) for TTS applications.
Note that the latest checkpoints use the ``snakebeta`` activation with log-scale parameterization, which gives the best overall quality.


## TODO

The current codebase only provides a plain PyTorch implementation of the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future.


## References
* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator)

* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation)

* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing)

* [Julius](https://github.com/adefossez/julius) (for low-pass filter)

* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator)
pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc
ADDED
Binary file (845 Bytes)

pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc
ADDED
Binary file (1.11 kB)