jerryhai committed on
Commit
90f7c1e
1 Parent(s): caf7184

Track binary files with Git LFS

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .ipynb_checkpoints/requirements-checkpoint.txt +17 -0
  2. .ipynb_checkpoints/score_based_apc-checkpoint.py +159 -0
  3. .ipynb_checkpoints/template_based_apc-checkpoint.py +89 -0
  4. README.md +86 -3
  5. examples/off-key.wav +0 -0
  6. examples/reference.wav +0 -0
  7. examples/score_midi.midi +0 -0
  8. examples/score_midi.npy +3 -0
  9. examples/score_vocal.wav +0 -0
  10. output_score.wav +0 -0
  11. output_template.wav +0 -0
  12. pitch_controller/README.md +1 -0
  13. pitch_controller/__pycache__/utils.cpython-310.pyc +0 -0
  14. pitch_controller/config/DiffWorld_24k.yaml +24 -0
  15. pitch_controller/data/example/f0/p225_001.wav.npy +3 -0
  16. pitch_controller/data/example/mel/p225_001.wav.npy +3 -0
  17. pitch_controller/data/example/wav/p225_001.wav +0 -0
  18. pitch_controller/data/example/world/p225_001.wav.npy +3 -0
  19. pitch_controller/data/prepare_f0.py +66 -0
  20. pitch_controller/data/prepare_mel.py +72 -0
  21. pitch_controller/data/prepare_world.py +85 -0
  22. pitch_controller/dataset/__init__.py +1 -0
  23. pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc +0 -0
  24. pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc +0 -0
  25. pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc +0 -0
  26. pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc +0 -0
  27. pitch_controller/dataset/__pycache__/diff.cpython-310.pyc +0 -0
  28. pitch_controller/dataset/__pycache__/diff.cpython-39.pyc +0 -0
  29. pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc +0 -0
  30. pitch_controller/dataset/diff_lpc.py +271 -0
  31. pitch_controller/dataset/diff_lpc_content.py +231 -0
  32. pitch_controller/load_vocoder.py +51 -0
  33. pitch_controller/models/__pycache__/base.cpython-310.pyc +0 -0
  34. pitch_controller/models/__pycache__/base.cpython-39.pyc +0 -0
  35. pitch_controller/models/__pycache__/modules.cpython-310.pyc +0 -0
  36. pitch_controller/models/__pycache__/modules.cpython-39.pyc +0 -0
  37. pitch_controller/models/__pycache__/pitch.cpython-39.pyc +0 -0
  38. pitch_controller/models/__pycache__/unet.cpython-310.pyc +0 -0
  39. pitch_controller/models/__pycache__/unet.cpython-39.pyc +0 -0
  40. pitch_controller/models/__pycache__/update_unet.cpython-310.pyc +0 -0
  41. pitch_controller/models/__pycache__/utils.cpython-310.pyc +0 -0
  42. pitch_controller/models/__pycache__/utils.cpython-39.pyc +0 -0
  43. pitch_controller/models/base.py +30 -0
  44. pitch_controller/models/modules.py +237 -0
  45. pitch_controller/models/unet.py +153 -0
  46. pitch_controller/models/utils.py +110 -0
  47. pitch_controller/modules/BigVGAN/LICENSE +21 -0
  48. pitch_controller/modules/BigVGAN/README.md +95 -0
  49. pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc +0 -0
  50. pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc +0 -0
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
+ diffusers
+ einops
+ fastdtw
+ librosa
+ matplotlib
+ music21
+ numpy
+ pandas
+ pretty_midi
+ pysptk
+ pyworld
+ scipy
+ soundfile
+ tgt
+ torch
+ torchaudio
+ tqdm
.ipynb_checkpoints/score_based_apc-checkpoint.py ADDED
@@ -0,0 +1,159 @@
+ import os.path
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ import yaml
+ import librosa
+ import soundfile as sf
+ from tqdm import tqdm
+
+ from diffusers import DDIMScheduler
+ from pitch_controller.models.unet import UNetPitcher
+ from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
+ from pitch_controller.modules.BigVGAN.inference import load_model
+ from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0
+ from pitch_predictor.models.transformer import PitchFormer
+ import pretty_midi
+
+
+ def prepare_midi_wav(wav_id, midi_id, sr=24000):
+     midi = pretty_midi.PrettyMIDI(midi_id)
+     roll = midi.get_piano_roll()
+     roll = np.pad(roll, ((0, 0), (0, 1000)), constant_values=0)
+     roll[roll > 0] = 100
+
+     onset = midi.get_onsets()
+     before_onset = list(np.round(onset * 100 - 1).astype(int))
+     roll[:, before_onset] = 0
+
+     wav, sr = librosa.load(wav_id, sr=sr)
+
+     start = 0
+     end = round(100 * len(wav) / sr) / 100
+     # save audio
+     wav_seg = wav[round(start * sr):round(end * sr)]
+     cur_roll = roll[:, round(100 * start):round(100 * end)]
+     return wav_seg, cur_roll
+
+
+ def algin_mapping(content, target_len):
+     # align content with mel
+     src_len = content.shape[-1]
+     target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device)
+     temp = torch.arange(src_len+1) * target_len / src_len
+
+     for i in range(target_len):
+         cur_idx = torch.argmin(torch.abs(temp-i))
+         target[:, i] = content[:, cur_idx]
+     return target
+
+
+ def midi_to_hz(midi):
+     idx = torch.zeros(midi.shape[-1])
+     for frame in range(midi.shape[-1]):
+         midi_frame = midi[:, frame]
+         non_zero = midi_frame.nonzero()
+         if len(non_zero) != 0:
+             hz = librosa.midi_to_hz(non_zero[0])
+             idx[frame] = torch.tensor(hz)
+     return idx
+
+
+ @torch.no_grad()
+ def score_pitcher(source, pitch_ref, model, hifigan, pitcher, steps=50, shift_semi=0, mask_with_source=False):
+     wav, midi = prepare_midi_wav(source, pitch_ref, sr=sr)
+
+     source_mel = get_world_mel(None, sr=sr, wav=wav)
+
+     midi = torch.tensor(midi, dtype=torch.float32)
+     midi = algin_mapping(midi, source_mel.shape[-1])
+     midi = midi_to_hz(midi)
+
+     f0_ori = np.nan_to_num(get_f0(source))
+
+     source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
+     f0_ori = torch.from_numpy(f0_ori).float().unsqueeze(0).to(device)
+     midi = midi.unsqueeze(0).to(device)
+
+     f0_pred = pitcher(sp=source_mel, midi=midi)
+     if mask_with_source:
+         # mask unvoiced frames based on original pitch estimation
+         f0_pred[f0_ori == 0] = 0
+     f0_pred = f0_pred.cpu().numpy()[0]
+     # limit range
+     f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0
+     f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6')
+
+     f0_pred = f0_pred * (2 ** (shift_semi / 12))
+
+     f0_pred = log_f0(f0_pred, {'f0_bin': 345,
+                                'f0_min': librosa.note_to_hz('C2'),
+                                'f0_max': librosa.note_to_hz('C#6')})
+     f0_pred = torch.from_numpy(f0_pred).float().unsqueeze(0).to(device)
+
+     noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
+     generator = torch.Generator(device=device).manual_seed(2024)
+
+     noise_scheduler.set_timesteps(steps)
+     noise = torch.randn(source_mel.shape, generator=generator, device=device)
+     pred = noise
+     source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)
+
+     for t in tqdm(noise_scheduler.timesteps):
+         pred = noise_scheduler.scale_model_input(pred, t)
+         model_output = model(x=pred, mean=source_x, f0=f0_pred, t=t, ref=None, embed=None)
+         pred = noise_scheduler.step(model_output=model_output,
+                                     timestep=t,
+                                     sample=pred,
+                                     eta=1, generator=generator).prev_sample
+
+     pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)
+
+     pred_audio = hifigan(pred)
+     pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)
+
+     return pred_audio
+
+
+ if __name__ == '__main__':
+     min_mel = np.log(1e-5)
+     max_mel = 2.5
+     sr = 24000
+
+     use_gpu = torch.cuda.is_available()
+     device = 'cuda' if use_gpu else 'cpu'
+
+     # load diffusion model
+     config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader)
+     mel_cfg = config['logmel']
+     ddpm_cfg = config['ddpm']
+     unet_cfg = config['unet']
+     model = UNetPitcher(**unet_cfg)
+     unet_path = 'ckpts/world_fixed_40.pt'
+
+     state_dict = torch.load(unet_path)
+     for key in list(state_dict.keys()):
+         state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
+     model.load_state_dict(state_dict)
+     if use_gpu:
+         model.cuda()
+     model.eval()
+
+     # load vocoder
+     hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
+     hifigan, cfg = load_model(hifi_path, device=device)
+     hifigan.eval()
+
+     # load pitch predictor
+     pitcher = PitchFormer(100, 512).to(device)
+     ckpt = torch.load('ckpts/ckpt_transformer_pitch/transformer_pitch_360.pt')
+     pitcher.load_state_dict(ckpt)
+     pitcher.eval()
+
+     pred_audio = score_pitcher('examples/score_vocal.wav', 'examples/score_midi.midi', model, hifigan, pitcher, steps=50)
+     sf.write('output_score.wav', pred_audio, samplerate=sr)
.ipynb_checkpoints/template_based_apc-checkpoint.py ADDED
@@ -0,0 +1,89 @@
+ import os.path
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ import yaml
+ import librosa
+ import soundfile as sf
+ from tqdm import tqdm
+
+ from diffusers import DDIMScheduler
+ from pitch_controller.models.unet import UNetPitcher
+ from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff
+ from pitch_controller.modules.BigVGAN.inference import load_model
+ from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0
+
+
+ @torch.no_grad()
+ def template_pitcher(source, pitch_ref, model, hifigan, steps=50, shift_semi=0):
+
+     source_mel = get_world_mel(source, sr=sr)
+
+     f0_ref = get_matched_f0(source, pitch_ref, 'world')
+     f0_ref = f0_ref * 2 ** (shift_semi / 12)
+
+     f0_ref = log_f0(f0_ref, {'f0_bin': 345,
+                              'f0_min': librosa.note_to_hz('C2'),
+                              'f0_max': librosa.note_to_hz('C#6')})
+
+     source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device)
+     f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device)
+
+     noise_scheduler = DDIMScheduler(num_train_timesteps=1000)
+     generator = torch.Generator(device=device).manual_seed(2024)
+
+     noise_scheduler.set_timesteps(steps)
+     noise = torch.randn(source_mel.shape, generator=generator, device=device)
+     pred = noise
+     source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel)
+
+     for t in tqdm(noise_scheduler.timesteps):
+         pred = noise_scheduler.scale_model_input(pred, t)
+         model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None)
+         pred = noise_scheduler.step(model_output=model_output,
+                                     timestep=t,
+                                     sample=pred,
+                                     eta=1, generator=generator).prev_sample
+
+     pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel)
+
+     pred_audio = hifigan(pred)
+     pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1)
+
+     return pred_audio
+
+
+ if __name__ == '__main__':
+     min_mel = np.log(1e-5)
+     max_mel = 2.5
+     sr = 24000
+
+     use_gpu = torch.cuda.is_available()
+     device = 'cuda' if use_gpu else 'cpu'
+
+     # load diffusion model
+     config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader)
+     mel_cfg = config['logmel']
+     ddpm_cfg = config['ddpm']
+     unet_cfg = config['unet']
+     model = UNetPitcher(**unet_cfg)
+     unet_path = 'ckpts/world_fixed_40.pt'
+
+     state_dict = torch.load(unet_path)
+     for key in list(state_dict.keys()):
+         state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key)
+     model.load_state_dict(state_dict)
+     if use_gpu:
+         model.cuda()
+     model.eval()
+
+     # load vocoder
+     hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt'
+     hifigan, cfg = load_model(hifi_path, device=device)
+     hifigan.eval()
+
+     pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav', model, hifigan, steps=50, shift_semi=0)
+     sf.write('output_template.wav', pred_audio, samplerate=sr)
README.md CHANGED
@@ -1,3 +1,86 @@
- ---
- license: mit
- ---
+ <img src="img/cover.png">
+
+ # Diff-Pitcher (PyTorch)
+
+ Official PyTorch implementation of [Diff-Pitcher: Diffusion-based Singing Voice Pitch Correction](https://engineering.jhu.edu/lcap/data/uploads/pdfs/waspaa2023_hai.pdf)
+
+ --------------------
+
+ Thank you all for your interest in this research project. I am currently optimizing the model's performance and computational efficiency. I plan to release a user-friendly version, either a GUI or a VST plugin, in the first half of this year, and will update the open-source license.
+
+ If you are familiar with PyTorch, you can follow the [Code Examples](#examples) to use Diff-Pitcher.
+
+ --------------------
+
+ Diff-Pitcher
+
+ - [Demo Page](#demo)
+ - [Todo List](#todo)
+ - [Code Examples](#examples)
+ - [References](#references)
+ - [Acknowledgement](#acknowledgement)
+
+ ## Demo
+
+ 🎵 Listen to [examples](https://jhu-lcap.github.io/Diff-Pitcher/)
+
+ ## Todo
+ - [x] Update code and demo
+ - [x] Support 🤗 [Diffusers](https://github.com/huggingface/diffusers)
+ - [x] Upload checkpoints
+ - [x] Pipeline tutorial
+ - [ ] Merge into [Your-Stable-Audio](https://github.com/haidog-yaqub/Your-Stable-Audio)
+ - [ ] Audio plugin support
+
+ ## Examples
+ - Download checkpoints: 🎒[ckpts](https://github.com/haidog-yaqub/DiffPitcher/tree/main/ckpts)
+ - Prepare the environment: [requirements.txt](requirements.txt)
+ - Feel free to try:
+   - template-based automatic pitch correction: [template_based_apc.py](template_based_apc.py)
+   - score-based automatic pitch correction: [score_based_apc.py](score_based_apc.py)
+
+ ## References
+
+ If you find the code useful for your research, please consider citing:
+
+ ```bibtex
+ @inproceedings{hai2023diff,
+   title={Diff-Pitcher: Diffusion-Based Singing Voice Pitch Correction},
+   author={Hai, Jiarui and Elhilali, Mounya},
+   booktitle={2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
+   pages={1--5},
+   year={2023},
+   organization={IEEE}
+ }
+ ```
+
+ This repo is inspired by:
+
+ ```bibtex
+ @article{popov2021diffusion,
+   title={Diffusion-based voice conversion with fast maximum likelihood sampling scheme},
+   author={Popov, Vadim and Vovk, Ivan and Gogoryan, Vladimir and Sadekova, Tasnima and Kudinov, Mikhail and Wei, Jiansheng},
+   journal={arXiv preprint arXiv:2109.13821},
+   year={2021}
+ }
+ ```
+
+ ```bibtex
+ @inproceedings{liu2022diffsinger,
+   title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
+   author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Zhao, Zhou},
+   booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
+   volume={36},
+   number={10},
+   pages={11020--11028},
+   year={2022}
+ }
+ ```
+
+ ## Acknowledgement
+
+ [Welcome to LCAP! (jhu.edu)](https://engineering.jhu.edu/lcap/)
+
+ We borrow code from the following repos:
+
+ - `Diffusion Schedulers` are based on 🤗 [Diffusers](https://github.com/huggingface/diffusers)
+ - `2D UNet` is based on [DiffVC](https://github.com/huawei-noah/Speech-Backbones/tree/main/DiffVC)
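Note: the two APC scripts referenced in the README's Examples section are self-contained. A minimal sketch of reproducing the example outputs shipped in this commit (assuming the linked checkpoints are unpacked under `ckpts/` and the commands are run from the repository root):

```python
# Minimal sketch: reproduce the example outputs added in this commit.
# Assumes ./ckpts contains the downloaded checkpoints and the working
# directory is the repository root.
import subprocess

# template-based APC: corrects examples/off-key.wav against examples/reference.wav
subprocess.run(['python', 'template_based_apc.py'], check=True)   # writes output_template.wav

# score-based APC: corrects examples/score_vocal.wav against examples/score_midi.midi
subprocess.run(['python', 'score_based_apc.py'], check=True)      # writes output_score.wav
```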
examples/off-key.wav ADDED
Binary file (816 kB). View file
 
examples/reference.wav ADDED
Binary file (816 kB). View file
 
examples/score_midi.midi ADDED
Binary file (121 Bytes). View file
 
examples/score_midi.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7baacba4afb8813d057e420cd63853657401403b6c798f6cb7f06673e7dcea5a
+ size 559232
examples/score_vocal.wav ADDED
Binary file (262 kB). View file
 
output_score.wav ADDED
Binary file (262 kB). View file
 
output_template.wav ADDED
Binary file (225 kB). View file
 
pitch_controller/README.md ADDED
@@ -0,0 +1 @@
+ # Diffusion-based Pitch Controller
pitch_controller/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
pitch_controller/config/DiffWorld_24k.yaml ADDED
@@ -0,0 +1,24 @@
+ version: 1.0
+
+ logmel:
+   n_mels: 100
+   sampling_rate: 24000
+   n_fft: 1024
+   hop_size: 256
+   max: 2.5
+   min: -12
+
+ unet:
+   dim_base: 256
+   use_embed: False
+   dim_embed: None
+   use_ref_t: False
+   dim_cond: 128
+   dim_mults: [1, 2, 4]
+
+ ddpm:
+   num_train_steps: 1000
+   inference_steps: 100
+   eta: 0.8
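For context, this is how the inference scripts earlier in this commit consume the file: the `unet` block is unpacked directly into `UNetPitcher`, and the `logmel` bounds correspond to the mel normalization range. A minimal sketch (paths as used in this commit):

```python
# Sketch: load DiffWorld_24k.yaml and build the U-Net the same way the
# template/score APC scripts in this commit do.
import yaml
from pitch_controller.models.unet import UNetPitcher

with open('pitch_controller/config/DiffWorld_24k.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

model = UNetPitcher(**config['unet'])   # dim_base=256, dim_cond=128, dim_mults=[1, 2, 4], ...
max_mel = config['logmel']['max']       # 2.5, matches max_mel in the scripts
min_mel = config['logmel']['min']       # -12; the scripts hard-code np.log(1e-5) ≈ -11.5 for the same bound
```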
pitch_controller/data/example/f0/p225_001.wav.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8df28ae08ef686e7c7e523fdde25b62fbd05725cdacc043cde407a898182272f
+ size 1672
pitch_controller/data/example/mel/p225_001.wav.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bf3c0e6956f57acdd82f5d91f6390ce148d89066faedbdd6f6ac8c48d1d2c76
+ size 77328
pitch_controller/data/example/wav/p225_001.wav ADDED
Binary file (197 kB). View file
 
pitch_controller/data/example/world/p225_001.wav.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e00d5eb7fa9df26321df3f3df06e2ff44c3b3732cc5179ef135e41ffeb3a3b82
+ size 77328
pitch_controller/data/prepare_f0.py ADDED
@@ -0,0 +1,66 @@
+ # import amfm_decompy.basic_tools as basic
+ # import amfm_decompy.pYAAPT as pYAAPT
+ from multiprocessing import Process
+ import os
+ import numpy as np
+ import pandas as pd
+ import librosa
+ from librosa.core import load
+ from tqdm import tqdm
+
+
+ def get_f0(wav_path):
+     wav, _ = load(wav_path, sr=24000)
+     wav = wav[:(wav.shape[0] // 256) * 256]
+     wav = np.pad(wav, 384, mode='reflect')
+     f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False,
+                             fmin=librosa.note_to_hz('C2'),
+                             fmax=librosa.note_to_hz('C6'))
+     return np.nan_to_num(f0)
+
+
+ def chunks(arr, m):
+     result = [[] for i in range(m)]
+     for i in range(len(arr)):
+         result[i%m].append(arr[i])
+     return result
+
+
+ def extract_f0(subset):
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['subset'] == 'train']
+     # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/']
+
+     for i in tqdm(subset):
+         line = meta.iloc[i]
+         audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+         f = line['file_name']
+
+         f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_data_f0/')
+
+         try:
+             np.load(os.path.join(f0_dir, f + '.npy'))
+         except:
+             print(line)
+             f0 = get_f0(os.path.join(audio_dir, f))
+             if os.path.exists(f0_dir) is False:
+                 os.makedirs(f0_dir, exist_ok=True)
+             np.save(os.path.join(f0_dir, f + '.npy'), f0)
+
+         # if os.path.exists(os.path.join(f0_dir, f+'.npy')) is False:
+         #     f0 = get_yaapt_f0(os.path.join(audio_dir, f))
+
+
+ if __name__ == '__main__':
+     cores = 8
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['subset'] == 'train']
+     # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/']
+
+     idx_list = [i for i in range(len(meta))]
+
+     subsets = chunks(idx_list, cores)
+
+     for subset in subsets:
+         t = Process(target=extract_f0, args=(subset,))
+         t.start()
pitch_controller/data/prepare_mel.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ import numpy as np
+
+ import librosa
+ from librosa.core import load
+ from librosa.filters import mel as librosa_mel_fn
+ mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000)
+
+ from tqdm import tqdm
+ import pandas as pd
+
+ from multiprocessing import Process
+
+
+ # def get_f0(wav_path):
+ #     wav, _ = load(wav_path, sr=22050)
+ #     wav = wav[:(wav.shape[0] // 256) * 256]
+ #     wav = np.pad(wav, 384, mode='reflect')
+ #     f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False,
+ #                             fmin=librosa.note_to_hz('C2'),
+ #                             fmax=librosa.note_to_hz('C6'))
+ #     return np.nan_to_num(f0)
+
+ def get_mel(wav_path):
+     wav, _ = load(wav_path, sr=24000)
+     wav = wav[:(wav.shape[0] // 256)*256]
+     wav = np.pad(wav, 384, mode='reflect')
+     stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False)
+     stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
+     mel_spectrogram = np.matmul(mel_basis, stftm)
+     log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
+     return log_mel_spectrogram
+
+
+ def chunks(arr, m):
+     result = [[] for i in range(m)]
+     for i in range(len(arr)):
+         result[i%m].append(arr[i])
+     return result
+
+
+ def extract_mel(subset):
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['folder'] == 'eval/vocal/']
+
+     for i in tqdm(subset):
+         line = meta.iloc[i]
+         audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+         f = line['file_name']
+
+         mel_dir = audio_dir.replace('vocal', 'mel').replace('raw_data/', '24k_data/')
+
+         if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False:
+             mel = get_mel(os.path.join(audio_dir, f))
+             if os.path.exists(mel_dir) is False:
+                 os.makedirs(mel_dir)
+             np.save(os.path.join(mel_dir, f+'.npy'), mel)
+
+
+ if __name__ == '__main__':
+     cores = 8
+
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['folder'] == 'eval/vocal/']
+
+     idx_list = [i for i in range(len(meta))]
+
+     subsets = chunks(idx_list, cores)
+
+     for subset in subsets:
+         t = Process(target=extract_mel, args=(subset,))
+         t.start()
pitch_controller/data/prepare_world.py ADDED
@@ -0,0 +1,85 @@
+ from multiprocessing import Process
+ import os
+ import numpy as np
+
+ import librosa
+ from librosa.core import load
+ from librosa.filters import mel as librosa_mel_fn
+ mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000)
+
+ from tqdm import tqdm
+ import pandas as pd
+ import pyworld as pw
+
+
+ def get_world_mel(wav_path, sr=24000):
+     wav, _ = librosa.load(wav_path, sr=sr)
+     wav = (wav * 32767).astype(np.int16)
+     wav = (wav / 32767).astype(np.float64)
+     # wav = wav.astype(np.float64)
+     wav = wav[:(wav.shape[0] // 256) * 256]
+
+     _f0, t = pw.dio(wav, sr, frame_period=256/sr*1000)
+     f0 = pw.stonemask(wav, _f0, t, sr)
+     sp = pw.cheaptrick(wav, f0, t, sr)
+     ap = pw.d4c(wav, f0, t, sr)
+     wav_hat = pw.synthesize(f0 * 0, sp, ap, sr, frame_period=256/sr*1000)
+
+     # pyworld output does not pad left
+     wav_hat = wav_hat[:len(wav)]
+     # wav_hat = wav_hat[256//2: len(wav)+256//2]
+     assert len(wav_hat) == len(wav)
+     wav = wav_hat.astype(np.float32)
+     wav = np.pad(wav, 384, mode='reflect')
+     stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False)
+     stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
+     mel_spectrogram = np.matmul(mel_basis, stftm)
+     log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
+
+     return log_mel_spectrogram, f0
+
+
+ def chunks(arr, m):
+     result = [[] for i in range(m)]
+     for i in range(len(arr)):
+         result[i%m].append(arr[i])
+     return result
+
+
+ def extract_pw(subset, save_f0=False):
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['subset'] == 'train']
+
+     for i in tqdm(subset):
+         line = meta.iloc[i]
+         audio_dir = '../raw_data/' + line['folder'] + line['subfolder']
+         f = line['file_name']
+
+         mel_dir = audio_dir.replace('vocal', 'world').replace('raw_data/', '24k_data/')
+         f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_f0/')
+
+         if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False:
+             # get_world_mel returns both the pitch-free mel and the WORLD f0
+             mel, f0 = get_world_mel(os.path.join(audio_dir, f))
+
+             if os.path.exists(mel_dir) is False:
+                 os.makedirs(mel_dir)
+             np.save(os.path.join(mel_dir, f+'.npy'), mel)
+
+             if save_f0 is True:
+                 if os.path.exists(f0_dir) is False:
+                     os.makedirs(f0_dir)
+                 np.save(os.path.join(f0_dir, f + '.npy'), f0)
+
+
+ if __name__ == '__main__':
+     cores = 8
+     meta = pd.read_csv('../raw_data/meta_fix.csv')
+     meta = meta[meta['subset'] == 'train']
+
+     idx_list = [i for i in range(len(meta))]
+
+     subsets = chunks(idx_list, cores)
+
+     for subset in subsets:
+         t = Process(target=extract_pw, args=(subset,))
+         t.start()
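The point of `get_world_mel` is that WORLD re-synthesis with `f0 * 0` strips the original pitch, so the saved `world/*.npy` features act as pitch-free conditioning mels. A minimal single-file sketch (the paths are illustrative and mirror the `data/example` folders added in this commit):

```python
# Sketch: compute the pitch-free WORLD mel and the WORLD f0 for one file,
# mirroring what extract_pw() writes to the world/ and f0/ folders.
import numpy as np

mel, f0 = get_world_mel('example/wav/p225_001.wav', sr=24000)
np.save('example/world/p225_001.wav.npy', mel)   # (100, n_frames) log-mel of the f0-flattened signal
np.save('example/f0/p225_001.wav.npy', f0)       # frame-level f0 from pw.stonemask
```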
pitch_controller/dataset/__init__.py ADDED
@@ -0,0 +1 @@
+ from .diff_lpc import VCDecLPCDataset, VCDecLPCBatchCollate, VCDecLPCTest
pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (237 Bytes). View file
 
pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (311 Bytes). View file
 
pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc ADDED
Binary file (2.85 kB). View file
 
pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc ADDED
Binary file (2.84 kB). View file
 
pitch_controller/dataset/__pycache__/diff.cpython-310.pyc ADDED
Binary file (5.79 kB). View file
 
pitch_controller/dataset/__pycache__/diff.cpython-39.pyc ADDED
Binary file (5.83 kB). View file
 
pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc ADDED
Binary file (7.03 kB). View file
 
pitch_controller/dataset/diff_lpc.py ADDED
@@ -0,0 +1,271 @@
+ import os
+ import random
+ import numpy as np
+ import torch
+ import tgt
+ import pandas as pd
+
+ from torch.utils.data import Dataset
+ import librosa
+
+
+ def f0_to_coarse(f0, hparams):
+     f0_bin = hparams['f0_bin']
+     f0_max = hparams['f0_max']
+     f0_min = hparams['f0_min']
+     is_torch = isinstance(f0, torch.Tensor)
+     # to mel scale
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+     f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
+
+     unvoiced = (f0_mel == 0)
+
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+     f0_mel[unvoiced] = 0
+
+     f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
+     assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+     return f0_coarse
+
+
+ def log_f0(f0, hparams):
+     f0_bin = hparams['f0_bin']
+     f0_max = hparams['f0_max']
+     f0_min = hparams['f0_min']
+
+     f0_mel = np.zeros_like(f0)
+     f0_mel[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1
+     f0_mel_min = 12*np.log2(f0_min/f0_min) + 1
+     f0_mel_max = 12*np.log2(f0_max/f0_min) + 1
+
+     unvoiced = (f0_mel == 0)
+
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+     f0_mel[unvoiced] = 0
+
+     f0_coarse = np.rint(f0_mel).astype(int)
+     assert f0_coarse.max() <= (f0_bin-1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+     return f0_coarse
+
+
+ # training "average voice" encoder
+ class VCDecLPCDataset(Dataset):
+     def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False,
+                  f0_type='bins'):
+         self.path = data_dir
+         meta = pd.read_csv(data_dir + 'meta_fix.csv')
+         self.meta = meta[meta['subset'] == subset]
+         self.content_dir = content_dir
+         self.extract_emb = extract_emb
+         self.f0_type = f0_type
+
+     def get_vc_data(self, audio_path, mel_id):
+         mel_dir = audio_path.replace('vocal', 'mel')
+         embed_dir = audio_path.replace('vocal', 'embed')
+         pitch_dir = audio_path.replace('vocal', 'f0')
+         content_dir = audio_path.replace('vocal', self.content_dir)
+
+         mel = os.path.join(mel_dir, mel_id + '.npy')
+         embed = os.path.join(embed_dir, mel_id + '.npy')
+         pitch = os.path.join(pitch_dir, mel_id + '.npy')
+         content = os.path.join(content_dir, mel_id + '.npy')
+
+         mel = np.load(mel)
+         if self.extract_emb:
+             embed = np.load(embed)
+         else:
+             embed = np.zeros(1)
+
+         pitch = np.load(pitch)
+         content = np.load(content)
+
+         pitch = np.nan_to_num(pitch)
+         if self.f0_type == 'bins':
+             pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                          'f0_min': librosa.note_to_hz('C2'),
+                                          'f0_max': librosa.note_to_hz('C6')})
+         elif self.f0_type == 'log':
+             pitch = log_f0(pitch, {'f0_bin': 345,
+                                    'f0_min': librosa.note_to_hz('C2'),
+                                    'f0_max': librosa.note_to_hz('C#6')})
+
+         mel = torch.from_numpy(mel).float()
+         embed = torch.from_numpy(embed).float()
+         pitch = torch.from_numpy(pitch).float()
+         content = torch.from_numpy(content).float()
+
+         return (mel, embed, pitch, content)
+
+     def __getitem__(self, index):
+         row = self.meta.iloc[index]
+         mel_id = row['file_name']
+         audio_path = self.path + row['folder'] + row['subfolder']
+         mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
+         item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}
+         return item
+
+     def __len__(self):
+         return len(self.meta)
+
+
+ class VCDecLPCBatchCollate(object):
+     def __init__(self, train_frames, eps=1e-5):
+         self.train_frames = train_frames
+         self.eps = eps
+
+     def __call__(self, batch):
+         train_frames = self.train_frames
+         eps = self.eps
+
+         B = len(batch)
+         embed = torch.stack([item['embed'] for item in batch], 0)
+
+         n_mels = batch[0]['mel'].shape[0]
+         content_dim = batch[0]['content'].shape[0]
+
+         # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain
+         mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
+         mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
+
+         # ! need to deal with empty frames here
+         contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * np.log(eps)
+
+         f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)
+         max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
+                       for item in batch]
+
+         starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+         starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+         mel_lengths = []
+         for i, item in enumerate(batch):
+             mel = item['mel']
+             f0 = item['f0']
+             content = item['content']
+
+             if mel.shape[-1] < train_frames:
+                 mel_length = mel.shape[-1]
+             else:
+                 mel_length = train_frames
+
+             mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
+             f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
+             contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]
+
+             mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
+             mel_lengths.append(mel_length)
+
+         mel_lengths = torch.LongTensor(mel_lengths)
+
+         return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
+                 'embed': embed,
+                 'f0_1': f0s1,
+                 'content1': contents1}
+
+
+ class VCDecLPCTest(Dataset):
+     def __init__(self, data_dir, subset='test', eps=1e-5, test_frames=256, content_dir='lpc_mel_512', extract_emb=False,
+                  f0_type='bins'):
+         self.path = data_dir
+         meta = pd.read_csv(data_dir + 'meta_test.csv')
+         self.meta = meta[meta['subset'] == subset]
+         self.content_dir = content_dir
+         self.extract_emb = extract_emb
+         self.eps = eps
+         self.test_frames = test_frames
+         self.f0_type = f0_type
+
+     def get_vc_data(self, audio_path, mel_id, pitch_shift):
+         mel_dir = audio_path.replace('vocal', 'mel')
+         embed_dir = audio_path.replace('vocal', 'embed')
+         pitch_dir = audio_path.replace('vocal', 'f0')
+         content_dir = audio_path.replace('vocal', self.content_dir)
+
+         mel = os.path.join(mel_dir, mel_id + '.npy')
+         embed = os.path.join(embed_dir, mel_id + '.npy')
+         pitch = os.path.join(pitch_dir, mel_id + '.npy')
+         content = os.path.join(content_dir, mel_id + '.npy')
+
+         mel = np.load(mel)
+         if self.extract_emb:
+             embed = np.load(embed)
+         else:
+             embed = np.zeros(1)
+
+         pitch = np.load(pitch)
+         content = np.load(content)
+
+         pitch = np.nan_to_num(pitch)
+         pitch = pitch*pitch_shift
+
+         if self.f0_type == 'bins':
+             pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                          'f0_min': librosa.note_to_hz('C2'),
+                                          'f0_max': librosa.note_to_hz('C6')})
+         elif self.f0_type == 'log':
+             pitch = log_f0(pitch, {'f0_bin': 345,
+                                    'f0_min': librosa.note_to_hz('C2'),
+                                    'f0_max': librosa.note_to_hz('C#6')})
+
+         mel = torch.from_numpy(mel).float()
+         embed = torch.from_numpy(embed).float()
+         pitch = torch.from_numpy(pitch).float()
+         content = torch.from_numpy(content).float()
+
+         return (mel, embed, pitch, content)
+
+     def __getitem__(self, index):
+         row = self.meta.iloc[index]
+
+         mel_id = row['content_file_name']
+         audio_path = self.path + row['content_folder'] + row['content_subfolder']
+         pitch_shift = row['pitch_shift']
+         mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+         mel_id = row['timbre_file_name']
+         audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
+         mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+         n_mels = mel1.shape[0]
+         content_dim = content.shape[0]
+
+         mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+         mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+         lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * np.log(self.eps)
+
+         f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)
+
+         if mel1.shape[-1] < self.test_frames:
+             mel_length = mel1.shape[-1]
+         else:
+             mel_length = self.test_frames
+         mels1[:, :mel_length] = mel1[:, :mel_length]
+         f0s1[:mel_length] = f0[:mel_length]
+         lpcs1[:, :mel_length] = content[:, :mel_length]
+
+         if mel2.shape[-1] < self.test_frames:
+             mel_length = mel2.shape[-1]
+         else:
+             mel_length = self.test_frames
+         mels2[:, :mel_length] = mel2[:, :mel_length]
+
+         return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}
+
+     def __len__(self):
+         return len(self.meta)
+
+
+ if __name__ == '__main__':
+     f0 = np.array([110.0, 220.0, librosa.note_to_hz('C2'), 0, librosa.note_to_hz('E3'), librosa.note_to_hz('C6')])
+     # 50 midi notes = (50-1)
+     pitch = log_f0(f0, {'f0_bin': 345,
+                         'f0_min': librosa.note_to_hz('C2'),
+                         'f0_max': librosa.note_to_hz('C#6')})
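As a quick sanity check of the `log_f0` quantization used throughout this commit (f0_bin=345, f0_min=C2, f0_max=C#6): the pitch axis spans 49 semitones, so each semitone maps to 343/49 = 7 bins, unvoiced frames stay at bin 0, and A2 (110 Hz, 9 semitones above C2) lands on bin 9*7 + 1 = 64. A worked example, assuming `log_f0` from this module is in scope:

```python
# Worked example for log_f0 with the hparams used in this repo.
import numpy as np
import librosa

hparams = {'f0_bin': 345,
           'f0_min': librosa.note_to_hz('C2'),
           'f0_max': librosa.note_to_hz('C#6')}
f0 = np.array([0.0, librosa.note_to_hz('C2'), 110.0, librosa.note_to_hz('C#6')])
print(log_f0(f0, hparams))   # -> [  0   1  64 344]
```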
pitch_controller/dataset/diff_lpc_content.py ADDED
@@ -0,0 +1,231 @@
+ import os
+ import random
+ import numpy as np
+ import torch
+ import tgt
+ import pandas as pd
+
+ from torch.utils.data import Dataset
+ import librosa
+
+
+ def f0_to_coarse(f0, hparams):
+     f0_bin = hparams['f0_bin']
+     f0_max = hparams['f0_max']
+     f0_min = hparams['f0_min']
+     is_torch = isinstance(f0, torch.Tensor)
+     # to mel scale
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+     f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
+
+     unvoiced = (f0_mel == 0)
+
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
+
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
+
+     f0_mel[unvoiced] = 0
+
+     f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
+     assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
+     return f0_coarse
+
+
+ # training "average voice" encoder
+ class VCDecLPCDataset(Dataset):
+     def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False):
+         self.path = data_dir
+         meta = pd.read_csv(data_dir + 'meta_fix.csv')
+         self.meta = meta[meta['subset'] == subset]
+         self.content_dir = content_dir
+         self.extract_emb = extract_emb
+
+     def get_vc_data(self, audio_path, mel_id):
+         mel_dir = audio_path.replace('vocal', 'mel')
+         embed_dir = audio_path.replace('vocal', 'embed')
+         pitch_dir = audio_path.replace('vocal', 'f0')
+         content_dir = audio_path.replace('vocal', self.content_dir)
+
+         mel = os.path.join(mel_dir, mel_id + '.npy')
+         embed = os.path.join(embed_dir, mel_id + '.npy')
+         pitch = os.path.join(pitch_dir, mel_id + '.npy')
+         content = os.path.join(content_dir, mel_id + '.npy')
+
+         mel = np.load(mel)
+         if self.extract_emb:
+             embed = np.load(embed)
+         else:
+             embed = np.zeros(1)
+
+         pitch = np.load(pitch)
+         content = np.load(content)
+
+         pitch = np.nan_to_num(pitch)
+         pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                      'f0_min': librosa.note_to_hz('C2'),
+                                      'f0_max': librosa.note_to_hz('C6')})
+
+         mel = torch.from_numpy(mel).float()
+         embed = torch.from_numpy(embed).float()
+         pitch = torch.from_numpy(pitch).float()
+         content = torch.from_numpy(content).float()
+
+         return (mel, embed, pitch, content)
+
+     def __getitem__(self, index):
+         row = self.meta.iloc[index]
+         mel_id = row['file_name']
+         audio_path = self.path + row['folder'] + row['subfolder']
+         mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
+         item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}
+         return item
+
+     def __len__(self):
+         return len(self.meta)
+
+
+ class VCDecLPCBatchCollate(object):
+     def __init__(self, train_frames, eps=np.log(1e-5), content_eps=np.log(1e-12)):
+         self.train_frames = train_frames
+         self.eps = eps
+         self.content_eps = content_eps
+
+     def __call__(self, batch):
+         train_frames = self.train_frames
+         eps = self.eps
+         content_eps = self.content_eps
+
+         B = len(batch)
+         embed = torch.stack([item['embed'] for item in batch], 0)
+
+         n_mels = batch[0]['mel'].shape[0]
+         content_dim = batch[0]['content'].shape[0]
+
+         # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain
+         mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
+         mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps
+
+         # using a different eps
+         contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * content_eps
+
+         f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)
+         max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
+                       for item in batch]
+
+         starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+         starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
+         mel_lengths = []
+         for i, item in enumerate(batch):
+             mel = item['mel']
+             f0 = item['f0']
+             content = item['content']
+
+             if mel.shape[-1] < train_frames:
+                 mel_length = mel.shape[-1]
+             else:
+                 mel_length = train_frames
+
+             mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
+             f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
+             contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]
+
+             mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
+             mel_lengths.append(mel_length)
+
+         mel_lengths = torch.LongTensor(mel_lengths)
+
+         return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
+                 'embed': embed,
+                 'f0_1': f0s1,
+                 'content1': contents1}
+
+
+ class VCDecLPCTest(Dataset):
+     def __init__(self, data_dir, subset='test', eps=np.log(1e-5), content_eps=np.log(1e-12), test_frames=256, content_dir='lpc_mel_512', extract_emb=False):
+         self.path = data_dir
+         meta = pd.read_csv(data_dir + 'meta_test.csv')
+         self.meta = meta[meta['subset'] == subset]
+         self.content_dir = content_dir
+         self.extract_emb = extract_emb
+         self.eps = eps
+         self.content_eps = content_eps
+         self.test_frames = test_frames
+
+     def get_vc_data(self, audio_path, mel_id, pitch_shift):
+         mel_dir = audio_path.replace('vocal', 'mel')
+         embed_dir = audio_path.replace('vocal', 'embed')
+         pitch_dir = audio_path.replace('vocal', 'f0')
+         content_dir = audio_path.replace('vocal', self.content_dir)
+
+         mel = os.path.join(mel_dir, mel_id + '.npy')
+         embed = os.path.join(embed_dir, mel_id + '.npy')
+         pitch = os.path.join(pitch_dir, mel_id + '.npy')
+         content = os.path.join(content_dir, mel_id + '.npy')
+
+         mel = np.load(mel)
+         if self.extract_emb:
+             embed = np.load(embed)
+         else:
+             embed = np.zeros(1)
+
+         pitch = np.load(pitch)
+         content = np.load(content)
+
+         pitch = np.nan_to_num(pitch)
+         pitch = pitch*pitch_shift
+         pitch = f0_to_coarse(pitch, {'f0_bin': 256,
+                                      'f0_min': librosa.note_to_hz('C2'),
+                                      'f0_max': librosa.note_to_hz('C6')})
+
+         mel = torch.from_numpy(mel).float()
+         embed = torch.from_numpy(embed).float()
+         pitch = torch.from_numpy(pitch).float()
+         content = torch.from_numpy(content).float()
+
+         return (mel, embed, pitch, content)
+
+     def __getitem__(self, index):
+         row = self.meta.iloc[index]
+
+         mel_id = row['content_file_name']
+         audio_path = self.path + row['content_folder'] + row['content_subfolder']
+         pitch_shift = row['pitch_shift']
+         mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+         mel_id = row['timbre_file_name']
+         audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
+         mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)
+
+         n_mels = mel1.shape[0]
+         content_dim = content.shape[0]
+
+         mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
+         mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps
+         # content
+         lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * self.content_eps
+
+         f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)
+
+         if mel1.shape[-1] < self.test_frames:
+             mel_length = mel1.shape[-1]
+         else:
+             mel_length = self.test_frames
+         mels1[:, :mel_length] = mel1[:, :mel_length]
+         f0s1[:mel_length] = f0[:mel_length]
+         lpcs1[:, :mel_length] = content[:, :mel_length]
+
+         if mel2.shape[-1] < self.test_frames:
+             mel_length = mel2.shape[-1]
+         else:
+             mel_length = self.test_frames
+         mels2[:, :mel_length] = mel2[:, :mel_length]
+
+         return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}
+
+     def __len__(self):
+         return len(self.meta)
pitch_controller/load_vocoder.py ADDED
@@ -0,0 +1,51 @@
+ # from nsf_hifigan.models import load_model
+ from modules.BigVGAN.inference import load_model
+ import librosa
+
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ import torchaudio.transforms as transforms
+
+ import numpy as np
+ import soundfile as sf
+
+
+ class LogMelSpectrogram(torch.nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.melspctrogram = transforms.MelSpectrogram(
+             sample_rate=22050,
+             n_fft=1024,
+             win_length=1024,
+             hop_length=256,
+             center=False,
+             power=1.0,
+             norm="slaney",
+             n_mels=80,
+             mel_scale="slaney",
+             f_max=8000,
+             f_min=0,
+         )
+
+     def forward(self, wav):
+         wav = F.pad(wav, ((1024 - 256) // 2, (1024 - 256) // 2), "reflect")
+         mel = self.melspctrogram(wav)
+         logmel = torch.log(torch.clamp(mel, min=1e-5))
+         return logmel
+
+
+ hifigan, cfg = load_model('modules/BigVGAN/ckpt/bigvgan_22khz_80band/g_05000000', device='cuda')
+ M = LogMelSpectrogram()
+
+ source, sr = torchaudio.load("music.mp3")
+ source = torchaudio.functional.resample(source, sr, 22050)
+ source = source.unsqueeze(0)
+ mel = M(source).squeeze(0)
+
+ # f0, f0_bin = get_pitch("116_1_pred.wav")
+ # f0 = torch.tensor(f0).unsqueeze(0)
+ with torch.no_grad():
+     y_hat = hifigan(mel.cuda()).cpu().numpy().squeeze(1)
+
+ sf.write('test.wav', y_hat[0], samplerate=22050)
pitch_controller/models/__pycache__/base.cpython-310.pyc ADDED
Binary file (1.17 kB). View file
 
pitch_controller/models/__pycache__/base.cpython-39.pyc ADDED
Binary file (1.14 kB). View file
 
pitch_controller/models/__pycache__/modules.cpython-310.pyc ADDED
Binary file (8.26 kB). View file
 
pitch_controller/models/__pycache__/modules.cpython-39.pyc ADDED
Binary file (8.45 kB). View file
 
pitch_controller/models/__pycache__/pitch.cpython-39.pyc ADDED
Binary file (1.1 kB). View file
 
pitch_controller/models/__pycache__/unet.cpython-310.pyc ADDED
Binary file (3.56 kB). View file
 
pitch_controller/models/__pycache__/unet.cpython-39.pyc ADDED
Binary file (3.48 kB). View file
 
pitch_controller/models/__pycache__/update_unet.cpython-310.pyc ADDED
Binary file (3.69 kB). View file
 
pitch_controller/models/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.99 kB). View file
 
pitch_controller/models/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.98 kB). View file
 
pitch_controller/models/base.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the MIT License.
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # MIT License for more details.
+
+ import numpy as np
+ import torch
+
+
+ class BaseModule(torch.nn.Module):
+     def __init__(self):
+         super(BaseModule, self).__init__()
+
+     @property
+     def nparams(self):
+         num_params = 0
+         for name, param in self.named_parameters():
+             if param.requires_grad:
+                 num_params += np.prod(param.detach().cpu().numpy().shape)
+         return num_params
+
+     def relocate_input(self, x: list):
+         device = next(self.parameters()).device
+         for i in range(len(x)):
+             if isinstance(x[i], torch.Tensor) and x[i].device != device:
+                 x[i] = x[i].to(device)
+         return x
pitch_controller/models/modules.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
2
+ # This program is free software; you can redistribute it and/or modify
3
+ # it under the terms of the MIT License.
4
+ # This program is distributed in the hope that it will be useful,
5
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
6
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7
+ # MIT License for more details.
8
+
9
+ import math
10
+ import torch
11
+ from einops import rearrange
12
+
13
+ from .base import BaseModule
14
+
15
+
16
+ class Mish(BaseModule):
17
+ def forward(self, x):
18
+ return x * torch.tanh(torch.nn.functional.softplus(x))
19
+
20
+
21
+ class Upsample(BaseModule):
22
+ def __init__(self, dim):
23
+ super(Upsample, self).__init__()
24
+ self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1)
25
+
26
+ def forward(self, x):
27
+ return self.conv(x)
28
+
29
+
30
+ class Downsample(BaseModule):
31
+ def __init__(self, dim):
32
+ super(Downsample, self).__init__()
33
+ self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1)
34
+
35
+ def forward(self, x):
36
+ return self.conv(x)
37
+
38
+
39
+ class Rezero(BaseModule):
40
+ def __init__(self, fn):
41
+ super(Rezero, self).__init__()
42
+ self.fn = fn
43
+ self.g = torch.nn.Parameter(torch.zeros(1))
44
+
45
+ def forward(self, x):
46
+ return self.fn(x) * self.g
47
+
48
+
49
+ class Block(BaseModule):
50
+ def __init__(self, dim, dim_out, groups=8):
51
+ super(Block, self).__init__()
52
+ self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3,
53
+ padding=1), torch.nn.GroupNorm(
54
+ groups, dim_out), Mish())
55
+
56
+ def forward(self, x):
57
+ output = self.block(x)
58
+ return output
59
+
60
+
61
+ class ResnetBlock(BaseModule):
62
+ def __init__(self, dim, dim_out, time_emb_dim, groups=8):
63
+ super(ResnetBlock, self).__init__()
64
+ self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim,
65
+ dim_out))
66
+
67
+ self.block1 = Block(dim, dim_out, groups=groups)
68
+ self.block2 = Block(dim_out, dim_out, groups=groups)
69
+ if dim != dim_out:
70
+ self.res_conv = torch.nn.Conv2d(dim, dim_out, 1)
71
+ else:
72
+ self.res_conv = torch.nn.Identity()
73
+
74
+ def forward(self, x, time_emb):
75
+ h = self.block1(x)
76
+ h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
77
+ h = self.block2(h)
78
+ output = h + self.res_conv(x)
79
+ return output
80
+
81
+
82
+ class LinearAttention(BaseModule):
83
+ def __init__(self, dim, heads=4, dim_head=32, q_norm=True):
84
+ super(LinearAttention, self).__init__()
85
+ self.heads = heads
86
+ hidden_dim = dim_head * heads
87
+ self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
88
+ self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1)
89
+ self.q_norm = q_norm
90
+
91
+ def forward(self, x):
92
+ b, c, h, w = x.shape
93
+ qkv = self.to_qkv(x)
94
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)',
95
+ heads=self.heads, qkv=3)
96
+ k = k.softmax(dim=-1)
97
+ if self.q_norm:
98
+ q = q.softmax(dim=-2)
99
+
100
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
101
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
102
+         out = rearrange(out, 'b heads c (h w) -> b (heads c) h w',
+                         heads=self.heads, h=h, w=w)
+         return self.to_out(out)
+
+
+ class Residual(BaseModule):
+     def __init__(self, fn):
+         super(Residual, self).__init__()
+         self.fn = fn
+
+     def forward(self, x, *args, **kwargs):
+         output = self.fn(x, *args, **kwargs) + x
+         return output
+
+
+ def get_timestep_embedding(
+     timesteps: torch.Tensor,
+     embedding_dim: int,
+     flip_sin_to_cos: bool = False,
+     downscale_freq_shift: float = 1,
+     scale: float = 1,
+     max_period: int = 10000,
+ ):
+     """
+     This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+     :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+     :param embedding_dim: the dimension of the output.
+     :param max_period: controls the minimum frequency of the embeddings.
+     :return: an [N x dim] Tensor of positional embeddings.
+     """
+     assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+     half_dim = embedding_dim // 2
+     exponent = -math.log(max_period) * torch.arange(
+         start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+     )
+     exponent = exponent / (half_dim - downscale_freq_shift)
+
+     emb = torch.exp(exponent)
+     emb = timesteps[:, None].float() * emb[None, :]
+
+     # scale embeddings
+     emb = scale * emb
+
+     # concat sine and cosine embeddings
+     emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+     # flip sine and cosine embeddings
+     if flip_sin_to_cos:
+         emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+     # zero pad
+     if embedding_dim % 2 == 1:
+         emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+     return emb
+
+
+ class Timesteps(BaseModule):
+     def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
+         super().__init__()
+         self.num_channels = num_channels
+         self.flip_sin_to_cos = flip_sin_to_cos
+         self.downscale_freq_shift = downscale_freq_shift
+
+     def forward(self, timesteps):
+         t_emb = get_timestep_embedding(
+             timesteps,
+             self.num_channels,
+             flip_sin_to_cos=self.flip_sin_to_cos,
+             downscale_freq_shift=self.downscale_freq_shift,
+         )
+         return t_emb
+
+
+ class PitchPosEmb(BaseModule):
+     def __init__(self, dim, flip_sin_to_cos=False, downscale_freq_shift=0):
+         super(PitchPosEmb, self).__init__()
+         self.dim = dim
+         self.flip_sin_to_cos = flip_sin_to_cos
+         self.downscale_freq_shift = downscale_freq_shift
+
+     def forward(self, x):
+         # x: B * L
+         b, l = x.shape
+         x = rearrange(x, 'b l -> (b l)')
+         emb = get_timestep_embedding(
+             x,
+             self.dim,
+             flip_sin_to_cos=self.flip_sin_to_cos,
+             downscale_freq_shift=self.downscale_freq_shift,
+         )
+         emb = rearrange(emb, '(b l) d -> b d l', b=b, l=l)
+         return emb
+
+
+ class TimbreBlock(BaseModule):
+     def __init__(self, out_dim):
+         super(TimbreBlock, self).__init__()
+         base_dim = out_dim // 4
+
+         self.block11 = torch.nn.Sequential(torch.nn.Conv2d(1, 2 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(2 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.block12 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 2 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(2 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.block21 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 4 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(4 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.block22 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 4 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(4 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.block31 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 8 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(8 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.block32 = torch.nn.Sequential(torch.nn.Conv2d(4 * base_dim, 8 * base_dim,
+                                                            3, 1, 1),
+                                            torch.nn.InstanceNorm2d(8 * base_dim, affine=True),
+                                            torch.nn.GLU(dim=1))
+         self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1)
+
+     def forward(self, x):
+         y = self.block11(x)
+         y = self.block12(y)
+         y = self.block21(y)
+         y = self.block22(y)
+         y = self.block31(y)
+         y = self.block32(y)
+         y = self.final_conv(y)
+
+         return y.sum((2, 3)) / (y.shape[2] * y.shape[3])
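
For orientation, below is a minimal sketch of how the sinusoidal embedding utilities above behave. The dimension, time values, and import path are illustrative assumptions (they only presume the package layout shown in this diff is importable), not settings taken from the project's training code.

```python
import torch

from pitch_controller.models.modules import Timesteps

# Embed three (possibly fractional) diffusion-time values into 64-dim
# sinusoidal vectors, the same way the U-Net's time_pos_emb does.
emb_layer = Timesteps(num_channels=64, flip_sin_to_cos=True, downscale_freq_shift=0)
t = torch.tensor([0.0, 0.5, 1.0])
emb = emb_layer(t)
print(emb.shape)  # torch.Size([3, 64])
```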
pitch_controller/models/unet.py ADDED
@@ -0,0 +1,153 @@
+ import math
+ import torch
+
+ from .base import BaseModule
+ from .modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock
+ from .modules import LinearAttention, Residual, Timesteps, TimbreBlock, PitchPosEmb
+
+ from einops import rearrange
+
+
+ class UNetPitcher(BaseModule):
+     def __init__(self,
+                  dim_base,
+                  dim_cond,
+                  use_ref_t,
+                  use_embed,
+                  dim_embed=256,
+                  dim_mults=(1, 2, 4),
+                  pitch_type='bins'):
+
+         super(UNetPitcher, self).__init__()
+         self.use_ref_t = use_ref_t
+         self.use_embed = use_embed
+         self.pitch_type = pitch_type
+
+         dim_in = 2
+
+         # time embedding
+         self.time_pos_emb = Timesteps(num_channels=dim_base,
+                                       flip_sin_to_cos=True,
+                                       downscale_freq_shift=0)
+
+         self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4),
+                                        Mish(), torch.nn.Linear(dim_base * 4, dim_base))
+
+         # speaker embedding
+         timbre_total = 0
+         if use_ref_t:
+             self.ref_block = TimbreBlock(out_dim=dim_cond)
+             timbre_total += dim_cond
+         if use_embed:
+             timbre_total += dim_embed
+
+         if timbre_total != 0:
+             self.timbre_block = torch.nn.Sequential(
+                 torch.nn.Linear(timbre_total, 4 * dim_cond),
+                 Mish(),
+                 torch.nn.Linear(4 * dim_cond, dim_cond))
+
+         if use_embed or use_ref_t:
+             dim_in += dim_cond
+
+         self.pitch_pos_emb = PitchPosEmb(dim_cond)
+         self.pitch_mlp = torch.nn.Sequential(
+             torch.nn.Conv1d(dim_cond, dim_cond * 4, 1, stride=1),
+             Mish(),
+             torch.nn.Conv1d(dim_cond * 4, dim_cond, 1, stride=1), )
+         dim_in += dim_cond
+
+         # pitch embedding
+         # if self.pitch_type == 'bins':
+         #     print('using mel bins for f0')
+         # elif self.pitch_type == 'log':
+         #     print('using log bins f0')
+
+         dims = [dim_in, *map(lambda m: dim_base * m, dim_mults)]
+         in_out = list(zip(dims[:-1], dims[1:]))
+         # blocks
+         self.downs = torch.nn.ModuleList([])
+         self.ups = torch.nn.ModuleList([])
+         num_resolutions = len(in_out)
+
+         for ind, (dim_in, dim_out) in enumerate(in_out):
+             is_last = ind >= (num_resolutions - 1)
+             self.downs.append(torch.nn.ModuleList([
+                 ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base),
+                 ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base),
+                 Residual(Rezero(LinearAttention(dim_out))),
+                 Downsample(dim_out) if not is_last else torch.nn.Identity()]))
+
+         mid_dim = dims[-1]
+         self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base)
+         self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
+         self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base)
+
+         for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
+             self.ups.append(torch.nn.ModuleList([
+                 ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base),
+                 ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base),
+                 Residual(Rezero(LinearAttention(dim_in))),
+                 Upsample(dim_in)]))
+         self.final_block = Block(dim_base, dim_base)
+         self.final_conv = torch.nn.Conv2d(dim_base, 1, 1)
+
+     def forward(self, x, mean, f0, t, ref=None, embed=None):
+         if not torch.is_tensor(t):
+             t = torch.tensor([t], dtype=torch.long, device=x.device)
+         if len(t.shape) == 0:
+             t = t * torch.ones(x.shape[0], dtype=t.dtype, device=x.device)
+
+         t = self.time_pos_emb(t)
+         t = self.mlp(t)
+
+         x = torch.stack([x, mean], 1)
+
+         f0 = self.pitch_pos_emb(f0)
+         f0 = self.pitch_mlp(f0)
+         f0 = f0.unsqueeze(2)
+         f0 = torch.cat(x.shape[2] * [f0], 2)
+
+         timbre = None
+         if self.use_ref_t:
+             ref = torch.stack([ref], 1)
+             timbre = self.ref_block(ref)
+         if self.use_embed:
+             if timbre is not None:
+                 timbre = torch.cat([timbre, embed], 1)
+             else:
+                 timbre = embed
+         if timbre is None:
+             # raise Exception("at least use one timbre condition")
+             condition = f0
+         else:
+             timbre = self.timbre_block(timbre).unsqueeze(-1).unsqueeze(-1)
+             timbre = torch.cat(x.shape[2] * [timbre], 2)
+             timbre = torch.cat(x.shape[3] * [timbre], 3)
+             condition = torch.cat([f0, timbre], 1)
+
+         x = torch.cat([x, condition], 1)
+
+         hiddens = []
+         for resnet1, resnet2, attn, downsample in self.downs:
+             x = resnet1(x, t)
+             x = resnet2(x, t)
+             x = attn(x)
+             hiddens.append(x)
+             x = downsample(x)
+
+         x = self.mid_block1(x, t)
+         x = self.mid_attn(x)
+         x = self.mid_block2(x, t)
+
+         for resnet1, resnet2, attn, upsample in self.ups:
+             x = torch.cat((x, hiddens.pop()), dim=1)
+             x = resnet1(x, t)
+             x = resnet2(x, t)
+             x = attn(x)
+             x = upsample(x)
+
+         x = self.final_block(x)
+         output = self.final_conv(x)
+
+         return output.squeeze(1)
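
As a rough shape check, the sketch below instantiates `UNetPitcher` and runs a single forward pass on random tensors. All sizes, the pitch scale, and the time value are assumptions chosen only so the tensors broadcast (the mel-bin and frame counts must be divisible by 2**2 for the two down/upsampling stages); they are not the project's actual configuration.

```python
import torch

from pitch_controller.models.unet import UNetPitcher

# Hypothetical sizes: 100 mel bins, 128 frames (both divisible by 4).
model = UNetPitcher(dim_base=64, dim_cond=128, use_ref_t=False, use_embed=True, dim_embed=256)

x = torch.randn(1, 100, 128)     # noisy mel spectrogram
mean = torch.randn(1, 100, 128)  # conditioning spectrogram, stacked with x as a second channel
f0 = torch.rand(1, 128) * 80     # per-frame pitch curve (placeholder values)
embed = torch.randn(1, 256)      # speaker/timbre embedding
t = torch.tensor([0.5])          # diffusion time, one value per batch element

with torch.no_grad():
    out = model(x, mean, f0, t, embed=embed)
print(out.shape)  # torch.Size([1, 100, 128])
```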
pitch_controller/models/utils.py ADDED
@@ -0,0 +1,110 @@
+ # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the MIT License.
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # MIT License for more details.
+
+ import torch
+ import torchaudio
+ import numpy as np
+ from librosa.filters import mel as librosa_mel_fn
+
+ from .base import BaseModule
+
+
+ def mse_loss(x, y, mask, n_feats):
+     loss = torch.sum(((x - y)**2) * mask)
+     return loss / (torch.sum(mask) * n_feats)
+
+
+ def sequence_mask(length, max_length=None):
+     if max_length is None:
+         max_length = length.max()
+     x = torch.arange(int(max_length), dtype=length.dtype, device=length.device)
+     return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+ def convert_pad_shape(pad_shape):
+     l = pad_shape[::-1]
+     pad_shape = [item for sublist in l for item in sublist]
+     return pad_shape
+
+
+ def fix_len_compatibility(length, num_downsamplings_in_unet=2):
+     while True:
+         if length % (2**num_downsamplings_in_unet) == 0:
+             return length
+         length += 1
+
+
+ class PseudoInversion(BaseModule):
+     def __init__(self, n_mels, sampling_rate, n_fft):
+         super(PseudoInversion, self).__init__()
+         self.n_mels = n_mels
+         self.sampling_rate = sampling_rate
+         self.n_fft = n_fft
+         mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=0, fmax=8000)
+         mel_basis_inverse = np.linalg.pinv(mel_basis)
+         mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float()
+         self.register_buffer("mel_basis_inverse", mel_basis_inverse)
+
+     def forward(self, log_mel_spectrogram):
+         mel_spectrogram = torch.exp(log_mel_spectrogram)
+         stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram)
+         return stftm
+
+
+ class InitialReconstruction(BaseModule):
+     def __init__(self, n_fft, hop_size):
+         super(InitialReconstruction, self).__init__()
+         self.n_fft = n_fft
+         self.hop_size = hop_size
+         window = torch.hann_window(n_fft).float()
+         self.register_buffer("window", window)
+
+     def forward(self, stftm):
+         real_part = torch.ones_like(stftm, device=stftm.device)
+         imag_part = torch.zeros_like(stftm, device=stftm.device)
+         stft = torch.stack([real_part, imag_part], -1)*stftm.unsqueeze(-1)
+         istft = torch.istft(stft, n_fft=self.n_fft,
+                             hop_length=self.hop_size, win_length=self.n_fft,
+                             window=self.window, center=True)
+         return istft.unsqueeze(1)
+
+
+ # Fast Griffin-Lim algorithm as a PyTorch module
+ class FastGL(BaseModule):
+     def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99):
+         super(FastGL, self).__init__()
+         self.n_mels = n_mels
+         self.sampling_rate = sampling_rate
+         self.n_fft = n_fft
+         self.hop_size = hop_size
+         self.momentum = momentum
+         self.pi = PseudoInversion(n_mels, sampling_rate, n_fft)
+         self.ir = InitialReconstruction(n_fft, hop_size)
+         window = torch.hann_window(n_fft).float()
+         self.register_buffer("window", window)
+
+     @torch.no_grad()
+     def forward(self, s, n_iters=32):
+         c = self.pi(s)
+         x = self.ir(c)
+         x = x.squeeze(1)
+         c = c.unsqueeze(-1)
+         prev_angles = torch.zeros_like(c, device=c.device)
+         for _ in range(n_iters):
+             s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size,
+                            win_length=self.n_fft, window=self.window,
+                            center=True)
+             real_part, imag_part = s.unbind(-1)
+             stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8))
+             angles = s / stftm.unsqueeze(-1)
+             s = c * (angles + self.momentum * (angles - prev_angles))
+             x = torch.istft(s, n_fft=self.n_fft, hop_length=self.hop_size,
+                             win_length=self.n_fft, window=self.window,
+                             center=True)
+             prev_angles = angles
+         return x.unsqueeze(1)
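
A small sketch of the two helpers from this file that are easiest to reuse elsewhere in the repo; the numbers are arbitrary examples.

```python
import torch

from pitch_controller.models.utils import sequence_mask, fix_len_compatibility

lengths = torch.tensor([5, 3, 8])
mask = sequence_mask(lengths)     # bool tensor of shape [3, 8]; True inside each sequence
print(mask[1])                    # tensor([ True,  True,  True, False, False, False, False, False])

# Round a frame count up to the next multiple of 2**2 so the two U-Net
# downsampling stages divide the length evenly.
print(fix_len_compatibility(87))  # 88
```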
pitch_controller/modules/BigVGAN/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 NVIDIA CORPORATION.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
pitch_controller/modules/BigVGAN/README.md ADDED
@@ -0,0 +1,95 @@
+ ## BigVGAN: A Universal Neural Vocoder with Large-Scale Training
+ #### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon
+
+ <center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>
+
+
+ ### [Paper](https://arxiv.org/abs/2206.04658)
+ ### [Audio demo](https://bigvgan-demo.github.io/)
+
+ ## Installation
+ Clone the repository and install the dependencies.
+ ```shell
+ # the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries
+ git clone https://github.com/NVIDIA/BigVGAN
+ pip install -r requirements.txt
+ ```
+
+ Create symbolic links to the root of the dataset. The codebase uses filelists with paths relative to the dataset root. Below are example commands for the LibriTTS dataset.
+ ```shell
+ cd LibriTTS && \
+ ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \
+ ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \
+ ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \
+ ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \
+ ln -s /path/to/your/LibriTTS/dev-other dev-other && \
+ ln -s /path/to/your/LibriTTS/test-clean test-clean && \
+ ln -s /path/to/your/LibriTTS/test-other test-other && \
+ cd ..
+ ```
+
+ ## Training
+ Train the BigVGAN model. Below is an example command for training BigVGAN on the LibriTTS dataset at 24 kHz with a full 100-band mel spectrogram as input.
+ ```shell
+ python train.py \
+ --config configs/bigvgan_24khz_100band.json \
+ --input_wavs_dir LibriTTS \
+ --input_training_file LibriTTS/train-full.txt \
+ --input_validation_file LibriTTS/val-full.txt \
+ --list_input_unseen_wavs_dir LibriTTS LibriTTS \
+ --list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \
+ --checkpoint_path exp/bigvgan
+ ```
+
+ ## Synthesis
+ Synthesize audio from a trained BigVGAN model. Below is an example command for generating audio from the model.
+ It computes mel spectrograms from the wav files in `--input_wavs_dir` and saves the generated audio to `--output_dir`.
+ ```shell
+ python inference.py \
+ --checkpoint_file exp/bigvgan/g_05000000 \
+ --input_wavs_dir /path/to/your/input_wav \
+ --output_dir /path/to/your/output_wav
+ ```
+
+ `inference_e2e.py` supports synthesis directly from mel spectrograms saved in `.npy` format, with shape `[1, channel, frame]` or `[channel, frame]`.
+ It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`.
+
+ Make sure the STFT hyperparameters used to compute the mel spectrograms match those of the model, as defined in the `config.json` of the corresponding checkpoint.
+ ```shell
+ python inference_e2e.py \
+ --checkpoint_file exp/bigvgan/g_05000000 \
+ --input_mels_dir /path/to/your/input_mel \
+ --output_dir /path/to/your/output_wav
+ ```
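
For reference, a mel spectrogram can be dumped in the layout `inference_e2e.py` expects with a few lines of NumPy. The array below is a random stand-in and the directory name is only an illustration; real inputs must be computed with the STFT/mel settings from the checkpoint's `config.json`.

```python
import os
import numpy as np

os.makedirs('input_mel', exist_ok=True)

# Stand-in mel spectrogram with shape [channel, frame] = [num_mels, num_frames].
mel = np.random.randn(100, 512).astype(np.float32)
np.save(os.path.join('input_mel', 'example.npy'), mel)
# Then run: python inference_e2e.py --input_mels_dir input_mel --output_dir <out_dir> --checkpoint_file <ckpt>
```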
+
+ ## Pretrained Models
+ We provide [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq).
+ You can download the generator (e.g., g_05000000) and discriminator (e.g., do_05000000) checkpoints from the listed folders.
+
+ |Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned|
+ |------|---|---|---|---|------|---|
+ |bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No|
+ |bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No|
+ |bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No|
+ |bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No|
+
+ The results in the paper are based on the 24 kHz BigVGAN models trained on the LibriTTS dataset.
+ We also provide 22 kHz BigVGAN models with a band-limited setup (i.e., fmax=8000) for TTS applications.
+ Note that the latest checkpoints use the ``snakebeta`` activation with log-scale parameterization, which gives the best overall quality.
+
+
+ ## TODO
+
+ The current codebase only provides a plain PyTorch implementation of the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future.
+
+
+ ## References
+ * [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator)
+
+ * [Snake](https://github.com/EdwardDixon/snake) (for periodic activation)
+
+ * [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing)
+
+ * [Julius](https://github.com/adefossez/julius) (for low-pass filter)
+
+ * [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator)
pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc ADDED
Binary file (845 Bytes).
pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc ADDED
Binary file (1.11 kB).