diff --git a/.ipynb_checkpoints/requirements-checkpoint.txt b/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..976247a9d4f771489046246c2f07767b41a84721 --- /dev/null +++ b/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,17 @@ +diffusers +einops +fastdtw +librosa +matplotlib +music21 +numpy +pandas +pretty_midi +pysptk +pyworld +scipy +soundfile +tgt +torch +torchaudio +tqdm diff --git a/.ipynb_checkpoints/score_based_apc-checkpoint.py b/.ipynb_checkpoints/score_based_apc-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b9259d641045d7bfdb6d5aede00f6cebcc28f477 --- /dev/null +++ b/.ipynb_checkpoints/score_based_apc-checkpoint.py @@ -0,0 +1,159 @@ +import os.path + +import numpy as np +import pandas as pd +import torch +import yaml +import librosa +import soundfile as sf +from tqdm import tqdm + +from diffusers import DDIMScheduler +from pitch_controller.models.unet import UNetPitcher +from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff +from pitch_controller.modules.BigVGAN.inference import load_model +from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0 +from pitch_predictor.models.transformer import PitchFormer +import pretty_midi + + +def prepare_midi_wav(wav_id, midi_id, sr=24000): + midi = pretty_midi.PrettyMIDI(midi_id) + roll = midi.get_piano_roll() + roll = np.pad(roll, ((0, 0), (0, 1000)), constant_values=0) + roll[roll > 0] = 100 + + onset = midi.get_onsets() + before_onset = list(np.round(onset * 100 - 1).astype(int)) + roll[:, before_onset] = 0 + + wav, sr = librosa.load(wav_id, sr=sr) + + start = 0 + end = round(100 * len(wav) / sr) / 100 + # save audio + wav_seg = wav[round(start * sr):round(end * sr)] + cur_roll = roll[:, round(100 * start):round(100 * end)] + return wav_seg, cur_roll + + +def algin_mapping(content, target_len): + # align content with mel + src_len = content.shape[-1] + target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device) + temp = torch.arange(src_len+1) * target_len / src_len + + for i in range(target_len): + cur_idx = torch.argmin(torch.abs(temp-i)) + target[:, i] = content[:, cur_idx] + return target + + +def midi_to_hz(midi): + idx = torch.zeros(midi.shape[-1]) + for frame in range(midi.shape[-1]): + midi_frame = midi[:, frame] + non_zero = midi_frame.nonzero() + if len(non_zero) != 0: + hz = librosa.midi_to_hz(non_zero[0]) + idx[frame] = torch.tensor(hz) + return idx + + +@torch.no_grad() +def score_pitcher(source, pitch_ref, model, hifigan, pitcher, steps=50, shift_semi=0, mask_with_source=False): + wav, midi = prepare_midi_wav(source, pitch_ref, sr=sr) + + source_mel = get_world_mel(None, sr=sr, wav=wav) + + midi = torch.tensor(midi, dtype=torch.float32) + midi = algin_mapping(midi, source_mel.shape[-1]) + midi = midi_to_hz(midi) + + f0_ori = np.nan_to_num(get_f0(source)) + + source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device) + f0_ori = torch.from_numpy(f0_ori).float().unsqueeze(0).to(device) + midi = midi.unsqueeze(0).to(device) + + f0_pred = pitcher(sp=source_mel, midi=midi) + if mask_with_source: + # mask unvoiced frames based on original pitch estimation + f0_pred[f0_ori == 0] = 0 + f0_pred = f0_pred.cpu().numpy()[0] + # limit range + f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0 + f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6') + + f0_pred = f0_pred * (2 ** (shift_semi / 12)) + + 
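+    # quantize the (optionally shifted) F0 contour into 345 log-spaced bins between C2 and C#6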
f0_pred = log_f0(f0_pred, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + f0_pred = torch.from_numpy(f0_pred).float().unsqueeze(0).to(device) + + noise_scheduler = DDIMScheduler(num_train_timesteps=1000) + generator = torch.Generator(device=device).manual_seed(2024) + + noise_scheduler.set_timesteps(steps) + noise = torch.randn(source_mel.shape, generator=generator, device=device) + pred = noise + source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel) + + for t in tqdm(noise_scheduler.timesteps): + pred = noise_scheduler.scale_model_input(pred, t) + model_output = model(x=pred, mean=source_x, f0=f0_pred, t=t, ref=None, embed=None) + pred = noise_scheduler.step(model_output=model_output, + timestep=t, + sample=pred, + eta=1, generator=generator).prev_sample + + pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel) + + pred_audio = hifigan(pred) + pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1) + + return pred_audio + + +if __name__ == '__main__': + min_mel = np.log(1e-5) + max_mel = 2.5 + sr = 24000 + + use_gpu = torch.cuda.is_available() + device = 'cuda' if use_gpu else 'cpu' + + # load diffusion model + config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader) + mel_cfg = config['logmel'] + ddpm_cfg = config['ddpm'] + unet_cfg = config['unet'] + model = UNetPitcher(**unet_cfg) + unet_path = 'ckpts/world_fixed_40.pt' + + state_dict = torch.load(unet_path) + for key in list(state_dict.keys()): + state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key) + model.load_state_dict(state_dict) + if use_gpu: + model.cuda() + model.eval() + + # load vocoder + hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt' + hifigan, cfg = load_model(hifi_path, device=device) + hifigan.eval() + + # load pitch predictor + pitcher = PitchFormer(100, 512).to(device) + ckpt = torch.load('ckpts/ckpt_transformer_pitch/transformer_pitch_360.pt') + pitcher.load_state_dict(ckpt) + pitcher.eval() + + pred_audio = score_pitcher('examples/score_vocal.wav', 'examples/score_midi.midi', model, hifigan, pitcher, steps=50) + sf.write('output_score.wav', pred_audio, samplerate=sr) + + + + diff --git a/.ipynb_checkpoints/template_based_apc-checkpoint.py b/.ipynb_checkpoints/template_based_apc-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..7ecdf7a6d92d1e763656e003e4718ea2e5853d3d --- /dev/null +++ b/.ipynb_checkpoints/template_based_apc-checkpoint.py @@ -0,0 +1,89 @@ +import os.path + +import numpy as np +import pandas as pd +import torch +import yaml +import librosa +import soundfile as sf +from tqdm import tqdm + +from diffusers import DDIMScheduler +from pitch_controller.models.unet import UNetPitcher +from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff +from pitch_controller.modules.BigVGAN.inference import load_model +from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0 + + +@torch.no_grad() +def template_pitcher(source, pitch_ref, model, hifigan, steps=50, shift_semi=0): + + source_mel = get_world_mel(source, sr=sr) + + f0_ref = get_matched_f0(source, pitch_ref, 'world') + f0_ref = f0_ref * 2 ** (shift_semi / 12) + + f0_ref = log_f0(f0_ref, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + + source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device) + f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device) + + noise_scheduler 
= DDIMScheduler(num_train_timesteps=1000) + generator = torch.Generator(device=device).manual_seed(2024) + + noise_scheduler.set_timesteps(steps) + noise = torch.randn(source_mel.shape, generator=generator, device=device) + pred = noise + source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel) + + for t in tqdm(noise_scheduler.timesteps): + pred = noise_scheduler.scale_model_input(pred, t) + model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None) + pred = noise_scheduler.step(model_output=model_output, + timestep=t, + sample=pred, + eta=1, generator=generator).prev_sample + + pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel) + + pred_audio = hifigan(pred) + pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1) + + return pred_audio + + +if __name__ == '__main__': + min_mel = np.log(1e-5) + max_mel = 2.5 + sr = 24000 + + use_gpu = torch.cuda.is_available() + device = 'cuda' if use_gpu else 'cpu' + + # load diffusion model + config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader) + mel_cfg = config['logmel'] + ddpm_cfg = config['ddpm'] + unet_cfg = config['unet'] + model = UNetPitcher(**unet_cfg) + unet_path = 'ckpts/world_fixed_40.pt' + + state_dict = torch.load(unet_path) + for key in list(state_dict.keys()): + state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key) + model.load_state_dict(state_dict) + if use_gpu: + model.cuda() + model.eval() + + # load vocoder + hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt' + hifigan, cfg = load_model(hifi_path, device=device) + hifigan.eval() + + pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav', model, hifigan, steps=50, shift_semi=0) + sf.write('output_template.wav', pred_audio, samplerate=sr) + + diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..c9f28873837de10d64c45aed99bfda06c18d340d 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,86 @@ ---- -license: mit ---- + + +# Diff-Pitcher (PyTorch) + +Official Pytorch Implementation of [Diff-Pitcher: Diffusion-based Singing Voice Pitch Correction](https://engineering.jhu.edu/lcap/data/uploads/pdfs/waspaa2023_hai.pdf) + +-------------------- + +Thank you all for your interest in this research project. I am currently optimizing the model's performance and computation efficiency. I plan to release a user-friendly version, either a GUI or a VST, in the first half of this year, and will update the open-source license. + +If you are familiar with PyTorch, you can follow [Code Examples](#examples) to use Diff-Pitcher. 
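+
+Assuming the checkpoints have been downloaded into `ckpts/` (the default paths hard-coded in both entry scripts), a minimal run on the bundled examples looks like:
+
+```bash
+pip install -r requirements.txt
+# template-based APC: corrects examples/off-key.wav using examples/reference.wav as the pitch template
+python template_based_apc.py   # writes output_template.wav
+# score-based APC: corrects examples/score_vocal.wav against examples/score_midi.midi
+python score_based_apc.py      # writes output_score.wav
+```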
+ +-------------------- + +Diff-Pitcher + +- [Demo Page](#demo) +- [Todo List](#todo) +- [Code Examples](#examples) +- [References](#references) +- [Acknowledgement](#acknowledgement) + +## Demo + +🎵 Listen to [examples](https://jhu-lcap.github.io/Diff-Pitcher/) + +## Todo +- [x] Update codes and demo +- [x] Support 🤗 [Diffusers](https://github.com/huggingface/diffusers) +- [x] Upload checkpoints +- [x] Pipeline tutorial +- [ ] Merge to [Your-Stable-Audio](https://github.com/haidog-yaqub/Your-Stable-Audio) +- [ ] Audio Plugin Support +## Examples +- Download checkpoints: 🎒[ckpts](https://github.com/haidog-yaqub/DiffPitcher/tree/main/ckpts) +- Prepare environment: [requirements.txt](requirements.txt) +- Feel free to try: + - template-based automatic pitch correction: [template_based_apc.py](template_based_apc.py) + - score-based automatic pitch correction: [score_based_apc.py](score_based_apc.py) + + +## References + +If you find the code useful for your research, please consider citing: + +```bibtex +@inproceedings{hai2023diff, + title={Diff-Pitcher: Diffusion-Based Singing Voice Pitch Correction}, + author={Hai, Jiarui and Elhilali, Mounya}, + booktitle={2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)}, + pages={1--5}, + year={2023}, + organization={IEEE} +} +``` + +This repo is inspired by: + +```bibtex +@article{popov2021diffusion, + title={Diffusion-based voice conversion with fast maximum likelihood sampling scheme}, + author={Popov, Vadim and Vovk, Ivan and Gogoryan, Vladimir and Sadekova, Tasnima and Kudinov, Mikhail and Wei, Jiansheng}, + journal={arXiv preprint arXiv:2109.13821}, + year={2021} +} +``` +```bibtex +@inproceedings{liu2022diffsinger, + title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism}, + author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Zhao, Zhou}, + booktitle={Proceedings of the AAAI conference on artificial intelligence}, + volume={36}, + number={10}, + pages={11020--11028}, + year={2022} +} +``` + +## Acknowledgement + +[Welcome to LCAP! 
< LCAP (jhu.edu)](https://engineering.jhu.edu/lcap/) + +We borrow code from following repos: + + - `Diffusion Schedulers` are based on 🤗 [Diffusers](https://github.com/huggingface/diffusers) + - `2D UNet` is based on [DiffVC](https://github.com/huawei-noah/Speech-Backbones/tree/main/DiffVC) diff --git a/examples/off-key.wav b/examples/off-key.wav new file mode 100644 index 0000000000000000000000000000000000000000..9f4d731509f33531784f06a90ea7ecfb9f8e58a0 Binary files /dev/null and b/examples/off-key.wav differ diff --git a/examples/reference.wav b/examples/reference.wav new file mode 100644 index 0000000000000000000000000000000000000000..5b88dd80696b392e278ce005cba1a7a535ecd768 Binary files /dev/null and b/examples/reference.wav differ diff --git a/examples/score_midi.midi b/examples/score_midi.midi new file mode 100644 index 0000000000000000000000000000000000000000..aa32cd3eb5f4e5a55ea95ad04fc61922de25adef Binary files /dev/null and b/examples/score_midi.midi differ diff --git a/examples/score_midi.npy b/examples/score_midi.npy new file mode 100644 index 0000000000000000000000000000000000000000..fe94b503a4dfa5370e8c30f6b5a1dfcd1b95128c --- /dev/null +++ b/examples/score_midi.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7baacba4afb8813d057e420cd63853657401403b6c798f6cb7f06673e7dcea5a +size 559232 diff --git a/examples/score_vocal.wav b/examples/score_vocal.wav new file mode 100644 index 0000000000000000000000000000000000000000..93979f26d61d29819bacd3ad91dd5ec2a4f3d5ec Binary files /dev/null and b/examples/score_vocal.wav differ diff --git a/output_score.wav b/output_score.wav new file mode 100644 index 0000000000000000000000000000000000000000..1fdfe65dfcd230f209309782d636d5d1c9c8f3e0 Binary files /dev/null and b/output_score.wav differ diff --git a/output_template.wav b/output_template.wav new file mode 100644 index 0000000000000000000000000000000000000000..c3dbd20ce4ca06cba47d44e21478eb5f76ade960 Binary files /dev/null and b/output_template.wav differ diff --git a/pitch_controller/README.md b/pitch_controller/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3cc28a7375b89a65bb54f54db6c1b0a3393457f9 --- /dev/null +++ b/pitch_controller/README.md @@ -0,0 +1 @@ +# Diffusion-based Pitch Controller diff --git a/pitch_controller/__pycache__/utils.cpython-310.pyc b/pitch_controller/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9986b33f07576240fb93bbbb15f247cb6c1097e9 Binary files /dev/null and b/pitch_controller/__pycache__/utils.cpython-310.pyc differ diff --git a/pitch_controller/config/DiffWorld_24k.yaml b/pitch_controller/config/DiffWorld_24k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bddae2a73780500bfb9783180a0823648dbe7631 --- /dev/null +++ b/pitch_controller/config/DiffWorld_24k.yaml @@ -0,0 +1,24 @@ +version: 1.0 + +logmel: + n_mels: 100 + sampling_rate: 24000 + n_fft: 1024 + hop_size: 256 + max: 2.5 + min: -12 + +unet: + dim_base: 256 + use_embed: False + dim_embed: None + use_ref_t: False + dim_cond: 128 + dim_mults: [1, 2, 4] + +ddpm: + num_train_steps: 1000 + inference_steps: 100 + eta: 0.8 + + diff --git a/pitch_controller/data/example/f0/p225_001.wav.npy b/pitch_controller/data/example/f0/p225_001.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..df726e935d388f9bff0757baad41a51b6928e7a8 --- /dev/null +++ b/pitch_controller/data/example/f0/p225_001.wav.npy @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8df28ae08ef686e7c7e523fdde25b62fbd05725cdacc043cde407a898182272f +size 1672 diff --git a/pitch_controller/data/example/mel/p225_001.wav.npy b/pitch_controller/data/example/mel/p225_001.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..5a9db81767bb94382793a71c6fd8d51e2585c6e4 --- /dev/null +++ b/pitch_controller/data/example/mel/p225_001.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf3c0e6956f57acdd82f5d91f6390ce148d89066faedbdd6f6ac8c48d1d2c76 +size 77328 diff --git a/pitch_controller/data/example/wav/p225_001.wav b/pitch_controller/data/example/wav/p225_001.wav new file mode 100644 index 0000000000000000000000000000000000000000..bdc14bce583af4fb51e3382ec852e30e3dbf62f6 Binary files /dev/null and b/pitch_controller/data/example/wav/p225_001.wav differ diff --git a/pitch_controller/data/example/world/p225_001.wav.npy b/pitch_controller/data/example/world/p225_001.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..19bb6564a4c130e2373f00189e4f5601ddcd0030 --- /dev/null +++ b/pitch_controller/data/example/world/p225_001.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00d5eb7fa9df26321df3f3df06e2ff44c3b3732cc5179ef135e41ffeb3a3b82 +size 77328 diff --git a/pitch_controller/data/prepare_f0.py b/pitch_controller/data/prepare_f0.py new file mode 100644 index 0000000000000000000000000000000000000000..664f276509e810688c330c014806fea8a96f2e6f --- /dev/null +++ b/pitch_controller/data/prepare_f0.py @@ -0,0 +1,66 @@ +# import amfm_decompy.basic_tools as basic +# import amfm_decompy.pYAAPT as pYAAPT +from multiprocessing import Process +import os +import numpy as np +import pandas as pd +import librosa +from librosa.core import load +from tqdm import tqdm + + +def get_f0(wav_path): + wav, _ = load(wav_path, sr=24000) + wav = wav[:(wav.shape[0] // 256) * 256] + wav = np.pad(wav, 384, mode='reflect') + f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False, + fmin=librosa.note_to_hz('C2'), + fmax=librosa.note_to_hz('C6')) + return np.nan_to_num(f0) + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_f0(subset): + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['subset'] == 'train'] + # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/'] + + for i in tqdm(subset): + line = meta.iloc[i] + audio_dir = '../raw_data/' + line['folder'] + line['subfolder'] + f = line['file_name'] + + f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_data_f0/') + + try: + np.load(os.path.join(f0_dir, f+'.npy')) + except: + print(line) + f0 = get_f0(os.path.join(audio_dir, f)) + if os.path.exists(f0_dir) is False: + os.makedirs(f0_dir, exist_ok=True) + np.save(os.path.join(f0_dir, f + '.npy'), f0) + + # if os.path.exists(os.path.join(f0_dir, f+'.npy')) is False: + # f0 = get_yaapt_f0(os.path.join(audio_dir, f)) + + +if __name__ == '__main__': + cores = 8 + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['subset']=='train'] + # meta = meta[meta['folder'] == 'VCTK-Corpus/vocal/'] + + idx_list = [i for i in range(len(meta))] + + subsets = chunks(idx_list, cores) + + for subset in subsets: + t = Process(target=extract_f0, args=(subset,)) + t.start() diff --git a/pitch_controller/data/prepare_mel.py b/pitch_controller/data/prepare_mel.py new file mode 100644 index 
0000000000000000000000000000000000000000..5b441b64071040eed73fc4b9fe0688f9edb419dc --- /dev/null +++ b/pitch_controller/data/prepare_mel.py @@ -0,0 +1,72 @@ +import os +import numpy as np + +import librosa +from librosa.core import load +from librosa.filters import mel as librosa_mel_fn +mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000) + +from tqdm import tqdm +import pandas as pd + +from multiprocessing import Process + + +# def get_f0(wav_path): +# wav, _ = load(wav_path, sr=22050) +# wav = wav[:(wav.shape[0] // 256) * 256] +# wav = np.pad(wav, 384, mode='reflect') +# f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False, +# fmin=librosa.note_to_hz('C2'), +# fmax=librosa.note_to_hz('C6')) +# return np.nan_to_num(f0) + +def get_mel(wav_path): + wav, _ = load(wav_path, sr=24000) + wav = wav[:(wav.shape[0] // 256)*256] + wav = np.pad(wav, 384, mode='reflect') + stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) + stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) + mel_spectrogram = np.matmul(mel_basis, stftm) + log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) + return log_mel_spectrogram + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_mel(subset): + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['folder'] == 'eval/vocal/'] + + for i in tqdm(subset): + line = meta.iloc[i] + audio_dir = '../raw_data/' + line['folder'] + line['subfolder'] + f = line['file_name'] + + mel_dir = audio_dir.replace('vocal', 'mel').replace('raw_data/', '24k_data/') + + if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False: + mel = get_mel(os.path.join(audio_dir, f)) + if os.path.exists(mel_dir) is False: + os.makedirs(mel_dir) + np.save(os.path.join(mel_dir, f+'.npy'), mel) + + +if __name__ == '__main__': + cores = 8 + + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['folder'] == 'eval/vocal/'] + + idx_list = [i for i in range(len(meta))] + + subsets = chunks(idx_list, cores) + + for subset in subsets: + t = Process(target=extract_mel, args=(subset,)) + t.start() diff --git a/pitch_controller/data/prepare_world.py b/pitch_controller/data/prepare_world.py new file mode 100644 index 0000000000000000000000000000000000000000..651f84a9c654b89b7bb365720fd16a2ba366067e --- /dev/null +++ b/pitch_controller/data/prepare_world.py @@ -0,0 +1,85 @@ +from multiprocessing import Process +import os +import numpy as np + +import librosa +from librosa.core import load +from librosa.filters import mel as librosa_mel_fn +mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000) + +from tqdm import tqdm +import pandas as pd +import pyworld as pw + + +def get_world_mel(wav_path, sr=24000): + wav, _ = librosa.load(wav_path, sr=sr) + wav = (wav * 32767).astype(np.int16) + wav = (wav / 32767).astype(np.float64) + # wav = wav.astype(np.float64) + wav = wav[:(wav.shape[0] // 256) * 256] + + _f0, t = pw.dio(wav, sr, frame_period=256/sr*1000) + f0 = pw.stonemask(wav, _f0, t, sr) + sp = pw.cheaptrick(wav, f0, t, sr) + ap = pw.d4c(wav, f0, t, sr) + wav_hat = pw.synthesize(f0 * 0, sp, ap, sr, frame_period=256/sr*1000) + + # pyworld output does not pad left + wav_hat = wav_hat[:len(wav)] + # wav_hat = wav_hat[256//2: len(wav)+256//2] + assert len(wav_hat) == len(wav) + wav = wav_hat.astype(np.float32) + wav = np.pad(wav, 384, 
mode='reflect') + stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) + stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) + mel_spectrogram = np.matmul(mel_basis, stftm) + log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) + + return log_mel_spectrogram, f0 + + +def chunks(arr, m): + result = [[] for i in range(m)] + for i in range(len(arr)): + result[i%m].append(arr[i]) + return result + + +def extract_pw(subset, save_f0=False): + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['subset'] == 'train'] + + for i in tqdm(subset): + line = meta.iloc[i] + audio_dir = '../raw_data/' + line['folder'] + line['subfolder'] + f = line['file_name'] + + mel_dir = audio_dir.replace('vocal', 'world').replace('raw_data/', '24k_data/') + f0_dir = audio_dir.replace('vocal', 'f0').replace('raw_data/', '24k_f0/') + + if os.path.exists(os.path.join(mel_dir, f+'.npy')) is False: + mel = get_world_mel(os.path.join(audio_dir, f)) + + if os.path.exists(mel_dir) is False: + os.makedirs(mel_dir) + np.save(os.path.join(mel_dir, f+'.npy'), mel) + + if save_f0 is True: + if os.path.exists(f0_dir) is False: + os.makedirs(f0_dir) + np.save(os.path.join(f0_dir, f + '.npy'), f0) + + +if __name__ == '__main__': + cores = 8 + meta = pd.read_csv('../raw_data/meta_fix.csv') + meta = meta[meta['subset'] == 'train'] + + idx_list = [i for i in range(len(meta))] + + subsets = chunks(idx_list, cores) + + for subset in subsets: + t = Process(target=extract_pw, args=(subset,)) + t.start() \ No newline at end of file diff --git a/pitch_controller/dataset/__init__.py b/pitch_controller/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02ee5fcb455bf5d6bccdeb860cb28a3065d356a0 --- /dev/null +++ b/pitch_controller/dataset/__init__.py @@ -0,0 +1 @@ +from .diff_lpc import VCDecLPCDataset, VCDecLPCBatchCollate, VCDecLPCTest \ No newline at end of file diff --git a/pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc b/pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dd0168f76ea87c34f3effbd02c94fc1b3889228 Binary files /dev/null and b/pitch_controller/dataset/__pycache__/__init__.cpython-310.pyc differ diff --git a/pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc b/pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc8a0a636619ddb9e002e29ae0bfab48a9d3160d Binary files /dev/null and b/pitch_controller/dataset/__pycache__/__init__.cpython-39.pyc differ diff --git a/pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc b/pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec3ae2e3219fc24868acce4d96470302d5b0c8a1 Binary files /dev/null and b/pitch_controller/dataset/__pycache__/content_enc.cpython-310.pyc differ diff --git a/pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc b/pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6e47d6c54c00baee291bbbbfda90872a5d7a756 Binary files /dev/null and b/pitch_controller/dataset/__pycache__/content_enc.cpython-39.pyc differ diff --git a/pitch_controller/dataset/__pycache__/diff.cpython-310.pyc b/pitch_controller/dataset/__pycache__/diff.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8cbf3f34ac9208da3b873d56088129f16cfda236 Binary files /dev/null and b/pitch_controller/dataset/__pycache__/diff.cpython-310.pyc differ diff --git a/pitch_controller/dataset/__pycache__/diff.cpython-39.pyc b/pitch_controller/dataset/__pycache__/diff.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b891cf87ea4f8ef38e1a969c2797b426fc90fbdd Binary files /dev/null and b/pitch_controller/dataset/__pycache__/diff.cpython-39.pyc differ diff --git a/pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc b/pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e61812d6f0a77d6b1ac9e89f97d1e01636e03ff6 Binary files /dev/null and b/pitch_controller/dataset/__pycache__/diff_lpc.cpython-310.pyc differ diff --git a/pitch_controller/dataset/diff_lpc.py b/pitch_controller/dataset/diff_lpc.py new file mode 100644 index 0000000000000000000000000000000000000000..a3fd77f18b339d463709c5ff38ae0721576ea951 --- /dev/null +++ b/pitch_controller/dataset/diff_lpc.py @@ -0,0 +1,271 @@ +import os +import random +import numpy as np +import torch +import tgt +import pandas as pd + +from torch.utils.data import Dataset +import librosa + + +def f0_to_coarse(f0, hparams): + f0_bin = hparams['f0_bin'] + f0_max = hparams['f0_max'] + f0_min = hparams['f0_min'] + is_torch = isinstance(f0, torch.Tensor) + # to mel scale + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) + + unvoiced = (f0_mel == 0) + + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + + f0_mel[unvoiced] = 0 + + f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +def log_f0(f0, hparams): + f0_bin = hparams['f0_bin'] + f0_max = hparams['f0_max'] + f0_min = hparams['f0_min'] + + f0_mel = np.zeros_like(f0) + f0_mel[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1 + f0_mel_min = 12*np.log2(f0_min/f0_min) + 1 + f0_mel_max = 12*np.log2(f0_max/f0_min) + 1 + + unvoiced = (f0_mel == 0) + + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + + f0_mel[unvoiced] = 0 + + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= (f0_bin-1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +# training "average voice" encoder +class VCDecLPCDataset(Dataset): + def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False, + f0_type='bins'): + self.path = data_dir + meta = pd.read_csv(data_dir + 'meta_fix.csv') + self.meta = meta[meta['subset'] == subset] + self.content_dir = content_dir + self.extract_emb = extract_emb + self.f0_type = f0_type + + def get_vc_data(self, audio_path, mel_id): + mel_dir = audio_path.replace('vocal', 'mel') + embed_dir = audio_path.replace('vocal', 'embed') + pitch_dir = audio_path.replace('vocal', 'f0') + content_dir = audio_path.replace('vocal', self.content_dir) + + mel = os.path.join(mel_dir, mel_id + '.npy') + embed = os.path.join(embed_dir, mel_id + '.npy') + pitch = os.path.join(pitch_dir, mel_id + '.npy') + content = 
os.path.join(content_dir, mel_id + '.npy') + + mel = np.load(mel) + if self.extract_emb: + embed = np.load(embed) + else: + embed = np.zeros(1) + + pitch = np.load(pitch) + content = np.load(content) + + pitch = np.nan_to_num(pitch) + if self.f0_type == 'bins': + pitch = f0_to_coarse(pitch, {'f0_bin': 256, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C6')}) + elif self.f0_type == 'log': + pitch = log_f0(pitch, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + + mel = torch.from_numpy(mel).float() + embed = torch.from_numpy(embed).float() + pitch = torch.from_numpy(pitch).float() + content = torch.from_numpy(content).float() + + return (mel, embed, pitch, content) + + def __getitem__(self, index): + row = self.meta.iloc[index] + mel_id = row['file_name'] + audio_path = self.path + row['folder'] + row['subfolder'] + mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id) + item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content} + return item + + def __len__(self): + return len(self.meta) + + +class VCDecLPCBatchCollate(object): + def __init__(self, train_frames, eps=1e-5): + self.train_frames = train_frames + self.eps = eps + + def __call__(self, batch): + train_frames = self.train_frames + eps = self.eps + + B = len(batch) + embed = torch.stack([item['embed'] for item in batch], 0) + + n_mels = batch[0]['mel'].shape[0] + content_dim = batch[0]['content'].shape[0] + + # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain + mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps) + mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps) + + # ! need to deal with empty frames here + contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * np.log(eps) + + f0s1 = torch.zeros((B, train_frames), dtype=torch.float32) + max_starts = [max(item['mel'].shape[-1] - train_frames, 0) + for item in batch] + + starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] + starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] + mel_lengths = [] + for i, item in enumerate(batch): + mel = item['mel'] + f0 = item['f0'] + content = item['content'] + + if mel.shape[-1] < train_frames: + mel_length = mel.shape[-1] + else: + mel_length = train_frames + + mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length] + f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length] + contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length] + + mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length] + mel_lengths.append(mel_length) + + mel_lengths = torch.LongTensor(mel_lengths) + + return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths, + 'embed': embed, + 'f0_1': f0s1, + 'content1': contents1} + + +class VCDecLPCTest(Dataset): + def __init__(self, data_dir, subset='test', eps=1e-5, test_frames=256, content_dir='lpc_mel_512', extract_emb=False, + f0_type='bins'): + self.path = data_dir + meta = pd.read_csv(data_dir + 'meta_test.csv') + self.meta = meta[meta['subset'] == subset] + self.content_dir = content_dir + self.extract_emb = extract_emb + self.eps = eps + self.test_frames = test_frames + self.f0_type = f0_type + + def get_vc_data(self, audio_path, mel_id, pitch_shift): + mel_dir = audio_path.replace('vocal', 'mel') + embed_dir = audio_path.replace('vocal', 'embed') + pitch_dir = audio_path.replace('vocal', 'f0') + content_dir = 
audio_path.replace('vocal', self.content_dir) + + mel = os.path.join(mel_dir, mel_id + '.npy') + embed = os.path.join(embed_dir, mel_id + '.npy') + pitch = os.path.join(pitch_dir, mel_id + '.npy') + content = os.path.join(content_dir, mel_id + '.npy') + + mel = np.load(mel) + if self.extract_emb: + embed = np.load(embed) + else: + embed = np.zeros(1) + + pitch = np.load(pitch) + content = np.load(content) + + pitch = np.nan_to_num(pitch) + pitch = pitch*pitch_shift + + if self.f0_type == 'bins': + pitch = f0_to_coarse(pitch, {'f0_bin': 256, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C6')}) + elif self.f0_type == 'log': + pitch = log_f0(pitch, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + + mel = torch.from_numpy(mel).float() + embed = torch.from_numpy(embed).float() + pitch = torch.from_numpy(pitch).float() + content = torch.from_numpy(content).float() + + return (mel, embed, pitch, content) + + def __getitem__(self, index): + row = self.meta.iloc[index] + + mel_id = row['content_file_name'] + audio_path = self.path + row['content_folder'] + row['content_subfolder'] + pitch_shift = row['pitch_shift'] + mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift) + + mel_id = row['timbre_file_name'] + audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder'] + mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift) + + n_mels = mel1.shape[0] + content_dim = content.shape[0] + + mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps) + mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps) + lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * np.log(self.eps) + + f0s1 = torch.zeros(self.test_frames, dtype=torch.float32) + + if mel1.shape[-1] < self.test_frames: + mel_length = mel1.shape[-1] + else: + mel_length = self.test_frames + mels1[:, :mel_length] = mel1[:, :mel_length] + f0s1[:mel_length] = f0[:mel_length] + lpcs1[:, :mel_length] = content[:, :mel_length] + + if mel2.shape[-1] < self.test_frames: + mel_length = mel2.shape[-1] + else: + mel_length = self.test_frames + mels2[:, :mel_length] = mel2[:, :mel_length] + + return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1} + + def __len__(self): + return len(self.meta) + + +if __name__ == '__main__': + f0 = np.array([110.0, 220.0, librosa.note_to_hz('C2'), 0, librosa.note_to_hz('E3'), librosa.note_to_hz('C6')]) + # 50 midi notes = (50-1) + pitch = log_f0(f0, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) \ No newline at end of file diff --git a/pitch_controller/dataset/diff_lpc_content.py b/pitch_controller/dataset/diff_lpc_content.py new file mode 100644 index 0000000000000000000000000000000000000000..a1263ee8cd332c7c64aad683c08d201a04b80883 --- /dev/null +++ b/pitch_controller/dataset/diff_lpc_content.py @@ -0,0 +1,231 @@ +import os +import random +import numpy as np +import torch +import tgt +import pandas as pd + +from torch.utils.data import Dataset +import librosa + + +def f0_to_coarse(f0, hparams): + f0_bin = hparams['f0_bin'] + f0_max = hparams['f0_max'] + f0_min = hparams['f0_min'] + is_torch = isinstance(f0, torch.Tensor) + # to mel scale + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) + + unvoiced = (f0_mel == 0) + + 
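+    # map voiced frames linearly onto bins 1..(f0_bin - 1); bin 0 is kept for unvoiced frames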
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + + f0_mel[unvoiced] = 0 + + f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +# training "average voice" encoder +class VCDecLPCDataset(Dataset): + def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False): + self.path = data_dir + meta = pd.read_csv(data_dir + 'meta_fix.csv') + self.meta = meta[meta['subset'] == subset] + self.content_dir = content_dir + self.extract_emb = extract_emb + + def get_vc_data(self, audio_path, mel_id): + mel_dir = audio_path.replace('vocal', 'mel') + embed_dir = audio_path.replace('vocal', 'embed') + pitch_dir = audio_path.replace('vocal', 'f0') + content_dir = audio_path.replace('vocal', self.content_dir) + + mel = os.path.join(mel_dir, mel_id + '.npy') + embed = os.path.join(embed_dir, mel_id + '.npy') + pitch = os.path.join(pitch_dir, mel_id + '.npy') + content = os.path.join(content_dir, mel_id + '.npy') + + mel = np.load(mel) + if self.extract_emb: + embed = np.load(embed) + else: + embed = np.zeros(1) + + pitch = np.load(pitch) + content = np.load(content) + + pitch = np.nan_to_num(pitch) + pitch = f0_to_coarse(pitch, {'f0_bin': 256, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C6')}) + + mel = torch.from_numpy(mel).float() + embed = torch.from_numpy(embed).float() + pitch = torch.from_numpy(pitch).float() + content = torch.from_numpy(content).float() + + return (mel, embed, pitch, content) + + def __getitem__(self, index): + row = self.meta.iloc[index] + mel_id = row['file_name'] + audio_path = self.path + row['folder'] + row['subfolder'] + mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id) + item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content} + return item + + def __len__(self): + return len(self.meta) + + +class VCDecLPCBatchCollate(object): + def __init__(self, train_frames, eps=np.log(1e-5), content_eps=np.log(1e-12)): + self.train_frames = train_frames + self.eps = eps + self.content_eps = content_eps + + def __call__(self, batch): + train_frames = self.train_frames + eps = self.eps + content_eps = self.content_eps + + B = len(batch) + embed = torch.stack([item['embed'] for item in batch], 0) + + n_mels = batch[0]['mel'].shape[0] + content_dim = batch[0]['content'].shape[0] + + # min value of log-mel spectrogram is np.log(eps) == padding zero in time domain + mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps + mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * eps + + # using a different eps + contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * content_eps + + f0s1 = torch.zeros((B, train_frames), dtype=torch.float32) + max_starts = [max(item['mel'].shape[-1] - train_frames, 0) + for item in batch] + + starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] + starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts] + mel_lengths = [] + for i, item in enumerate(batch): + mel = item['mel'] + f0 = item['f0'] + content = item['content'] + + if mel.shape[-1] < train_frames: + mel_length = mel.shape[-1] + else: + mel_length = train_frames + + mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length] + f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + 
mel_length] + contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length] + + mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length] + mel_lengths.append(mel_length) + + mel_lengths = torch.LongTensor(mel_lengths) + + return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths, + 'embed': embed, + 'f0_1': f0s1, + 'content1': contents1} + + +class VCDecLPCTest(Dataset): + def __init__(self, data_dir, subset='test', eps=np.log(1e-5), content_eps=np.log(1e-12), test_frames=256, content_dir='lpc_mel_512', extract_emb=False): + self.path = data_dir + meta = pd.read_csv(data_dir + 'meta_test.csv') + self.meta = meta[meta['subset'] == subset] + self.content_dir = content_dir + self.extract_emb = extract_emb + self.eps = eps + self.content_eps = content_eps + self.test_frames = test_frames + + def get_vc_data(self, audio_path, mel_id, pitch_shift): + mel_dir = audio_path.replace('vocal', 'mel') + embed_dir = audio_path.replace('vocal', 'embed') + pitch_dir = audio_path.replace('vocal', 'f0') + content_dir = audio_path.replace('vocal', self.content_dir) + + mel = os.path.join(mel_dir, mel_id + '.npy') + embed = os.path.join(embed_dir, mel_id + '.npy') + pitch = os.path.join(pitch_dir, mel_id + '.npy') + content = os.path.join(content_dir, mel_id + '.npy') + + mel = np.load(mel) + if self.extract_emb: + embed = np.load(embed) + else: + embed = np.zeros(1) + + pitch = np.load(pitch) + content = np.load(content) + + pitch = np.nan_to_num(pitch) + pitch = pitch*pitch_shift + pitch = f0_to_coarse(pitch, {'f0_bin': 256, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C6')}) + + mel = torch.from_numpy(mel).float() + embed = torch.from_numpy(embed).float() + pitch = torch.from_numpy(pitch).float() + content = torch.from_numpy(content).float() + + return (mel, embed, pitch, content) + + def __getitem__(self, index): + row = self.meta.iloc[index] + + mel_id = row['content_file_name'] + audio_path = self.path + row['content_folder'] + row['content_subfolder'] + pitch_shift = row['pitch_shift'] + mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift) + + mel_id = row['timbre_file_name'] + audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder'] + mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift) + + n_mels = mel1.shape[0] + content_dim = content.shape[0] + + mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps + mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * self.eps + # content + lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * self.content_eps + + f0s1 = torch.zeros(self.test_frames, dtype=torch.float32) + + if mel1.shape[-1] < self.test_frames: + mel_length = mel1.shape[-1] + else: + mel_length = self.test_frames + mels1[:, :mel_length] = mel1[:, :mel_length] + f0s1[:mel_length] = f0[:mel_length] + lpcs1[:, :mel_length] = content[:, :mel_length] + + if mel2.shape[-1] < self.test_frames: + mel_length = mel2.shape[-1] + else: + mel_length = self.test_frames + mels2[:, :mel_length] = mel2[:, :mel_length] + + return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1} + + def __len__(self): + return len(self.meta) + + + diff --git a/pitch_controller/load_vocoder.py b/pitch_controller/load_vocoder.py new file mode 100644 index 0000000000000000000000000000000000000000..1d61fbc0a259861a69e7a4cad245a8f460b12707 --- /dev/null +++ b/pitch_controller/load_vocoder.py @@ -0,0 +1,51 @@ +# from 
nsf_hifigan.models import load_model +from modules.BigVGAN.inference import load_model +import librosa + +import torch +import torch.nn.functional as F +import torchaudio +import torchaudio.transforms as transforms + +import numpy as np +import soundfile as sf + + +class LogMelSpectrogram(torch.nn.Module): + def __init__(self): + super().__init__() + self.melspctrogram = transforms.MelSpectrogram( + sample_rate=22050, + n_fft=1024, + win_length=1024, + hop_length=256, + center=False, + power=1.0, + norm="slaney", + n_mels=80, + mel_scale="slaney", + f_max=8000, + f_min=0, + ) + + def forward(self, wav): + wav = F.pad(wav, ((1024 - 256) // 2, (1024 - 256) // 2), "reflect") + mel = self.melspctrogram(wav) + logmel = torch.log(torch.clamp(mel, min=1e-5)) + return logmel + + +hifigan, cfg = load_model('modules/BigVGAN/ckpt/bigvgan_22khz_80band/g_05000000', device='cuda') +M = LogMelSpectrogram() + +source, sr = torchaudio.load("music.mp3") +source = torchaudio.functional.resample(source, sr, 22050) +source = source.unsqueeze(0) +mel = M(source).squeeze(0) + +# f0, f0_bin = get_pitch("116_1_pred.wav") +# f0 = torch.tensor(f0).unsqueeze(0) +with torch.no_grad(): + y_hat = hifigan(mel.cuda()).cpu().numpy().squeeze(1) + +sf.write('test.wav', y_hat[0], samplerate=22050) \ No newline at end of file diff --git a/pitch_controller/models/__pycache__/base.cpython-310.pyc b/pitch_controller/models/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9054e2273f7445bb67fbbc6dc8c4606ec04924d1 Binary files /dev/null and b/pitch_controller/models/__pycache__/base.cpython-310.pyc differ diff --git a/pitch_controller/models/__pycache__/base.cpython-39.pyc b/pitch_controller/models/__pycache__/base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..461a7872ff2e1d23ec111d6f250c3c8cc4728166 Binary files /dev/null and b/pitch_controller/models/__pycache__/base.cpython-39.pyc differ diff --git a/pitch_controller/models/__pycache__/modules.cpython-310.pyc b/pitch_controller/models/__pycache__/modules.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df9d7ed57017530ef7b4c5aeff750f3722a24395 Binary files /dev/null and b/pitch_controller/models/__pycache__/modules.cpython-310.pyc differ diff --git a/pitch_controller/models/__pycache__/modules.cpython-39.pyc b/pitch_controller/models/__pycache__/modules.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61d0a80a28ac7f5398bdbfa16099009012b4792e Binary files /dev/null and b/pitch_controller/models/__pycache__/modules.cpython-39.pyc differ diff --git a/pitch_controller/models/__pycache__/pitch.cpython-39.pyc b/pitch_controller/models/__pycache__/pitch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7a520d1d49ca153688c6cd64179734bffa9732a Binary files /dev/null and b/pitch_controller/models/__pycache__/pitch.cpython-39.pyc differ diff --git a/pitch_controller/models/__pycache__/unet.cpython-310.pyc b/pitch_controller/models/__pycache__/unet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9a5bff3d0e6fb01be2a1b9bbe236ec5ca5a6e81 Binary files /dev/null and b/pitch_controller/models/__pycache__/unet.cpython-310.pyc differ diff --git a/pitch_controller/models/__pycache__/unet.cpython-39.pyc b/pitch_controller/models/__pycache__/unet.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66469b07f2add05927d1b7094014dc4ddca437e3 
Binary files /dev/null and b/pitch_controller/models/__pycache__/unet.cpython-39.pyc differ diff --git a/pitch_controller/models/__pycache__/update_unet.cpython-310.pyc b/pitch_controller/models/__pycache__/update_unet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19b214b3997b5cf6fe01e7dfcfef408d2ac7aab1 Binary files /dev/null and b/pitch_controller/models/__pycache__/update_unet.cpython-310.pyc differ diff --git a/pitch_controller/models/__pycache__/utils.cpython-310.pyc b/pitch_controller/models/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3b953957b48c64d44a111d45f8db0d28d723b43 Binary files /dev/null and b/pitch_controller/models/__pycache__/utils.cpython-310.pyc differ diff --git a/pitch_controller/models/__pycache__/utils.cpython-39.pyc b/pitch_controller/models/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeb6b27427a6bf7de53487337de85b96921b4846 Binary files /dev/null and b/pitch_controller/models/__pycache__/utils.cpython-39.pyc differ diff --git a/pitch_controller/models/base.py b/pitch_controller/models/base.py new file mode 100644 index 0000000000000000000000000000000000000000..7c7395ddeba674eea0cb59594b9b2c838ae78c55 --- /dev/null +++ b/pitch_controller/models/base.py @@ -0,0 +1,30 @@ +# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# This program is free software; you can redistribute it and/or modify +# it under the terms of the MIT License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MIT License for more details. + +import numpy as np +import torch + + +class BaseModule(torch.nn.Module): + def __init__(self): + super(BaseModule, self).__init__() + + @property + def nparams(self): + num_params = 0 + for name, param in self.named_parameters(): + if param.requires_grad: + num_params += np.prod(param.detach().cpu().numpy().shape) + return num_params + + def relocate_input(self, x: list): + device = next(self.parameters()).device + for i in range(len(x)): + if isinstance(x[i], torch.Tensor) and x[i].device != device: + x[i] = x[i].to(device) + return x diff --git a/pitch_controller/models/modules.py b/pitch_controller/models/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..da76268c128bfc9acc587d4db138f44ef180d5cc --- /dev/null +++ b/pitch_controller/models/modules.py @@ -0,0 +1,237 @@ +# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# This program is free software; you can redistribute it and/or modify +# it under the terms of the MIT License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MIT License for more details. 
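+
+# Building blocks for the diffusion UNet: Mish activation, up/down-sampling, Rezero/ResNet blocks,
+# linear attention, sinusoidal timestep and pitch embeddings, and a reference-timbre encoder.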
+ +import math +import torch +from einops import rearrange + +from .base import BaseModule + + +class Mish(BaseModule): + def forward(self, x): + return x * torch.tanh(torch.nn.functional.softplus(x)) + + +class Upsample(BaseModule): + def __init__(self, dim): + super(Upsample, self).__init__() + self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Downsample(BaseModule): + def __init__(self, dim): + super(Downsample, self).__init__() + self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Rezero(BaseModule): + def __init__(self, fn): + super(Rezero, self).__init__() + self.fn = fn + self.g = torch.nn.Parameter(torch.zeros(1)) + + def forward(self, x): + return self.fn(x) * self.g + + +class Block(BaseModule): + def __init__(self, dim, dim_out, groups=8): + super(Block, self).__init__() + self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3, + padding=1), torch.nn.GroupNorm( + groups, dim_out), Mish()) + + def forward(self, x): + output = self.block(x) + return output + + +class ResnetBlock(BaseModule): + def __init__(self, dim, dim_out, time_emb_dim, groups=8): + super(ResnetBlock, self).__init__() + self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, + dim_out)) + + self.block1 = Block(dim, dim_out, groups=groups) + self.block2 = Block(dim_out, dim_out, groups=groups) + if dim != dim_out: + self.res_conv = torch.nn.Conv2d(dim, dim_out, 1) + else: + self.res_conv = torch.nn.Identity() + + def forward(self, x, time_emb): + h = self.block1(x) + h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1) + h = self.block2(h) + output = h + self.res_conv(x) + return output + + +class LinearAttention(BaseModule): + def __init__(self, dim, heads=4, dim_head=32, q_norm=True): + super(LinearAttention, self).__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1) + self.q_norm = q_norm + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', + heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + if self.q_norm: + q = q.softmax(dim=-2) + + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', + heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class Residual(BaseModule): + def __init__(self, fn): + super(Residual, self).__init__() + self.fn = fn + + def forward(self, x, *args, **kwargs): + output = self.fn(x, *args, **kwargs) + x + return output + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the + embeddings. :return: an [N x dim] Tensor of positional embeddings. 
+ """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +class Timesteps(BaseModule): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): + super().__init__() + self.num_channels = num_channels + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + + def forward(self, timesteps): + t_emb = get_timestep_embedding( + timesteps, + self.num_channels, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + ) + return t_emb + + +class PitchPosEmb(BaseModule): + def __init__(self, dim, flip_sin_to_cos=False, downscale_freq_shift=0): + super(PitchPosEmb, self).__init__() + self.dim = dim + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + + def forward(self, x): + # B * L + b, l = x.shape + x = rearrange(x, 'b l -> (b l)') + emb = get_timestep_embedding( + x, + self.dim, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + ) + emb = rearrange(emb, '(b l) d -> b d l', b=b, l=l) + return emb + + +class TimbreBlock(BaseModule): + def __init__(self, out_dim): + super(TimbreBlock, self).__init__() + base_dim = out_dim // 4 + + self.block11 = torch.nn.Sequential(torch.nn.Conv2d(1, 2 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(2 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.block12 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 2 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(2 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.block21 = torch.nn.Sequential(torch.nn.Conv2d(base_dim, 4 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(4 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.block22 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 4 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(4 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.block31 = torch.nn.Sequential(torch.nn.Conv2d(2 * base_dim, 8 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(8 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.block32 = torch.nn.Sequential(torch.nn.Conv2d(4 * base_dim, 8 * base_dim, + 3, 1, 1), + torch.nn.InstanceNorm2d(8 * base_dim, affine=True), + torch.nn.GLU(dim=1)) + self.final_conv = torch.nn.Conv2d(4 * base_dim, out_dim, 1) + + def forward(self, x): + y = self.block11(x) + y = self.block12(y) + y = self.block21(y) + y = self.block22(y) + y = self.block31(y) + y = self.block32(y) + y = self.final_conv(y) + + return y.sum((2, 3)) / (y.shape[2] * y.shape[3]) \ No newline at end of file diff --git a/pitch_controller/models/unet.py b/pitch_controller/models/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ba05dd79cfac6e0deabf39034b834b4f701512 --- /dev/null +++ b/pitch_controller/models/unet.py @@ -0,0 +1,153 @@ +import math +import torch + +from .base 
import BaseModule +from .modules import Mish, Upsample, Downsample, Rezero, Block, ResnetBlock +from .modules import LinearAttention, Residual, Timesteps, TimbreBlock, PitchPosEmb + +from einops import rearrange + + +class UNetPitcher(BaseModule): + def __init__(self, + dim_base, + dim_cond, + use_ref_t, + use_embed, + dim_embed=256, + dim_mults=(1, 2, 4), + pitch_type='bins'): + + super(UNetPitcher, self).__init__() + self.use_ref_t = use_ref_t + self.use_embed = use_embed + self.pitch_type = pitch_type + + dim_in = 2 + + # time embedding + self.time_pos_emb = Timesteps(num_channels=dim_base, + flip_sin_to_cos=True, + downscale_freq_shift=0) + + self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_base, dim_base * 4), + Mish(), torch.nn.Linear(dim_base * 4, dim_base)) + + # speaker embedding + timbre_total = 0 + if use_ref_t: + self.ref_block = TimbreBlock(out_dim=dim_cond) + timbre_total += dim_cond + if use_embed: + timbre_total += dim_embed + + if timbre_total != 0: + self.timbre_block = torch.nn.Sequential( + torch.nn.Linear(timbre_total, 4 * dim_cond), + Mish(), + torch.nn.Linear(4 * dim_cond, dim_cond)) + + if use_embed or use_ref_t: + dim_in += dim_cond + + self.pitch_pos_emb = PitchPosEmb(dim_cond) + self.pitch_mlp = torch.nn.Sequential( + torch.nn.Conv1d(dim_cond, dim_cond * 4, 1, stride=1), + Mish(), + torch.nn.Conv1d(dim_cond * 4, dim_cond, 1, stride=1), ) + dim_in += dim_cond + + # pitch embedding + # if self.pitch_type == 'bins': + # print('using mel bins for f0') + # elif self.pitch_type == 'log': + # print('using log bins f0') + + dims = [dim_in, *map(lambda m: dim_base * m, dim_mults)] + in_out = list(zip(dims[:-1], dims[1:])) + # blocks + self.downs = torch.nn.ModuleList([]) + self.ups = torch.nn.ModuleList([]) + num_resolutions = len(in_out) + + for ind, (dim_in, dim_out) in enumerate(in_out): + is_last = ind >= (num_resolutions - 1) + self.downs.append(torch.nn.ModuleList([ + ResnetBlock(dim_in, dim_out, time_emb_dim=dim_base), + ResnetBlock(dim_out, dim_out, time_emb_dim=dim_base), + Residual(Rezero(LinearAttention(dim_out))), + Downsample(dim_out) if not is_last else torch.nn.Identity()])) + + mid_dim = dims[-1] + self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) + self.mid_attn = Residual(Rezero(LinearAttention(mid_dim))) + self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim_base) + + for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): + self.ups.append(torch.nn.ModuleList([ + ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim_base), + ResnetBlock(dim_in, dim_in, time_emb_dim=dim_base), + Residual(Rezero(LinearAttention(dim_in))), + Upsample(dim_in)])) + self.final_block = Block(dim_base, dim_base) + self.final_conv = torch.nn.Conv2d(dim_base, 1, 1) + + def forward(self, x, mean, f0, t, ref=None, embed=None): + if not torch.is_tensor(t): + t = torch.tensor([t], dtype=torch.long, device=x.device) + if len(t.shape) == 0: + t = t * torch.ones(x.shape[0], dtype=t.dtype, device=x.device) + + t = self.time_pos_emb(t) + t = self.mlp(t) + + x = torch.stack([x, mean], 1) + + f0 = self.pitch_pos_emb(f0) + f0 = self.pitch_mlp(f0) + f0 = f0.unsqueeze(2) + f0 = torch.cat(x.shape[2] * [f0], 2) + + timbre = None + if self.use_ref_t: + ref = torch.stack([ref], 1) + timbre = self.ref_block(ref) + if self.use_embed: + if timbre is not None: + timbre = torch.cat([timbre, embed], 1) + else: + timbre = embed + if timbre is None: + # raise Exception("at least use one timbre condition") + condition = f0 + else: + timbre = 
self.timbre_block(timbre).unsqueeze(-1).unsqueeze(-1) + timbre = torch.cat(x.shape[2] * [timbre], 2) + timbre = torch.cat(x.shape[3] * [timbre], 3) + condition = torch.cat([f0, timbre], 1) + + x = torch.cat([x, condition], 1) + + hiddens = [] + for resnet1, resnet2, attn, downsample in self.downs: + x = resnet1(x, t) + x = resnet2(x, t) + x = attn(x) + hiddens.append(x) + x = downsample(x) + + x = self.mid_block1(x, t) + x = self.mid_attn(x) + x = self.mid_block2(x, t) + + for resnet1, resnet2, attn, upsample in self.ups: + x = torch.cat((x, hiddens.pop()), dim=1) + x = resnet1(x, t) + x = resnet2(x, t) + x = attn(x) + x = upsample(x) + + x = self.final_block(x) + output = self.final_conv(x) + + return output.squeeze(1) \ No newline at end of file diff --git a/pitch_controller/models/utils.py b/pitch_controller/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..985298131db8980f9016c3851eab4505529430b9 --- /dev/null +++ b/pitch_controller/models/utils.py @@ -0,0 +1,110 @@ +# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. +# This program is free software; you can redistribute it and/or modify +# it under the terms of the MIT License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MIT License for more details. + +import torch +import torchaudio +import numpy as np +from librosa.filters import mel as librosa_mel_fn + +from .base import BaseModule + + +def mse_loss(x, y, mask, n_feats): + loss = torch.sum(((x - y)**2) * mask) + return loss / (torch.sum(mask) * n_feats) + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(int(max_length), dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def fix_len_compatibility(length, num_downsamplings_in_unet=2): + while True: + if length % (2**num_downsamplings_in_unet) == 0: + return length + length += 1 + + +class PseudoInversion(BaseModule): + def __init__(self, n_mels, sampling_rate, n_fft): + super(PseudoInversion, self).__init__() + self.n_mels = n_mels + self.sampling_rate = sampling_rate + self.n_fft = n_fft + mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=0, fmax=8000) + mel_basis_inverse = np.linalg.pinv(mel_basis) + mel_basis_inverse = torch.from_numpy(mel_basis_inverse).float() + self.register_buffer("mel_basis_inverse", mel_basis_inverse) + + def forward(self, log_mel_spectrogram): + mel_spectrogram = torch.exp(log_mel_spectrogram) + stftm = torch.matmul(self.mel_basis_inverse, mel_spectrogram) + return stftm + + +class InitialReconstruction(BaseModule): + def __init__(self, n_fft, hop_size): + super(InitialReconstruction, self).__init__() + self.n_fft = n_fft + self.hop_size = hop_size + window = torch.hann_window(n_fft).float() + self.register_buffer("window", window) + + def forward(self, stftm): + real_part = torch.ones_like(stftm, device=stftm.device) + imag_part = torch.zeros_like(stftm, device=stftm.device) + stft = torch.stack([real_part, imag_part], -1)*stftm.unsqueeze(-1) + istft = torch.istft(stft, n_fft=self.n_fft, + hop_length=self.hop_size, win_length=self.n_fft, + window=self.window, center=True) + return istft.unsqueeze(1) + + +# Fast Griffin-Lim 
algorithm as a PyTorch module +class FastGL(BaseModule): + def __init__(self, n_mels, sampling_rate, n_fft, hop_size, momentum=0.99): + super(FastGL, self).__init__() + self.n_mels = n_mels + self.sampling_rate = sampling_rate + self.n_fft = n_fft + self.hop_size = hop_size + self.momentum = momentum + self.pi = PseudoInversion(n_mels, sampling_rate, n_fft) + self.ir = InitialReconstruction(n_fft, hop_size) + window = torch.hann_window(n_fft).float() + self.register_buffer("window", window) + + @torch.no_grad() + def forward(self, s, n_iters=32): + c = self.pi(s) + x = self.ir(c) + x = x.squeeze(1) + c = c.unsqueeze(-1) + prev_angles = torch.zeros_like(c, device=c.device) + for _ in range(n_iters): + s = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_size, + win_length=self.n_fft, window=self.window, + center=True) + real_part, imag_part = s.unbind(-1) + stftm = torch.sqrt(torch.clamp(real_part**2 + imag_part**2, min=1e-8)) + angles = s / stftm.unsqueeze(-1) + s = c * (angles + self.momentum * (angles - prev_angles)) + x = torch.istft(s, n_fft=self.n_fft, hop_length=self.hop_size, + win_length=self.n_fft, window=self.window, + center=True) + prev_angles = angles + return x.unsqueeze(1) diff --git a/pitch_controller/modules/BigVGAN/LICENSE b/pitch_controller/modules/BigVGAN/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e9663595cc28938f88d6299acd3ba791542e4c0c --- /dev/null +++ b/pitch_controller/modules/BigVGAN/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 NVIDIA CORPORATION. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/README.md b/pitch_controller/modules/BigVGAN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6cff37786a486deb55bc070254027aa492c2e92 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/README.md @@ -0,0 +1,95 @@ +## BigVGAN: A Universal Neural Vocoder with Large-Scale Training +#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Catanzaro, Sungroh Yoon + +
+ + +### [Paper](https://arxiv.org/abs/2206.04658) +### [Audio demo](https://bigvgan-demo.github.io/) + +## Installation +Clone the repository and install dependencies. +```shell +# the codebase has been tested on Python 3.8 / 3.10 with PyTorch 1.12.1 / 1.13 conda binaries +git clone https://github.com/NVIDIA/BigVGAN +pip install -r requirements.txt +``` + +Create symbolic link to the root of the dataset. The codebase uses filelist with the relative path from the dataset. Below are the example commands for LibriTTS dataset. +``` shell +cd LibriTTS && \ +ln -s /path/to/your/LibriTTS/train-clean-100 train-clean-100 && \ +ln -s /path/to/your/LibriTTS/train-clean-360 train-clean-360 && \ +ln -s /path/to/your/LibriTTS/train-other-500 train-other-500 && \ +ln -s /path/to/your/LibriTTS/dev-clean dev-clean && \ +ln -s /path/to/your/LibriTTS/dev-other dev-other && \ +ln -s /path/to/your/LibriTTS/test-clean test-clean && \ +ln -s /path/to/your/LibriTTS/test-other test-other && \ +cd .. +``` + +## Training +Train BigVGAN model. Below is an example command for training BigVGAN using LibriTTS dataset at 24kHz with a full 100-band mel spectrogram as input. +```shell +python train.py \ +--config configs/bigvgan_24khz_100band.json \ +--input_wavs_dir LibriTTS \ +--input_training_file LibriTTS/train-full.txt \ +--input_validation_file LibriTTS/val-full.txt \ +--list_input_unseen_wavs_dir LibriTTS LibriTTS \ +--list_input_unseen_validation_file LibriTTS/dev-clean.txt LibriTTS/dev-other.txt \ +--checkpoint_path exp/bigvgan +``` + +## Synthesis +Synthesize from BigVGAN model. Below is an example command for generating audio from the model. +It computes mel spectrograms using wav files from `--input_wavs_dir` and saves the generated audio to `--output_dir`. +```shell +python inference.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_wavs_dir /path/to/your/input_wav \ +--output_dir /path/to/your/output_wav +``` + +`inference_e2e.py` supports synthesis directly from the mel spectrogram saved in `.npy` format, with shapes `[1, channel, frame]` or `[channel, frame]`. +It loads mel spectrograms from `--input_mels_dir` and saves the generated audio to `--output_dir`. + +Make sure that the STFT hyperparameters for mel spectrogram are the same as the model, which are defined in `config.json` of the corresponding model. +```shell +python inference_e2e.py \ +--checkpoint_file exp/bigvgan/g_05000000 \ +--input_mels_dir /path/to/your/input_mel \ +--output_dir /path/to/your/output_wav +``` + +## Pretrained Models +We provide the [pretrained models](https://drive.google.com/drive/folders/1e9wdM29d-t3EHUpBb8T4dcHrkYGAXTgq). +One can download the checkpoints of generator (e.g., g_05000000) and discriminator (e.g., do_05000000) within the listed folders. + +|Folder Name|Sampling Rate|Mel band|fmax|Params.|Dataset|Fine-Tuned| +|------|---|---|---|---|------|---| +|bigvgan_24khz_100band|24 kHz|100|12000|112M|LibriTTS|No| +|bigvgan_base_24khz_100band|24 kHz|100|12000|14M|LibriTTS|No| +|bigvgan_22khz_80band|22 kHz|80|8000|112M|LibriTTS + VCTK + LJSpeech|No| +|bigvgan_base_22khz_80band|22 kHz|80|8000|14M|LibriTTS + VCTK + LJSpeech|No| + +The paper results are based on 24kHz BigVGAN models trained on LibriTTS dataset. +We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications. +Note that, the latest checkpoints use ``snakebeta`` activation with log scale parameterization, which have the best overall quality. 
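
For programmatic use inside this repository, the `inference.py` module in this folder exposes a `load_model` helper that reads the `config.json` stored next to a generator checkpoint and returns the generator together with its hyperparameters. Below is a minimal usage sketch, not part of the original BigVGAN README: the checkpoint path and the random mel are placeholders (a real input should be a log-mel computed with the STFT settings from the model's `config.json`), and the import assumes the repository root is on `PYTHONPATH`.
```python
import torch
from pitch_controller.modules.BigVGAN.inference import load_model

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load_model() reads config.json from the checkpoint's folder, loads the
# generator weights, removes weight norm, and returns (generator, AttrDict of hyperparameters)
generator, h = load_model('exp/bigvgan/g_05000000', device=device)

# vocode a [batch, n_mels, frames] log-mel spectrogram into a waveform
mel = torch.randn(1, h.num_mels, 200, device=device)  # placeholder input
with torch.no_grad():
    audio = generator(mel).squeeze().clamp(-1, 1).cpu()
```
The returned tensor is a mono waveform in [-1, 1]; it can be scaled by `MAX_WAV_VALUE` and written to disk with `scipy.io.wavfile.write`, as the repository's `utils.save_audio` does.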
+ + +## TODO + +Current codebase only provides a plain PyTorch implementation for the filtered nonlinearity. We are working on a fast CUDA kernel implementation, which will be released in the future. + + +## References +* [HiFi-GAN](https://github.com/jik876/hifi-gan) (for generator and multi-period discriminator) + +* [Snake](https://github.com/EdwardDixon/snake) (for periodic activation) + +* [Alias-free-torch](https://github.com/junjun3518/alias-free-torch) (for anti-aliasing) + +* [Julius](https://github.com/adefossez/julius) (for low-pass filter) + +* [UnivNet](https://github.com/mindslab-ai/univnet) (for multi-resolution discriminator) \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc b/pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b56d1a9e093afa8f993d94a084b8324456c464c3 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/__pycache__/env.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc b/pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8db7c95c1b8844d87c40655c5e6dc2e60d001ed6 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/__pycache__/inference.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc b/pitch_controller/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e63b2b64f00899b7ec2910443d7058cfb05570 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/__pycache__/meldataset.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/__pycache__/models.cpython-310.pyc b/pitch_controller/modules/BigVGAN/__pycache__/models.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ac2618df919f7988dfce771f50d5cef688949ec Binary files /dev/null and b/pitch_controller/modules/BigVGAN/__pycache__/models.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/__pycache__/utils.cpython-310.pyc b/pitch_controller/modules/BigVGAN/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb138b675d58dd1889d453e13cc0d7a2b8f72177 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/__pycache__/utils.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc b/pitch_controller/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce906239d790c95b8db90a1cc29ff741f0251aa1 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/activations/__pycache__/activations.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/activations/activations.py b/pitch_controller/modules/BigVGAN/activations/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..61f2808a5466b3cf4d041059700993af5527dd29 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/activations/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import torch +from torch import nn, sin, pow +from torch.nn import Parameter + + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: # log scale alphas initialized to zeros + self.alpha = Parameter(torch.zeros(in_features) * alpha) + self.beta = Parameter(torch.zeros(in_features) * alpha) + else: # linear scale alphas initialized to ones + self.alpha = Parameter(torch.ones(in_features) * alpha) + self.beta = Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. 
+ SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/__init__.py b/pitch_controller/modules/BigVGAN/alias_free_torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2318b63198250856809c0cb46210a4147b829bc --- /dev/null +++ b/pitch_controller/modules/BigVGAN/alias_free_torch/__init__.py @@ -0,0 +1,6 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +from .filter import * +from .resample import * +from .act import * \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cc31973147be0273b1383745a8adc15b1668d48 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/__init__.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..13167af90a4b6e49dabc891572964efa3ca0ae88 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/act.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e67190a4c1ae41d9a06db70a0d4849e29cf4fa1 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/filter.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e33be87a2dfc070e8cdb527e9c7204a90f2f433 Binary files /dev/null and b/pitch_controller/modules/BigVGAN/alias_free_torch/__pycache__/resample.cpython-310.pyc differ diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/act.py b/pitch_controller/modules/BigVGAN/alias_free_torch/act.py new file mode 100644 index 0000000000000000000000000000000000000000..028debd697dd60458aae75010057df038bd3518a --- /dev/null +++ b/pitch_controller/modules/BigVGAN/alias_free_torch/act.py @@ -0,0 +1,28 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. 
+ +import torch.nn as nn +from .resample import UpSample1d, DownSample1d + + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/filter.py b/pitch_controller/modules/BigVGAN/alias_free_torch/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad6ea87c1f10ddd94c544037791d7a4634d5ae1 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/alias_free_torch/filter.py @@ -0,0 +1,95 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. 
+ super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, self.filter.expand(C, -1, -1), + stride=self.stride, groups=C) + + return out \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/alias_free_torch/resample.py b/pitch_controller/modules/BigVGAN/alias_free_torch/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..750e6c3402cc5ac939c4b9d075246562e0e1d1a7 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/alias_free_torch/resample.py @@ -0,0 +1,49 @@ +# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 +# LICENSE is in incl_licenses directory. + +import torch.nn as nn +from torch.nn import functional as F +from .filter import LowPassFilter1d +from .filter import kaiser_sinc_filter1d + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/env.py b/pitch_controller/modules/BigVGAN/env.py new file mode 100644 index 0000000000000000000000000000000000000000..b8be238d4db710c8c9a338d336baea0138f18d1f --- /dev/null +++ b/pitch_controller/modules/BigVGAN/env.py @@ -0,0 +1,18 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ +import os +import shutil + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def build_env(config, config_name, path): + t_path = os.path.join(path, config_name) + if config != t_path: + os.makedirs(path, exist_ok=True) + shutil.copyfile(config, os.path.join(path, config_name)) \ No newline at end of file diff --git a/pitch_controller/modules/BigVGAN/inference.py b/pitch_controller/modules/BigVGAN/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a739344db3ec9ae08560e5477a394cca32d4a6d9 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/inference.py @@ -0,0 +1,36 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +from __future__ import absolute_import, division, print_function, unicode_literals + +import glob +import os +import argparse +import json +import torch +from scipy.io.wavfile import write +from .env import AttrDict +from .utils import MAX_WAV_VALUE +from .models import BigVGAN as Generator +import librosa + + +def load_model(model_path, device='cuda'): + config_file = os.path.join(os.path.split(model_path)[0], 'config.json') + with open(config_file) as f: + data = f.read() + + global h + json_config = json.loads(data) + + h = AttrDict(json_config) + + generator = Generator(h).to(device) + + cp_dict = torch.load(model_path, map_location=device) + generator.load_state_dict(cp_dict['generator']) + generator.eval() + generator.remove_weight_norm() + del cp_dict + return generator, h + diff --git a/pitch_controller/modules/BigVGAN/models.py b/pitch_controller/modules/BigVGAN/models.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb40e0cff7819dcbe69555520253afd64580720 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/models.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. 
+ + +import torch +import torch.nn.functional as F +import torch.nn as nn +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm + +from .activations import activations +from .utils import init_weights, get_padding +from .alias_free_torch import * + +LRELU_SLOPE = 0.1 + + +class AMPBlock1(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. 
check the config file and look for 'activation'.") + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class AMPBlock2(torch.nn.Module): + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + def forward(self, x): + for c, a in zip (self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class BigVGAN(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super(BigVGAN, self).__init__() + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. 
does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append(nn.ModuleList([ + weight_norm(ConvTranspose1d(h.upsample_initial_channel // (2 ** i), + h.upsample_initial_channel // (2 ** (i + 1)), + k, u, padding=(k - u) // 2)) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError("activation incorrectly specified. check the config file and look for 'activation'.") + + self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) + + # weight initialization + for i in range(len(self.ups)): + self.ups[i].apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + print('Removing weight norm...') + for l in self.ups: + for l_i in l: + remove_weight_norm(l_i) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, h, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.d_mult = h.discriminator_channel_mult + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList([ + norm_f(Conv2d(1, int(32*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(32*self.d_mult), int(128*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(128*self.d_mult), int(512*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(512*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), + norm_f(Conv2d(int(1024*self.d_mult), int(1024*self.d_mult), (kernel_size, 1), 1, padding=(2, 0))), + ]) + self.conv_post = norm_f(Conv2d(int(1024*self.d_mult), 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in 
self.convs: + x = l(x) + x = F.leaky_relu(x, LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, h): + super(MultiPeriodDiscriminator, self).__init__() + self.mpd_reshapes = h.mpd_reshapes + print("mpd_reshapes: {}".format(self.mpd_reshapes)) + discriminators = [DiscriminatorP(h, rs, use_spectral_norm=h.use_spectral_norm) for rs in self.mpd_reshapes] + self.discriminators = nn.ModuleList(discriminators) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorR(nn.Module): + def __init__(self, cfg, resolution): + super().__init__() + + self.resolution = resolution + assert len(self.resolution) == 3, \ + "MRD layer requires list with len=3, got {}".format(self.resolution) + self.lrelu_slope = LRELU_SLOPE + + norm_f = weight_norm if cfg.use_spectral_norm == False else spectral_norm + if hasattr(cfg, "mrd_use_spectral_norm"): + print("INFO: overriding MRD use_spectral_norm as {}".format(cfg.mrd_use_spectral_norm)) + norm_f = weight_norm if cfg.mrd_use_spectral_norm == False else spectral_norm + self.d_mult = cfg.discriminator_channel_mult + if hasattr(cfg, "mrd_channel_mult"): + print("INFO: overriding mrd channel multiplier as {}".format(cfg.mrd_channel_mult)) + self.d_mult = cfg.mrd_channel_mult + + self.convs = nn.ModuleList([ + norm_f(nn.Conv2d(1, int(32*self.d_mult), (3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(int(32*self.d_mult), int(32*self.d_mult), (3, 3), padding=(1, 1))), + ]) + self.conv_post = norm_f(nn.Conv2d(int(32 * self.d_mult), 1, (3, 3), padding=(1, 1))) + + def forward(self, x): + fmap = [] + + x = self.spectrogram(x) + x = x.unsqueeze(1) + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, self.lrelu_slope) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + def spectrogram(self, x): + n_fft, hop_length, win_length = self.resolution + x = F.pad(x, (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect') + x = x.squeeze(1) + x = torch.stft(x, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False, return_complex=True) + x = torch.view_as_real(x) # [B, F, TT, 2] + mag = torch.norm(x, p=2, dim =-1) #[B, F, TT] + + return mag + + +class MultiResolutionDiscriminator(nn.Module): + def __init__(self, cfg, debug=False): + super().__init__() + self.resolutions = cfg.resolutions + assert len(self.resolutions) == 3,\ + "MRD requires list of list with len=3, each element having a list with len=3. 
got {}".\ + format(self.resolutions) + self.discriminators = nn.ModuleList( + [DiscriminatorR(cfg, resolution) for resolution in self.resolutions] + ) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(x=y) + y_d_g, fmap_g = d(x=y_hat) + y_d_rs.append(y_d_r) + fmap_rs.append(fmap_r) + y_d_gs.append(y_d_g) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + loss += torch.mean(torch.abs(rl - gl)) + + return loss*2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + r_loss = torch.mean((1-dr)**2) + g_loss = torch.mean(dg**2) + loss += (r_loss + g_loss) + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + l = torch.mean((1-dg)**2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + diff --git a/pitch_controller/modules/BigVGAN/utils.py b/pitch_controller/modules/BigVGAN/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed67f356aef6ce3af01b43d97d8aafb31c57b017 --- /dev/null +++ b/pitch_controller/modules/BigVGAN/utils.py @@ -0,0 +1,81 @@ +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import glob +import os +import matplotlib +import torch +from torch.nn.utils import weight_norm +matplotlib.use("Agg") +import matplotlib.pylab as plt +from scipy.io.wavfile import write + +MAX_WAV_VALUE = 32768.0 + + +def plot_spectrogram(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none') + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def plot_spectrogram_clipped(spectrogram, clip_max=2.): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", + interpolation='none', vmin=1e-6, vmax=clip_max) + plt.colorbar(im, ax=ax) + + fig.canvas.draw() + plt.close() + + return fig + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def apply_weight_norm(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + weight_norm(m) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def load_checkpoint(filepath, device): + assert os.path.isfile(filepath) + print("Loading '{}'".format(filepath)) + checkpoint_dict = torch.load(filepath, map_location=device) + print("Complete.") + return checkpoint_dict + + +def save_checkpoint(filepath, obj): + print("Saving checkpoint to {}".format(filepath)) + torch.save(obj, filepath) + print("Complete.") + + +def scan_checkpoint(cp_dir, prefix): + pattern = os.path.join(cp_dir, prefix + '????????') + cp_list = glob.glob(pattern) + if len(cp_list) == 0: + return None + return sorted(cp_list)[-1] + +def save_audio(audio, path, sr): + # wav: torch with 1d shape + audio = audio * MAX_WAV_VALUE + audio = audio.cpu().numpy().astype('int16') + write(path, sr, audio) \ No newline at end of file diff --git a/pitch_controller/train_world_tuner_24k.py 
b/pitch_controller/train_world_tuner_24k.py new file mode 100644 index 0000000000000000000000000000000000000000..f9b35b9692b9a9add781cfcf16f831cb6bfbff8f --- /dev/null +++ b/pitch_controller/train_world_tuner_24k.py @@ -0,0 +1,237 @@ +import os, json, argparse, yaml +import numpy as np +from tqdm import tqdm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from torch.cuda.amp import autocast, GradScaler + +from diffusers import DDIMScheduler + +from dataset import VCDecLPCDataset, VCDecLPCBatchCollate, VCDecLPCTest +from models.unet import UNetVC +from modules.BigVGAN.inference import load_model +from utils import save_plot, save_audio +from utils import minmax_norm_diff, reverse_minmax_norm_diff + + +parser = argparse.ArgumentParser() +parser.add_argument('-config', type=str, default='config/DiffWorld_24k_log.yaml') + +parser.add_argument('-seed', type=int, default=98) +parser.add_argument('-amp', type=bool, default=True) +parser.add_argument('-compile', type=bool, default=False) + +parser.add_argument('-data_dir', type=str, default='../24k_center/') +parser.add_argument('-lpc_dir', type=str, default='world') +parser.add_argument('-vocoder_dir', type=str, default='modules/BigVGAN/ckpt/bigvgan_base_24khz_100band/g_05000000') + +parser.add_argument('-train_frames', type=int, default=128) +parser.add_argument('-batch_size', type=int, default=32) +parser.add_argument('-test_size', type=int, default=1) +parser.add_argument('-num_workers', type=int, default=4) +parser.add_argument('-lr', type=float, default=5e-5) +parser.add_argument('-weight_decay', type=int, default=1e-6) + +parser.add_argument('-epochs', type=int, default=80) +parser.add_argument('-save_every', type=int, default=2) +parser.add_argument('-log_step', type=int, default=200) +parser.add_argument('-log_dir', type=str, default='logs_dec_world_24k') +parser.add_argument('-ckpt_dir', type=str, default='ckpt_world_24k') + +args = parser.parse_args() +args.save_ori = True +config = yaml.load(open(args.config), Loader=yaml.FullLoader) +mel_cfg = config['logmel'] +ddpm_cfg = config['ddpm'] +unet_cfg = config['unet'] +f0_type = unet_cfg['pitch_type'] + +if __name__ == "__main__": + torch.manual_seed(args.seed) + np.random.seed(args.seed) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = True + else: + args.device = 'cpu' + + if os.path.exists(args.log_dir) is False: + os.makedirs(args.log_dir) + + if os.path.exists(args.ckpt_dir) is False: + os.makedirs(args.ckpt_dir) + + print('Initializing vocoder...') + hifigan, cfg = load_model(args.vocoder_dir, device=args.device) + + print('Initializing data loaders...') + train_set = VCDecLPCDataset(args.data_dir, subset='train', content_dir=args.lpc_dir, f0_type=f0_type) + collate_fn = VCDecLPCBatchCollate(args.train_frames) + train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True, + collate_fn=collate_fn, num_workers=args.num_workers, drop_last=True) + + val_set = VCDecLPCTest(args.data_dir, content_dir=args.lpc_dir, f0_type=f0_type) + val_loader = DataLoader(val_set, batch_size=1, shuffle=False) + + print('Initializing and loading models...') + model = UNetVC(**unet_cfg).to(args.device) + print('Number of parameters = %.2fm\n' % (model.nparams / 1e6)) + + # 
prepare DPM scheduler + noise_scheduler = DDIMScheduler(num_train_timesteps=ddpm_cfg['num_train_steps']) + + print('Initializing optimizers...') + optimizer = torch.optim.AdamW(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scaler = GradScaler() + + if args.compile: + model = torch.compile(model) + + print('Start training.') + global_step = 0 + for epoch in range(1, args.epochs + 1): + print(f'Epoch: {epoch} [iteration: {global_step}]') + model.train() + losses = [] + + for step, batch in enumerate(tqdm(train_loader)): + optimizer.zero_grad() + + # make spectrogram range from -1 to 1 + mel = batch['mel1'].to(args.device) + mel = minmax_norm_diff(mel, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + + if unet_cfg["use_ref_t"]: + mel_ref = batch['mel2'].to(args.device) + mel_ref = minmax_norm_diff(mel_ref, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + else: + mel_ref = None + + f0 = batch['f0_1'].to(args.device) + + mean = batch['content1'].to(args.device) + mean = minmax_norm_diff(mean, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + + noise = torch.randn(mel.shape).to(args.device) + timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, + (args.batch_size,), + device=args.device, ).long() + + noisy_mel = noise_scheduler.add_noise(mel, noise, timesteps) + + if args.amp: + with autocast(): + noise_pred = model(x=noisy_mel, mean=mean, f0=f0, t=timesteps, ref=mel_ref, embed=None) + loss = F.mse_loss(noise_pred, noise) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + noise_pred = model(x=noisy_mel, mean=mean, f0=f0, t=timesteps, ref=mel_ref, embed=None) + loss = F.mse_loss(noise_pred, noise) + # Backward propagation + loss.backward() + optimizer.step() + + losses.append(loss.item()) + global_step += 1 + + if global_step % args.log_step == 0: + losses = np.asarray(losses) + # msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) + msg = '\nEpoch: [{}][{}]\t' \ + 'Batch: [{}][{}]\tLoss: {:.6f}\n'.format(epoch, + args.epochs, + step+1, + len(train_loader), + np.mean(losses)) + with open(f'{args.log_dir}/train_dec.log', 'a') as f: + f.write(msg) + losses = [] + + if epoch % args.save_every > 0: + continue + + print('Saving model...\n') + ckpt = model.state_dict() + torch.save(ckpt, f=f"{args.ckpt_dir}/lpc_vc_{epoch}.pt") + + print('Inference...\n') + noise = None + noise_scheduler.set_timesteps(ddpm_cfg['inference_steps']) + model.eval() + with torch.no_grad(): + for i, batch in enumerate(val_loader): + # optimizer.zero_grad() + generator = torch.Generator(device=args.device).manual_seed(args.seed) + + mel = batch['mel1'].to(args.device) + mel = minmax_norm_diff(mel, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + + if unet_cfg["use_ref_t"]: + mel_ref = batch['mel2'].to(args.device) + mel_ref = minmax_norm_diff(mel_ref, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + else: + mel_ref = None + + f0 = batch['f0_1'].to(args.device) + embed = batch['embed'].to(args.device) + + mean = batch['content1'].to(args.device) + mean = minmax_norm_diff(mean, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + + # make spectrogram range from -1 to 1 + if noise is None: + noise = torch.randn(mel.shape, + generator=generator, + device=args.device, + ) + pred = noise + + for t in noise_scheduler.timesteps: + pred = noise_scheduler.scale_model_input(pred, t) + model_output = model(x=pred, mean=mean, f0=f0, t=t, ref=mel_ref, embed=None) + pred = noise_scheduler.step(model_output=model_output, + timestep=t, + sample=pred, + eta=ddpm_cfg['eta'], 
generator=generator).prev_sample + + + if os.path.exists(f'{args.log_dir}/audio/{i}/') is False: + os.makedirs(f'{args.log_dir}/audio/{i}/') + os.makedirs(f'{args.log_dir}/pic/{i}/') + + # save pred + pred = reverse_minmax_norm_diff(pred, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + save_plot(pred.squeeze().cpu(), f'{args.log_dir}/pic/{i}/{epoch}_pred.png') + audio = hifigan(pred) + save_audio(f'{args.log_dir}/audio/{i}/{epoch}_pred.wav', mel_cfg['sampling_rate'], audio) + + if args.save_ori is True: + # save ref + # mel_ref = reverse_minmax_norm_diff(mel_ref, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + # save_plot(mel_ref.squeeze().cpu(), f'{args.log_dir}/pic/{i}/{epoch}_ref.png') + # audio = hifigan(mel_ref) + # save_audio(f'{args.log_dir}/audio/{i}/{epoch}_ref.wav', mel_cfg['sampling_rate'], audio) + + # save source + mel = reverse_minmax_norm_diff(mel, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + save_plot(mel.squeeze().cpu(), f'{args.log_dir}/pic/{i}/{epoch}_source.png') + audio = hifigan(mel) + save_audio(f'{args.log_dir}/audio/{i}/{epoch}_source.wav', mel_cfg['sampling_rate'], audio) + + # save content + mean = reverse_minmax_norm_diff(mean, vmax=mel_cfg['max'], vmin=mel_cfg['min']) + save_plot(mean.squeeze().cpu(), f'{args.log_dir}/pic/{i}/{epoch}_avg.png') + audio = hifigan(mean) + save_audio(f'{args.log_dir}/audio/{i}/{epoch}_avg.wav', mel_cfg['sampling_rate'], audio) + + args.save_ori = False diff --git a/pitch_controller/utils.py b/pitch_controller/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f4aee85dafce5fc9d38ea53ce698e4b0e9262a61 --- /dev/null +++ b/pitch_controller/utils.py @@ -0,0 +1,51 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch +from torch.nn import functional as F + + +def repeat_expand_2d(content, target_len): + # align content with mel + + src_len = content.shape[-1] + target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device) + temp = torch.arange(src_len+1) * target_len / src_len + current_pos = 0 + for i in range(target_len): + if i < temp[current_pos+1]: + target[:, i] = content[:, current_pos] + else: + current_pos += 1 + target[:, i] = content[:, current_pos] + + return target + + +def save_plot(tensor, savepath): + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.savefig(savepath) + plt.close() + + +def save_audio(file_path, sampling_rate, audio): + audio = np.clip(audio.detach().cpu().squeeze().numpy(), -0.999, 0.999) + wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = 2.5, vmin: float = -12) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor \ No newline at end of file diff --git a/pitch_predictor/README.md b/pitch_predictor/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e187365c17c2ab0c25a317f8447e7a629ddfa2ae --- /dev/null +++ b/pitch_predictor/README.md @@ -0,0 +1 @@ +# Pitchformer for Score-based Automatic Pitch Correction diff --git 
a/pitch_predictor/config/Pitchformer.yaml b/pitch_predictor/config/Pitchformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf5802a16e0e2dfe490c790f46d4788f94b9e37f --- /dev/null +++ b/pitch_predictor/config/Pitchformer.yaml @@ -0,0 +1,32 @@ +version: 1.0 + +unet: + sample_size: [1] + # spec_dim: 100 + in_channels: 102 + out_channels: 1 + layers_per_block: 2 + block_out_channels: [256, 256, 256] + down_block_types: + ["DownBlock1D", + "AttnDownBlock1D", + "AttnDownBlock1D", + ] + up_block_types: + ["AttnUpBlock1D", + "AttnUpBlock1D", + "UpBlock1D", + ] + +ddpm: + num_train_steps: 1000 + inference_steps: 100 + eta: 0.8 + +logmel: + n_mels: 100 + sampling_rate: 24000 + n_fft: 1024 + hop_size: 256 + max: 2.5 + min: -12 \ No newline at end of file diff --git a/pitch_predictor/data/example/f0/2.npy b/pitch_predictor/data/example/f0/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..52e268335ea4594a567c425a2746e179f887aa87 --- /dev/null +++ b/pitch_predictor/data/example/f0/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13cdfa6d0a1a6ef29b832b7348c4282abd18784ce391e6f36e8bbd54da9023e +size 3184 diff --git a/pitch_predictor/data/example/midi/2.midi b/pitch_predictor/data/example/midi/2.midi new file mode 100644 index 0000000000000000000000000000000000000000..f5e4a0acc1c5ba6e74416306ff5b8604b008aa41 Binary files /dev/null and b/pitch_predictor/data/example/midi/2.midi differ diff --git a/pitch_predictor/data/example/roll/2.npy b/pitch_predictor/data/example/roll/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..f729483ed02449fb4a4585364c92853769803d38 --- /dev/null +++ b/pitch_predictor/data/example/roll/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:105bb8565c9cbf846fe8f1b93cd051c00c106bf1dd318169b2ed7dfd8707c431 +size 417920 diff --git a/pitch_predictor/data/example/roll_align/2.npy b/pitch_predictor/data/example/roll_align/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..fa47ace58a8c46d6c7a2ebf7862bfd7081b67e9e --- /dev/null +++ b/pitch_predictor/data/example/roll_align/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10f6405ab3a91c36cfe7a6508d74445c08cfe3210a997b9b37286cda4d47639b +size 3184 diff --git a/pitch_predictor/data/example/vocal/2.wav b/pitch_predictor/data/example/vocal/2.wav new file mode 100644 index 0000000000000000000000000000000000000000..0217bed7847312941b1a65a3f31f2fe306ac7f8e Binary files /dev/null and b/pitch_predictor/data/example/vocal/2.wav differ diff --git a/pitch_predictor/data/example/world/2.npy b/pitch_predictor/data/example/world/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..87338585dfccfef4c385e851a94e15910f368924 --- /dev/null +++ b/pitch_predictor/data/example/world/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b53d64476b019714cacf063296021357980461c3c38eedce3d383a3cd3b4ad +size 152928 diff --git a/pitch_predictor/data/preprocess_csd.py b/pitch_predictor/data/preprocess_csd.py new file mode 100644 index 0000000000000000000000000000000000000000..a045472c7b3841a574c8cb6a0cfb498b2659b47b --- /dev/null +++ b/pitch_predictor/data/preprocess_csd.py @@ -0,0 +1,99 @@ +import os.path +import json +from tqdm import tqdm +import pandas as pd +import numpy as np +import textgrid +import pretty_midi +import music21 +import librosa +import soundfile as sf + + +def piano_roll_to_pretty_midi(piano_roll, fs=100, program=0, 
bpm=120): + notes, frames = piano_roll.shape + pm = pretty_midi.PrettyMIDI() + instrument = pretty_midi.Instrument(program=program, ) + + # pad 1 column of zeros so we can acknowledge inital and ending events + piano_roll = np.pad(piano_roll, [(0, 0), (1, 1)], 'constant') + + # use changes in velocities to find note on / note off events + velocity_changes = np.nonzero(np.diff(piano_roll).T) + + # keep track on velocities and note on times + prev_velocities = np.zeros(notes, dtype=int) + note_on_time = np.zeros(notes) + + for time, note in zip(*velocity_changes): + # use time + 1 because of padding above + velocity = piano_roll[note, time + 1] + time = time / fs * bpm / 120 + # time = time / fs + if velocity > 0: + if prev_velocities[note] == 0: + note_on_time[note] = time + prev_velocities[note] = velocity + else: + pm_note = pretty_midi.Note( + velocity=prev_velocities[note], + pitch=note, + start=note_on_time[note], + end=time) + instrument.notes.append(pm_note) + prev_velocities[note] = 0 + pm.instruments.append(instrument) + + beats = np.array([0, int(pm.get_end_time()+1)]) + pm.adjust_times(beats, beats * 120 / bpm) + # print(beats) + return pm + + +f = open('CSD/English/metadata.json', encoding="utf8") +meta = json.load(f) +folder = 'CSD/English/' +for wav in tqdm(os.listdir(folder+'wav')): + song_id = wav.replace('.wav', '') + midi_id = wav.replace('.wav', '.mid') + roll_id = wav.replace('.wav', '.npy') + + wav, sr = librosa.load(folder+'wav/'+wav) + midi = pretty_midi.PrettyMIDI(folder+'mid/'+midi_id) + roll = midi.get_piano_roll() + + bpm = meta[song_id]['tempo'] + + for i in range(int(roll.shape[1])//1000): + # print(i) + start = i*10 + end = (i+1)*10 + + wav_seg = wav[round(start * sr):round(end * sr)] + + os.makedirs('CSD_segements/'+song_id+'/vocal/', exist_ok=True) + os.makedirs('CSD_segements/' + song_id + '/roll/', exist_ok=True) + os.makedirs('CSD_segements/' + song_id + '/midi/', exist_ok=True) + + sf.write('CSD_segements/'+song_id+'/vocal/'+str(i)+'.wav', wav_seg, samplerate=sr) + + cur_roll = roll[:, round(100*start):round(100*end)] + + if round((end-start)*100) != cur_roll.shape[1]: + print(sentence) + print(song_id) + print((end-start)*100) + print(cur_roll.shape) + + # save npy rolls + np.save('CSD_segements/'+song_id+'/roll/'+str(i)+'.npy', cur_roll) + + # save midi files + cur_midi = piano_roll_to_pretty_midi(cur_roll, fs=100, bpm=bpm) + # cur_midi.write('cache/'+song_id+str(num)+'.midi') + cur_midi.write('CSD_segements/'+song_id+'/midi/'+str(i)+'.midi') + # fctr = bpm/120 + # score = music21.converter.Converter() + # score.parseFile('cache/'+song_id+str(num)+'.midi') + # newscore = score.stream.augmentOrDiminish(fctr) + # newscore.write('midi', 'segements/'+song_id+'/midi/'+str(num)+'.midi') \ No newline at end of file diff --git a/pitch_predictor/dataset/__pycache__/diffpitch.cpython-310.pyc b/pitch_predictor/dataset/__pycache__/diffpitch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ceb489e294fe2ed53c34162a7ea58a158e97bb54 Binary files /dev/null and b/pitch_predictor/dataset/__pycache__/diffpitch.cpython-310.pyc differ diff --git a/pitch_predictor/dataset/diffpitch.py b/pitch_predictor/dataset/diffpitch.py new file mode 100644 index 0000000000000000000000000000000000000000..dc624bcea9332b1c5b059fbb0f9a045cac10fa35 --- /dev/null +++ b/pitch_predictor/dataset/diffpitch.py @@ -0,0 +1,97 @@ +import os +import random +import numpy as np +import pandas as pd +import librosa + +import torch +import torchaudio +from 
torch.utils.data import Dataset + + +def algin_mapping(content, target_len): + # align content with mel + src_len = content.shape[-1] + target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device) + temp = torch.arange(src_len+1) * target_len / src_len + + for i in range(target_len): + cur_idx = torch.argmin(torch.abs(temp-i)) + target[:, i] = content[:, cur_idx] + return target + + +def midi_to_hz(midi): + idx = torch.zeros(midi.shape[-1]) + for frame in range(midi.shape[-1]): + midi_frame = midi[:, frame] + non_zero = midi_frame.nonzero() + if len(non_zero) != 0: + hz = librosa.midi_to_hz(non_zero[0]) + idx[frame] = torch.tensor(hz) + return idx + + +# training "average voice" encoder +class DiffPitch(Dataset): + def __init__(self, data_dir, subset, frames, content='world', shift=True, log_scale=False): + meta = pd.read_csv(data_dir + 'meta.csv') + self.data_dir = data_dir + self.meta = meta[meta['subset'] == subset] + self.frames = frames + self.content = content + self.shift = shift + self.log_scale = log_scale + + def __getitem__(self, index): + row = self.meta.iloc[index] + folder = row['folder'] + subfolder = row['subfolder'] + file_id = row['file_name'] + folder = os.path.join(self.data_dir, folder) + folder = os.path.join(folder, str(subfolder)) + folder = os.path.join(folder, 'vocal') + folder = os.path.join(folder, file_id) + + content_folder = folder.replace('vocal', self.content).replace('.wav', '.npy') + content = torch.tensor(np.load(content_folder), dtype=torch.float32) + # print(content.shape) + + midi_folder = folder.replace('vocal', 'roll_align').replace('.wav', '.npy') + midi = torch.tensor(np.load(midi_folder), dtype=torch.float32) + # print(midi.shape) + # midi = algin_mapping(midi, content.shape[-1]) + + f0_folder = folder.replace('vocal', 'f0').replace('.wav', '.npy') + f0 = torch.tensor(np.load(f0_folder), dtype=torch.float32) + + max_start = max(content.shape[-1] - self.frames, 0) + start = random.choice(range(max_start)) if max_start > 0 else 0 + end = min(int(start + self.frames), content.shape[-1]) + + out_content = torch.ones((content.shape[0], self.frames)) * np.log(1e-5) + out_midi = torch.zeros(self.frames) + out_f0 = torch.zeros(self.frames) + + out_content[:, :end-start] = content[:, start:end] + out_midi[:end-start] = midi[start:end] + out_f0[:end-start] = f0[start:end] + + # out_midi = midi_to_hz(out_midi) + + if self.shift is True: + shift = np.random.choice(25, 1)[0] + shift = shift - 12 + + # midi[midi != 0] += shift + out_midi = out_midi*(2**(shift/12)) + out_f0 = out_f0*(2**(shift/12)) + + if self.log_scale: + out_midi = 1127 * np.log(1 + out_midi / 700) + out_f0 = 1127 * np.log(1 + out_f0 / 700) + + return out_content, out_midi, out_f0 + + def __len__(self): + return len(self.meta) \ No newline at end of file diff --git a/pitch_predictor/models/__pycache__/transformer.cpython-310.pyc b/pitch_predictor/models/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a83cd2f98f308d4fc3be4e8915f7ee4f617ecf Binary files /dev/null and b/pitch_predictor/models/__pycache__/transformer.cpython-310.pyc differ diff --git a/pitch_predictor/models/rnn.py b/pitch_predictor/models/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..3e91cb581b4a7c4b13b78517929c5459726fff6a --- /dev/null +++ b/pitch_predictor/models/rnn.py @@ -0,0 +1,56 @@ +import torch +import torch.nn as nn + + +class PitchRNN(nn.Module): + def __init__(self, n_mels, hidden_size): + 
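+        # PitchRNN: regresses a frame-level pitch (F0) curve from a mel spectrogram and a MIDI pitch curve.
+        # Inputs `sp` (B, n_mels, T) and `midi` (B, T) are each projected to `hidden_size` channels,
+        # concatenated, passed through a 2-layer bidirectional GRU, and reduced to one value per frame,
+        # giving an output of shape (B, T).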
super(PitchRNN, self).__init__() + + self.sp_linear = nn.Sequential(nn.Conv1d(n_mels, hidden_size*2, kernel_size=1), + nn.SiLU(), + nn.Conv1d(hidden_size*2, hidden_size, kernel_size=1), + nn.SiLU(),) + + self.midi_linear = nn.Sequential(nn.Conv1d(1, hidden_size*2, kernel_size=1), + nn.SiLU(), + nn.Conv1d(hidden_size*2, hidden_size, kernel_size=1), + nn.SiLU(),) + + self.hidden_size = hidden_size + + self.rnn = nn.GRU(input_size=hidden_size*2, + hidden_size=hidden_size, + num_layers=2, + batch_first=True, + bidirectional=True) + # self.silu = nn.SiLU() + + self.linear = nn.Sequential(nn.Linear(2*hidden_size, hidden_size), + nn.SiLU(), + nn.Linear(hidden_size, 1)) + + def forward(self, midi, sp): + midi = midi.unsqueeze(1) + midi = self.midi_linear(midi) + sp = self.sp_linear(sp) + + x = torch.cat([midi, sp], dim=1) + x = torch.transpose(x, 1, 2) + x, _ = self.rnn(x) + # x = self.silu(x) + + x = self.linear(x) + + return x.squeeze(-1) + + +if __name__ == '__main__': + + model = PitchRNN(100, 256) + + x = torch.rand((4, 128)) + t = torch.randint(0, 1000, (1, )).long() + sp = torch.rand((4, 100, 128)) + midi = torch.rand((4, 128)) + + y = model(midi, sp) \ No newline at end of file diff --git a/pitch_predictor/models/transformer.py b/pitch_predictor/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffedb36f077ddb4f81ad0ca3cf7e962b737cd11 --- /dev/null +++ b/pitch_predictor/models/transformer.py @@ -0,0 +1,134 @@ +import torch +import torch.nn as nn + +import math +from einops import rearrange + + +class LinearAttention(nn.Module): + def __init__(self, dim, heads=8, dim_head=32, q_norm=True): + super(LinearAttention, self).__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = torch.nn.Conv1d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = torch.nn.Conv1d(hidden_dim, dim, 1) + self.q_norm = q_norm + + def forward(self, x): + # b, l, c = x.shape + x = x.permute(0, 2, 1) + # b, c, l = x.shape + + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) l -> qkv b heads c l', + heads=self.heads, qkv=3) + k = k.softmax(dim=-1) + if self.q_norm: + q = q.softmax(dim=-2) + + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c l -> b (heads c) l', + heads=self.heads) + return self.to_out(out).permute(0, 2, 1) + + +class TransformerBlock(nn.Module): + def __init__(self, dim, n_heads=4, layer_norm_first=True): + super(TransformerBlock, self).__init__() + dim_head = dim//n_heads + self.attention = LinearAttention(dim, heads=n_heads, dim_head=dim_head) + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.feed_forward = nn.Sequential(nn.Linear(dim, dim*2), + nn.SiLU(), + nn.Linear(dim*2, dim)) + + self.dropout1 = nn.Dropout(0.2) + self.dropout2 = nn.Dropout(0.2) + + self.layer_norm_first = layer_norm_first + + def forward(self, x): + nx = self.norm1(x) + x = x + self.dropout1(self.attention(nx)) + nx = self.norm2(x) + nx = x + self.dropout2(self.feed_forward(nx)) + # attention_out = self.attention(x) + # attention_residual_out = attention_out + x + # # print(attention_residual_out.shape) + # norm1_out = self.dropout1(self.norm1(attention_residual_out)) + # + # feed_fwd_out = self.feed_forward(norm1_out) + # feed_fwd_residual_out = feed_fwd_out + norm1_out + # norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) + return nx + + +class PitchFormer(nn.Module): + def __init__(self, n_mels, hidden_size, attn_layers=4): + 
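+        # PitchFormer: same task as PitchRNN, but with linear-attention Transformer blocks.
+        # `sp` (B, n_mels, T) and `midi` (B, T) are each projected to hidden_size//2 channels and concatenated,
+        # a weight-normalized 1-D conv (kernel 63) adds positional information as a residual,
+        # `attn_layers` TransformerBlocks mix the sequence, and a final MLP emits one F0 value per frame (B, T).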
super(PitchFormer, self).__init__() + + self.sp_linear = nn.Sequential(nn.Conv1d(n_mels, hidden_size, kernel_size=1), + nn.SiLU(), + nn.Conv1d(hidden_size, hidden_size//2, kernel_size=1) + ) + + self.midi_linear = nn.Sequential(nn.Conv1d(1, hidden_size, kernel_size=1), + nn.SiLU(), + nn.Conv1d(hidden_size, hidden_size//2, kernel_size=1), + ) + + self.hidden_size = hidden_size + + self.pos_conv = nn.Conv1d(hidden_size, hidden_size, + kernel_size=63, + padding=31, + ) + dropout = 0 + std = math.sqrt((4 * (1.0 - dropout)) / (63 * hidden_size)) + nn.init.normal_(self.pos_conv.weight, mean=0, std=std) + nn.init.constant_(self.pos_conv.bias, 0) + self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) + self.pos_conv = nn.Sequential(self.pos_conv, nn.SiLU()) + + self.attn_block = nn.ModuleList([TransformerBlock(hidden_size, 4) for i in range(attn_layers)]) + + # self.silu = nn.SiLU() + + self.linear = nn.Sequential(nn.Linear(hidden_size, hidden_size), + nn.SiLU(), + nn.Linear(hidden_size, 1)) + + def forward(self, midi, sp): + midi = midi.unsqueeze(1) + midi = self.midi_linear(midi) + sp = self.sp_linear(sp) + + x = torch.cat([midi, sp], dim=1) + + # position encoding + x_conv = self.pos_conv(x) + x = x + x_conv + + # x = self.silu(x) + x = x.permute(0, 2, 1) + for layer in self.attn_block: + x = layer(x) + + x = self.linear(x) + + return x.squeeze(-1) + + +if __name__ == '__main__': + + model = PitchFormer(100, 256) + + x = torch.rand((4, 64)) + sp = torch.rand((4, 100, 64)) + midi = torch.rand((4, 64)) + + y = model(midi, sp) \ No newline at end of file diff --git a/pitch_predictor/train_transformer.py b/pitch_predictor/train_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd484402dc1562e68e9189f0c1c06aa6a141eda --- /dev/null +++ b/pitch_predictor/train_transformer.py @@ -0,0 +1,246 @@ +import os, json, argparse, yaml +import numpy as np +from tqdm import tqdm +import librosa + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from torch.cuda.amp import autocast, GradScaler + +from dataset.diffpitch import DiffPitch +from models.transformer import PitchFormer +from utils import minmax_norm_diff, reverse_minmax_norm_diff, save_curve_plot + + +parser = argparse.ArgumentParser() +parser.add_argument('-config', type=str, default='config/DiffPitch.yaml') + +parser.add_argument('-seed', type=int, default=9811) +parser.add_argument('-amp', type=bool, default=False) +parser.add_argument('-compile', type=bool, default=False) + +parser.add_argument('-data_dir', type=str, default='data/') +parser.add_argument('-content_dir', type=str, default='world') + +parser.add_argument('-train_frames', type=int, default=256) +parser.add_argument('-test_frames', type=int, default=256) +parser.add_argument('-batch_size', type=int, default=32) +parser.add_argument('-test_size', type=int, default=1) +parser.add_argument('-num_workers', type=int, default=4) +parser.add_argument('-lr', type=float, default=5e-5) +parser.add_argument('-weight_decay', type=int, default=1e-6) + +parser.add_argument('-epochs', type=int, default=1) +parser.add_argument('-save_every', type=int, default=20) +parser.add_argument('-log_step', type=int, default=100) +parser.add_argument('-log_dir', type=str, default='logs_transformer_pitch') +parser.add_argument('-ckpt_dir', type=str, default='ckpt_transformer_pitch') + +args = parser.parse_args() +args.save_ori = True +config = yaml.load(open(args.config), Loader=yaml.FullLoader) 
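+# `logmel` holds the mel-feature settings (n_mels, sampling_rate, ...) consumed when building the model below;
+# pitch_predictor/config/Pitchformer.yaml shows the expected layout of these config sections.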
+mel_cfg = config['logmel'] +ddpm_cfg = config['ddpm'] +# unet_cfg = config['unet'] + + +def RMSE(gen_f0, gt_f0): + # Get voiced part + gt_f0 = gt_f0[0] + gen_f0 = gen_f0[0] + + nonzero_idxs = np.where((gen_f0 != 0) & (gt_f0 != 0))[0] + gen_f0_voiced = np.log2(gen_f0[nonzero_idxs]) + gt_f0_voiced = np.log2(gt_f0[nonzero_idxs]) + # log F0 RMSE + if len(gen_f0_voiced) != 0: + f0_rmse = np.sqrt(np.mean((gen_f0_voiced - gt_f0_voiced) ** 2)) + else: + f0_rmse = 0 + return f0_rmse + + +if __name__ == "__main__": + torch.manual_seed(args.seed) + np.random.seed(args.seed) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = True + else: + args.device = 'cpu' + + if os.path.exists(args.log_dir) is False: + os.makedirs(args.log_dir) + + if os.path.exists(args.ckpt_dir) is False: + os.makedirs(args.ckpt_dir) + + print('Initializing data loaders...') + trainset = DiffPitch('data/', 'train', args.train_frames, shift=True) + train_loader = DataLoader(trainset, batch_size=args.batch_size, num_workers=args.num_workers, + drop_last=True, shuffle=True) + + val_set = DiffPitch('data/', 'val', args.test_frames, shift=True) + val_loader = DataLoader(val_set, batch_size=1, shuffle=False) + + test_set = DiffPitch('data/', 'test', args.test_frames, shift=True) + test_loader = DataLoader(test_set, batch_size=1, shuffle=False) + + real_set = DiffPitch('data/', 'real', args.test_frames, shift=False) + read_loader = DataLoader(real_set, batch_size=1, shuffle=False) + + print('Initializing and loading models...') + model = PitchFormer(mel_cfg['n_mels'], 512).to(args.device) + ckpt = torch.load('ckpt_transformer_pitch/transformer_pitch_460.pt') + model.load_state_dict(ckpt) + + print('Initializing optimizers...') + optimizer = torch.optim.AdamW(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scaler = GradScaler() + + if args.compile: + model = torch.compile(model) + + print('Start training.') + global_step = 0 + for epoch in range(1, args.epochs + 1): + print(f'Epoch: {epoch} [iteration: {global_step}]') + model.train() + losses = [] + + for step, batch in enumerate(tqdm(train_loader)): + optimizer.zero_grad() + mel, midi, f0 = batch + mel = mel.to(args.device) + midi = midi.to(args.device) + f0 = f0.to(args.device) + + if args.amp: + with autocast(): + f0_pred = model(sp=mel, midi=midi) + loss = F.mse_loss(f0_pred, f0) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + f0_pred = model(sp=mel, midi=midi) + loss = F.l1_loss(f0_pred, f0) + # Backward propagation + loss.backward() + optimizer.step() + + losses.append(loss.item()) + global_step += 1 + + if global_step % args.log_step == 0: + losses = np.asarray(losses) + # msg = 'Epoch %d: loss = %.4f\n' % (epoch, np.mean(losses)) + msg = '\nEpoch: [{}][{}]\t' \ + 'Batch: [{}][{}]\tLoss: {:.6f}\n'.format(epoch, + args.epochs, + step+1, + len(train_loader), + np.mean(losses)) + with open(f'{args.log_dir}/train_dec.log', 'a') as f: + f.write(msg) + losses = [] + + if epoch % args.save_every > 0: + continue + + print('Saving model...\n') + ckpt = model.state_dict() + torch.save(ckpt, f=f"{args.ckpt_dir}/transformer_pitch_{epoch}.pt") + + print('Inference...\n') + model.eval() + with torch.no_grad(): + val_loss = [] + val_rmse = [] + for i, batch in enumerate(val_loader): + # 
optimizer.zero_grad() + mel, midi, f0 = batch + mel = mel.to(args.device) + midi = midi.to(args.device) + f0 = f0.to(args.device) + + f0_pred = model(sp=mel, midi=midi) + + # save pred + f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0 + f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6') + + val_loss.append(F.l1_loss(f0_pred, f0).item()) + val_rmse.append(RMSE(f0_pred.cpu().numpy(), f0.cpu().numpy())) + + if i <= 4: + save_path = f'{args.log_dir}/pic/{i}/{epoch}_val.png' + if os.path.exists(os.path.dirname(save_path)) is False: + os.makedirs(os.path.dirname(save_path)) + save_curve_plot(f0_pred.cpu().squeeze(), midi.cpu().squeeze(), f0.cpu().squeeze(), save_path) + # else: + # break + + msg = '\nEpoch: [{}][{}]\tLoss: {:.6f}\tRMSE:{:.6f}\n'.\ + format(epoch, args.epochs, np.mean(val_loss), np.mean(val_rmse)) + with open(f'{args.log_dir}/eval_dec.log', 'a') as f: + f.write(msg) + + test_loss = [] + test_rmse = [] + for i, batch in enumerate(test_loader): + # optimizer.zero_grad() + mel, midi, f0 = batch + mel = mel.to(args.device) + midi = midi.to(args.device) + f0 = f0.to(args.device) + + f0_pred = model(sp=mel, midi=midi) + + # save pred + f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0 + f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6') + + test_loss.append(F.l1_loss(f0_pred, f0).item()) + test_rmse.append(RMSE(f0_pred.cpu().numpy(), f0.cpu().numpy())) + + if i <= 4: + save_path = f'{args.log_dir}/pic/{i}/{epoch}_test.png' + if os.path.exists(os.path.dirname(save_path)) is False: + os.makedirs(os.path.dirname(save_path)) + save_curve_plot(f0_pred.cpu().squeeze(), midi.cpu().squeeze(), f0.cpu().squeeze(), save_path) + + msg = '\nEpoch: [{}][{}]\tLoss: {:.6f}\tRMSE:{:.6f}\n'. \ + format(epoch, args.epochs, np.mean(test_loss), np.mean(test_rmse)) + with open(f'{args.log_dir}/test_dec.log', 'a') as f: + f.write(msg) + + for i, batch in enumerate(read_loader): + # optimizer.zero_grad() + mel, midi, f0 = batch + mel = mel.to(args.device) + midi = midi.to(args.device) + f0 = f0.to(args.device) + + f0_pred = model(sp=mel, midi=midi) + f0_pred[f0 == 0] = 0 + + # save pred + f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0 + f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6') + + save_path = f'{args.log_dir}/pic/{i}/{epoch}_real.png' + if os.path.exists(os.path.dirname(save_path)) is False: + os.makedirs(os.path.dirname(save_path)) + save_curve_plot(f0_pred.cpu().squeeze(), midi.cpu().squeeze(), f0.cpu().squeeze(), save_path) + + + + diff --git a/pitch_predictor/utils.py b/pitch_predictor/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f39c5304f5463853870f19e5925c94655af4fc61 --- /dev/null +++ b/pitch_predictor/utils.py @@ -0,0 +1,46 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import wavfile +import torch +import librosa +from torch.nn import functional as F + + +def save_curve_plot(pred, midi, gt, savepath): + plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + + pred[pred == 0] = np.nan + midi[midi == 0] = np.nan + gt[gt == 0] = np.nan + + # im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + ax.plot(range(len(pred)), pred, color='tab:green', label='pred') + ax.plot(range(len(midi)), midi, color='tab:blue', label='midi') + ax.plot(range(len(gt)), gt, color='grey', label='gt') + # plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.legend() + plt.savefig(savepath) + plt.close() +# +# +# def save_audio(file_path, 
sampling_rate, audio): +# audio = np.clip(audio.detach().cpu().squeeze().numpy(), -0.999, 0.999) +# wavfile.write(file_path, sampling_rate, (audio * 32767).astype("int16")) + + +def minmax_norm_diff(tensor: torch.Tensor, vmax: float = librosa.note_to_hz('C6'), + vmin: float = 0) -> torch.Tensor: + tensor = torch.clip(tensor, vmin, vmax) + tensor = 2 * (tensor - vmin) / (vmax - vmin) - 1 + return tensor + + +def reverse_minmax_norm_diff(tensor: torch.Tensor, vmax: float = librosa.note_to_hz('C6'), + vmin: float = 0) -> torch.Tensor: + tensor = torch.clip(tensor, -1.0, 1.0) + tensor = (tensor + 1) / 2 + tensor = tensor * (vmax - vmin) + vmin + return tensor \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..976247a9d4f771489046246c2f07767b41a84721 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +diffusers +einops +fastdtw +librosa +matplotlib +music21 +numpy +pandas +pretty_midi +pysptk +pyworld +scipy +soundfile +tgt +torch +torchaudio +tqdm diff --git a/score_based_apc.py b/score_based_apc.py new file mode 100644 index 0000000000000000000000000000000000000000..b9259d641045d7bfdb6d5aede00f6cebcc28f477 --- /dev/null +++ b/score_based_apc.py @@ -0,0 +1,159 @@ +import os.path + +import numpy as np +import pandas as pd +import torch +import yaml +import librosa +import soundfile as sf +from tqdm import tqdm + +from diffusers import DDIMScheduler +from pitch_controller.models.unet import UNetPitcher +from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff +from pitch_controller.modules.BigVGAN.inference import load_model +from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0 +from pitch_predictor.models.transformer import PitchFormer +import pretty_midi + + +def prepare_midi_wav(wav_id, midi_id, sr=24000): + midi = pretty_midi.PrettyMIDI(midi_id) + roll = midi.get_piano_roll() + roll = np.pad(roll, ((0, 0), (0, 1000)), constant_values=0) + roll[roll > 0] = 100 + + onset = midi.get_onsets() + before_onset = list(np.round(onset * 100 - 1).astype(int)) + roll[:, before_onset] = 0 + + wav, sr = librosa.load(wav_id, sr=sr) + + start = 0 + end = round(100 * len(wav) / sr) / 100 + # save audio + wav_seg = wav[round(start * sr):round(end * sr)] + cur_roll = roll[:, round(100 * start):round(100 * end)] + return wav_seg, cur_roll + + +def algin_mapping(content, target_len): + # align content with mel + src_len = content.shape[-1] + target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device) + temp = torch.arange(src_len+1) * target_len / src_len + + for i in range(target_len): + cur_idx = torch.argmin(torch.abs(temp-i)) + target[:, i] = content[:, cur_idx] + return target + + +def midi_to_hz(midi): + idx = torch.zeros(midi.shape[-1]) + for frame in range(midi.shape[-1]): + midi_frame = midi[:, frame] + non_zero = midi_frame.nonzero() + if len(non_zero) != 0: + hz = librosa.midi_to_hz(non_zero[0]) + idx[frame] = torch.tensor(hz) + return idx + + +@torch.no_grad() +def score_pitcher(source, pitch_ref, model, hifigan, pitcher, steps=50, shift_semi=0, mask_with_source=False): + wav, midi = prepare_midi_wav(source, pitch_ref, sr=sr) + + source_mel = get_world_mel(None, sr=sr, wav=wav) + + midi = torch.tensor(midi, dtype=torch.float32) + midi = algin_mapping(midi, source_mel.shape[-1]) + midi = midi_to_hz(midi) + + f0_ori = np.nan_to_num(get_f0(source)) + + source_mel = 
torch.from_numpy(source_mel).float().unsqueeze(0).to(device) + f0_ori = torch.from_numpy(f0_ori).float().unsqueeze(0).to(device) + midi = midi.unsqueeze(0).to(device) + + f0_pred = pitcher(sp=source_mel, midi=midi) + if mask_with_source: + # mask unvoiced frames based on original pitch estimation + f0_pred[f0_ori == 0] = 0 + f0_pred = f0_pred.cpu().numpy()[0] + # limit range + f0_pred[f0_pred < librosa.note_to_hz('C2')] = 0 + f0_pred[f0_pred > librosa.note_to_hz('C6')] = librosa.note_to_hz('C6') + + f0_pred = f0_pred * (2 ** (shift_semi / 12)) + + f0_pred = log_f0(f0_pred, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + f0_pred = torch.from_numpy(f0_pred).float().unsqueeze(0).to(device) + + noise_scheduler = DDIMScheduler(num_train_timesteps=1000) + generator = torch.Generator(device=device).manual_seed(2024) + + noise_scheduler.set_timesteps(steps) + noise = torch.randn(source_mel.shape, generator=generator, device=device) + pred = noise + source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel) + + for t in tqdm(noise_scheduler.timesteps): + pred = noise_scheduler.scale_model_input(pred, t) + model_output = model(x=pred, mean=source_x, f0=f0_pred, t=t, ref=None, embed=None) + pred = noise_scheduler.step(model_output=model_output, + timestep=t, + sample=pred, + eta=1, generator=generator).prev_sample + + pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel) + + pred_audio = hifigan(pred) + pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1) + + return pred_audio + + +if __name__ == '__main__': + min_mel = np.log(1e-5) + max_mel = 2.5 + sr = 24000 + + use_gpu = torch.cuda.is_available() + device = 'cuda' if use_gpu else 'cpu' + + # load diffusion model + config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader) + mel_cfg = config['logmel'] + ddpm_cfg = config['ddpm'] + unet_cfg = config['unet'] + model = UNetPitcher(**unet_cfg) + unet_path = 'ckpts/world_fixed_40.pt' + + state_dict = torch.load(unet_path) + for key in list(state_dict.keys()): + state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key) + model.load_state_dict(state_dict) + if use_gpu: + model.cuda() + model.eval() + + # load vocoder + hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt' + hifigan, cfg = load_model(hifi_path, device=device) + hifigan.eval() + + # load pitch predictor + pitcher = PitchFormer(100, 512).to(device) + ckpt = torch.load('ckpts/ckpt_transformer_pitch/transformer_pitch_360.pt') + pitcher.load_state_dict(ckpt) + pitcher.eval() + + pred_audio = score_pitcher('examples/score_vocal.wav', 'examples/score_midi.midi', model, hifigan, pitcher, steps=50) + sf.write('output_score.wav', pred_audio, samplerate=sr) + + + + diff --git a/template_based_apc.py b/template_based_apc.py new file mode 100644 index 0000000000000000000000000000000000000000..7ecdf7a6d92d1e763656e003e4718ea2e5853d3d --- /dev/null +++ b/template_based_apc.py @@ -0,0 +1,89 @@ +import os.path + +import numpy as np +import pandas as pd +import torch +import yaml +import librosa +import soundfile as sf +from tqdm import tqdm + +from diffusers import DDIMScheduler +from pitch_controller.models.unet import UNetPitcher +from pitch_controller.utils import minmax_norm_diff, reverse_minmax_norm_diff +from pitch_controller.modules.BigVGAN.inference import load_model +from utils import get_mel, get_world_mel, get_f0, f0_to_coarse, show_plot, get_matched_f0, log_f0 + + +@torch.no_grad() +def template_pitcher(source, pitch_ref, model, 
hifigan, steps=50, shift_semi=0): + + source_mel = get_world_mel(source, sr=sr) + + f0_ref = get_matched_f0(source, pitch_ref, 'world') + f0_ref = f0_ref * 2 ** (shift_semi / 12) + + f0_ref = log_f0(f0_ref, {'f0_bin': 345, + 'f0_min': librosa.note_to_hz('C2'), + 'f0_max': librosa.note_to_hz('C#6')}) + + source_mel = torch.from_numpy(source_mel).float().unsqueeze(0).to(device) + f0_ref = torch.from_numpy(f0_ref).float().unsqueeze(0).to(device) + + noise_scheduler = DDIMScheduler(num_train_timesteps=1000) + generator = torch.Generator(device=device).manual_seed(2024) + + noise_scheduler.set_timesteps(steps) + noise = torch.randn(source_mel.shape, generator=generator, device=device) + pred = noise + source_x = minmax_norm_diff(source_mel, vmax=max_mel, vmin=min_mel) + + for t in tqdm(noise_scheduler.timesteps): + pred = noise_scheduler.scale_model_input(pred, t) + model_output = model(x=pred, mean=source_x, f0=f0_ref, t=t, ref=None, embed=None) + pred = noise_scheduler.step(model_output=model_output, + timestep=t, + sample=pred, + eta=1, generator=generator).prev_sample + + pred = reverse_minmax_norm_diff(pred, vmax=max_mel, vmin=min_mel) + + pred_audio = hifigan(pred) + pred_audio = pred_audio.cpu().squeeze().clamp(-1, 1) + + return pred_audio + + +if __name__ == '__main__': + min_mel = np.log(1e-5) + max_mel = 2.5 + sr = 24000 + + use_gpu = torch.cuda.is_available() + device = 'cuda' if use_gpu else 'cpu' + + # load diffusion model + config = yaml.load(open('pitch_controller/config/DiffWorld_24k.yaml'), Loader=yaml.FullLoader) + mel_cfg = config['logmel'] + ddpm_cfg = config['ddpm'] + unet_cfg = config['unet'] + model = UNetPitcher(**unet_cfg) + unet_path = 'ckpts/world_fixed_40.pt' + + state_dict = torch.load(unet_path) + for key in list(state_dict.keys()): + state_dict[key.replace('_orig_mod.', '')] = state_dict.pop(key) + model.load_state_dict(state_dict) + if use_gpu: + model.cuda() + model.eval() + + # load vocoder + hifi_path = 'ckpts/bigvgan_24khz_100band/g_05000000.pt' + hifigan, cfg = load_model(hifi_path, device=device) + hifigan.eval() + + pred_audio = template_pitcher('examples/off-key.wav', 'examples/reference.wav', model, hifigan, steps=50, shift_semi=0) + sf.write('output_template.wav', pred_audio, samplerate=sr) + + diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1a669482a9314876a5c80a78127bc786b992da --- /dev/null +++ b/utils.py @@ -0,0 +1,206 @@ +import numpy as np +import torch +import librosa +from librosa.core import load +import matplotlib.pyplot as plt +import pysptk +import pyworld as pw +from fastdtw import fastdtw +from scipy import spatial + +from librosa.filters import mel as librosa_mel_fn +mel_basis = librosa_mel_fn(sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000) + + +def _get_best_mcep_params(fs): + if fs == 16000: + return 23, 0.42 + elif fs == 22050: + return 34, 0.45 + elif fs == 24000: + return 34, 0.46 + elif fs == 44100: + return 39, 0.53 + elif fs == 48000: + return 39, 0.55 + else: + raise ValueError(f"Not found the setting for {fs}.") + + +def get_mel(wav_path): + wav, _ = load(wav_path, sr=24000) + wav = wav[:(wav.shape[0] // 256)*256] + wav = np.pad(wav, 384, mode='reflect') + stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) + stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) + mel_spectrogram = np.matmul(mel_basis, stftm) + if mel_spectrogram.shape[-1] % 8 != 0: + mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, 
8 - mel_spectrogram.shape[-1] % 8)), 'minimum') + + log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) + return log_mel_spectrogram + + +def get_world_mel(wav_path=None, sr=24000, wav=None): + if wav_path is not None: + wav, _ = librosa.load(wav_path, sr=24000) + wav = (wav * 32767).astype(np.int16) + wav = (wav / 32767).astype(np.float64) + # wav = wav.astype(np.float64) + wav = wav[:(wav.shape[0] // 256) * 256] + + # _f0, t = pw.dio(wav, sr, frame_period=256/sr*1000) + _f0, t = pw.dio(wav, sr) + f0 = pw.stonemask(wav, _f0, t, sr) + sp = pw.cheaptrick(wav, f0, t, sr) + ap = pw.d4c(wav, f0, t, sr) + wav_hat = pw.synthesize(f0 * 0, sp, ap, sr) + # wav_hat = pw.synthesize(f0 * 0, sp, ap, sr, frame_period=256/sr*1000) + + # pyworld output does not pad left + wav_hat = wav_hat[:len(wav)] + # wav_hat = wav_hat[256//2: len(wav)+256//2] + assert len(wav_hat) == len(wav) + wav = wav_hat.astype(np.float32) + wav = np.pad(wav, 384, mode='reflect') + stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024, window='hann', center=False) + stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9)) + mel_spectrogram = np.matmul(mel_basis, stftm) + if mel_spectrogram.shape[-1] % 8 != 0: + mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, 8 - mel_spectrogram.shape[-1] % 8)), 'minimum') + + log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None)) + return log_mel_spectrogram + + +def get_f0(wav_path, method='pyin', padding=True): + if method == 'pyin': + wav, sr = load(wav_path, sr=24000) + wav = wav[:(wav.shape[0] // 256) * 256] + wav = np.pad(wav, 384, mode='reflect') + f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False, sr=24000, + fmin=librosa.note_to_hz('C2'), + fmax=librosa.note_to_hz('C6'), fill_na=0) + elif method == 'world': + wav, sr = librosa.load(wav_path, sr=24000) + wav = (wav * 32767).astype(np.int16) + wav = (wav / 32767).astype(np.float64) + _f0, t = pw.dio(wav, fs=24000, frame_period=256/sr*1000, + f0_floor=librosa.note_to_hz('C2'), + f0_ceil=librosa.note_to_hz('C6')) + f0 = pw.stonemask(wav, _f0, t, sr) + f0 = f0[:-1] + + if padding is True: + if f0.shape[-1] % 8 !=0: + f0 = np.pad(f0, ((0, 8-f0.shape[-1] % 8)), 'constant', constant_values=0) + + return f0 + + +def get_mcep(x, n_fft=1024, n_shift=256, sr=24000): + x, sr = load(x, sr=24000) + n_frame = (x.shape[0] // 256) + x = np.pad(x, 384, mode='reflect') + # n_frame = (len(x) - n_fft) // n_shift + 1 + win = pysptk.sptk.hamming(n_fft) + mcep_dim, mcep_alpha = _get_best_mcep_params(sr) + mcep = [pysptk.mcep(x[n_shift * i: n_shift * i + n_fft] * win, + mcep_dim, mcep_alpha, + eps=1e-6, etype=1,) + for i in range(n_frame) + ] + mcep = np.stack(mcep) + return mcep + + +def get_matched_f0(x, y, method='world', n_fft=1024, n_shift=256): + # f0_x = get_f0(x, method='pyin', padding=False) + f0_y = get_f0(y, method=method, padding=False) + # print(f0_y.max()) + # print(f0_y.min()) + + mcep_x = get_mcep(x, n_fft=n_fft, n_shift=n_shift) + mcep_y = get_mcep(y, n_fft=n_fft, n_shift=n_shift) + + _, path = fastdtw(mcep_x, mcep_y, dist=spatial.distance.euclidean) + twf = np.array(path).T + # f0_x = gen_mcep[twf[0]] + nearest = [] + for i in range(len(f0_y)): + idx = np.argmax(1 * twf[0] == i) + nearest.append(twf[1][idx]) + + f0_y = f0_y[nearest] + + # f0_y = f0_y.astype(np.float32) + + if f0_y.shape[-1] % 8 != 0: + f0_y = np.pad(f0_y, ((0, 8 - f0_y.shape[-1] % 8)), 'constant', constant_values=0) + + return f0_y + + +def f0_to_coarse(f0, hparams): 
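+    # Quantize F0 (Hz) to coarse integer bins: voiced frames are mapped to the mel scale
+    # (1127 * ln(1 + f0 / 700)), rescaled to the range 1..f0_bin-1 and rounded; unvoiced
+    # frames (f0 == 0) stay at bin 0. Accepts either a numpy array or a torch tensor.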
+ + f0_bin = hparams['f0_bin'] + f0_max = hparams['f0_max'] + f0_min = hparams['f0_min'] + is_torch = isinstance(f0, torch.Tensor) + # to mel scale + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) + + unvoiced = (f0_mel == 0) + + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + + f0_mel[unvoiced] = 0 + + f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +def log_f0(f0, hparams): + f0_bin = hparams['f0_bin'] + f0_max = hparams['f0_max'] + f0_min = hparams['f0_min'] + + f0_mel = np.zeros_like(f0) + f0_mel[f0 != 0] = 12*np.log2(f0[f0 != 0]/f0_min) + 1 + f0_mel_min = 12*np.log2(f0_min/f0_min) + 1 + f0_mel_max = 12*np.log2(f0_max/f0_min) + 1 + + unvoiced = (f0_mel == 0) + + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 + + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + + f0_mel[unvoiced] = 0 + + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= (f0_bin-1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min()) + return f0_coarse + + +def show_plot(tensor): + tensor = tensor.squeeze().cpu() + # plt.style.use('default') + fig, ax = plt.subplots(figsize=(12, 3)) + im = ax.imshow(tensor, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.tight_layout() + fig.canvas.draw() + plt.show() + + +if __name__ == '__main__': + mel = get_mel('target.wav') + f0 = get_f0('target.wav') \ No newline at end of file
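Usage note (not part of the diff above): assuming the checkpoints referenced in the scripts are available under ckpts/ (world_fixed_40.pt, bigvgan_24khz_100band/g_05000000.pt and, for the score-based path, ckpt_transformer_pitch/transformer_pitch_360.pt), running `python template_based_apc.py` corrects examples/off-key.wav against examples/reference.wav and writes output_template.wav, while `python score_based_apc.py` corrects examples/score_vocal.wav against examples/score_midi.midi and writes output_score.wav.

The sketch below (my own illustration, not part of the repository) reproduces the semitone quantization that log_f0 applies to the target pitch before it conditions the diffusion model, using the hparams both scripts pass (f0_bin=345 over C2..C#6, i.e. 49 semitones, hence 7 quantization steps per semitone):

import numpy as np
import librosa

f0 = np.array([0.0, librosa.note_to_hz('A3'), librosa.note_to_hz('A4')])   # 0.0 marks an unvoiced frame
f0_min, f0_max, f0_bin = librosa.note_to_hz('C2'), librosa.note_to_hz('C#6'), 345

semitones = np.zeros_like(f0)
semitones[f0 != 0] = 12 * np.log2(f0[f0 != 0] / f0_min) + 1                 # 1 = C2, 50 = C#6
bins = np.zeros_like(f0)
bins[f0 != 0] = (semitones[f0 != 0] - 1) * (f0_bin - 2) / (12 * np.log2(f0_max / f0_min)) + 1
coarse = np.rint(np.clip(bins, 0, f0_bin - 1)).astype(int)                  # unvoiced frames stay at bin 0
print(coarse)   # -> [  0 148 232], i.e. 7 bins per semitone above C2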