File size: 2,989 Bytes
7ef50cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import dataclasses
import pathlib
import libf0
import librosa
import numpy as np
import resampy
import torch
import torchcrepe
import torchfcpe
import os
# from tools.anyf0.rmvpe import RMVPE
from rvc.lib.predictors.RMVPE import RMVPE0Predictor
from rvc.configs.config import Config
config = Config()
@dataclasses.dataclass
class F0Extractor:
wav_path: pathlib.Path
sample_rate: int = 44100
hop_length: int = 512
f0_min: int = 50
f0_max: int = 1600
method: str = "rmvpe"
x: np.ndarray = dataclasses.field(init=False)
def __post_init__(self):
self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
@property
def hop_size(self) -> float:
return self.hop_length / self.sample_rate
@property
def wav16k(self) -> np.ndarray:
return resampy.resample(self.x, self.sample_rate, 16000)
def extract_f0(self) -> np.ndarray:
f0 = None
method = self.method
if method == "crepe":
wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
f0 = torchcrepe.predict(
wav16k_torch,
sample_rate=16000,
hop_length=160,
batch_size=512,
fmin=self.f0_min,
fmax=self.f0_max,
device=config.device,
)
f0 = f0[0].cpu().numpy()
elif method == "fcpe":
audio = librosa.to_mono(self.x)
audio_length = len(audio)
f0_target_length = (audio_length // self.hop_length) + 1
audio = (
torch.from_numpy(audio)
.float()
.unsqueeze(0)
.unsqueeze(-1)
.to(config.device)
)
model = torchfcpe.spawn_bundled_infer_model(device=config.device)
f0 = model.infer(
audio,
sr=self.sample_rate,
decoder_mode="local_argmax",
threshold=0.006,
f0_min=self.f0_min,
f0_max=self.f0_max,
interp_uv=False,
output_interp_target_length=f0_target_length,
)
f0 = f0.squeeze().cpu().numpy()
elif method == "rmvpe":
model_rmvpe = RMVPE0Predictor(
os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
is_half=config.is_half,
device=config.device,
# hop_length=80
)
f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
else:
raise ValueError(f"Unknown method: {self.method}")
return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
def plot_f0(self, f0):
from matplotlib import pyplot as plt
plt.figure(figsize=(10, 4))
plt.plot(f0)
plt.title(self.method)
plt.xlabel("Time (frames)")
plt.ylabel("F0 (cents)")
plt.show()
|