|
import argparse |
|
import os |
|
|
|
import librosa |
|
import numpy as np |
|
import soundfile as sf |
|
import torch |
|
from tqdm import tqdm |
|
|
|
from lib import dataset |
|
from lib import nets |
|
from lib import spec_utils |
|
from lib import utils |
|
|
|
|
|
class Separator(object): |
|
|
|
def __init__(self, model, device=None, batchsize=1, cropsize=256, postprocess=False): |
|
self.model = model |
|
self.offset = model.offset |
|
self.device = device |
|
self.batchsize = batchsize |
|
self.cropsize = cropsize |
|
self.postprocess = postprocess |
|
|
|
def _postprocess(self, X_spec, mask): |
|
if self.postprocess: |
|
mask_mag = np.abs(mask) |
|
mask_mag = spec_utils.merge_artifacts(mask_mag) |
|
mask = mask_mag * np.exp(1.j * np.angle(mask)) |
|
|
|
y_spec = X_spec * mask |
|
v_spec = X_spec - y_spec |
|
|
|
return y_spec, v_spec |
|
|
|
def _separate(self, X_spec_pad, roi_size): |
|
X_dataset = [] |
|
patches = (X_spec_pad.shape[2] - 2 * self.offset) // roi_size |
|
for i in range(patches): |
|
start = i * roi_size |
|
X_spec_crop = X_spec_pad[:, :, start:start + self.cropsize] |
|
X_dataset.append(X_spec_crop) |
|
|
|
X_dataset = np.asarray(X_dataset) |
|
|
|
self.model.eval() |
|
with torch.no_grad(): |
|
mask_list = [] |
|
|
|
for i in tqdm(range(0, patches, self.batchsize)): |
|
X_batch = X_dataset[i: i + self.batchsize] |
|
X_batch = torch.from_numpy(X_batch).to(self.device) |
|
|
|
mask = self.model.predict_mask(X_batch) |
|
|
|
mask = mask.detach().cpu().numpy() |
|
mask = np.concatenate(mask, axis=2) |
|
mask_list.append(mask) |
|
|
|
mask = np.concatenate(mask_list, axis=2) |
|
|
|
return mask |
|
|
|
def separate(self, X_spec): |
|
n_frame = X_spec.shape[2] |
|
pad_l, pad_r, roi_size = dataset.make_padding(n_frame, self.cropsize, self.offset) |
|
X_spec_pad = np.pad(X_spec, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant') |
|
X_spec_pad /= np.abs(X_spec).max() |
|
|
|
mask = self._separate(X_spec_pad, roi_size) |
|
mask = mask[:, :, :n_frame] |
|
|
|
y_spec, v_spec = self._postprocess(X_spec, mask) |
|
|
|
return y_spec, v_spec |
|
|
|
def separate_tta(self, X_spec): |
|
n_frame = X_spec.shape[2] |
|
pad_l, pad_r, roi_size = dataset.make_padding(n_frame, self.cropsize, self.offset) |
|
X_spec_pad = np.pad(X_spec, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant') |
|
X_spec_pad /= X_spec_pad.max() |
|
|
|
mask = self._separate(X_spec_pad, roi_size) |
|
|
|
pad_l += roi_size // 2 |
|
pad_r += roi_size // 2 |
|
X_spec_pad = np.pad(X_spec, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant') |
|
X_spec_pad /= X_spec_pad.max() |
|
|
|
mask_tta = self._separate(X_spec_pad, roi_size) |
|
mask_tta = mask_tta[:, :, roi_size // 2:] |
|
mask = (mask[:, :, :n_frame] + mask_tta[:, :, :n_frame]) * 0.5 |
|
|
|
y_spec, v_spec = self._postprocess(X_spec, mask) |
|
|
|
return y_spec, v_spec |
|
|
|
|
|
|
|
|
|
def main(gpu=-1, pretrained_model='models/baseline.pth', input_file='', sr=44100, n_fft=2048, |
|
hop_length=1024, batchsize=4, cropsize=256, output_image=False, tta=False, output_dir=""): |
|
|
|
print('loading model...', end=' ') |
|
device = torch.device('cpu') |
|
if gpu >= 0: |
|
if torch.cuda.is_available(): |
|
device = torch.device('cuda:{}'.format(gpu)) |
|
elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): |
|
device = torch.device('mps') |
|
model = nets.CascadedNet(n_fft, hop_length, 32, 128, True) |
|
model.load_state_dict(torch.load(pretrained_model, map_location='cpu')) |
|
model.to(device) |
|
print('done') |
|
|
|
print('loading wave source...', end=' ') |
|
print('loading wave source...', end=' ') |
|
print("Chemin du fichier audio :", input_file) |
|
X, sr = librosa.load(input_file, sr=sr, mono=False, dtype=np.float32, res_type='kaiser_fast') |
|
basename = os.path.splitext(os.path.basename(input_file))[0] |
|
print('done') |
|
|
|
if X.ndim == 1: |
|
X = np.asarray([X, X]) |
|
|
|
print('stft of wave source...', end=' ') |
|
X_spec = spec_utils.wave_to_spectrogram(X, hop_length, n_fft) |
|
print('done') |
|
|
|
sp = Separator( |
|
model=model, |
|
device=device, |
|
batchsize=batchsize, |
|
cropsize=cropsize, |
|
) |
|
|
|
if tta: |
|
y_spec, v_spec = sp.separate_tta(X_spec) |
|
else: |
|
y_spec, v_spec = sp.separate(X_spec) |
|
|
|
print('validating output directory...', end=' ') |
|
if output_dir != "": |
|
output_dir = output_dir.rstrip('/') + '/' |
|
os.makedirs(output_dir, exist_ok=True) |
|
print('done') |
|
|
|
print('inverse stft of instruments...', end=' ') |
|
wave = spec_utils.spectrogram_to_wave(y_spec, hop_length=hop_length) |
|
print('done') |
|
sf.write('{}{}_Instruments.wav'.format(output_dir, basename), wave.T, sr) |
|
|
|
print('inverse stft of vocals...', end=' ') |
|
wave = spec_utils.spectrogram_to_wave(v_spec, hop_length=hop_length) |
|
print('done') |
|
sf.write('{}{}_Vocals_finale.wav'.format(output_dir, basename), wave.T, sr) |
|
|
|
if output_image: |
|
image = spec_utils.spectrogram_to_image(y_spec) |
|
utils.imwrite('{}{}_Instruments.jpg'.format(output_dir, basename), image) |
|
|
|
image = spec_utils.spectrogram_to_image(v_spec) |
|
utils.imwrite('{}{}_Vocals.jpg'.format(output_dir, basename), image) |
|
|
|
|
|
|
|
import os |
|
|
|
|
|
main(input_file=os.getcwd()+'/audio_gnu.wav') |
|
|
|
|