feat!: add inference and utils files for model features and sound separation
- inference.py +187 -0
- utils.py +194 -0
inference.py
ADDED
@@ -0,0 +1,187 @@
import argparse
import time
import librosa
from tqdm import tqdm
import sys
import os
import glob
import torch
import numpy as np
import soundfile as sf
import torch.nn as nn

current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from utils import demix_track, demix_track_demucs, get_model_from_config

import warnings
warnings.filterwarnings("ignore")


def run_folder(model, args, config, device, verbose=False):
    start_time = time.time()
    model.eval()
    all_mixtures_path = glob.glob(args.input_folder + '/*.*')
    all_mixtures_path.sort()
    print('Total files found: {}'.format(len(all_mixtures_path)))

    instruments = config.training.instruments
    if config.training.target_instrument is not None:
        instruments = [config.training.target_instrument]

    if not os.path.isdir(args.store_dir):
        os.mkdir(args.store_dir)

    if not verbose:
        all_mixtures_path = tqdm(all_mixtures_path, desc="Total progress")

    if args.disable_detailed_pbar:
        detailed_pbar = False
    else:
        detailed_pbar = True

    for path in all_mixtures_path:
        print("Starting processing track: ", path)
        if not verbose:
            all_mixtures_path.set_postfix({'track': os.path.basename(path)})
        try:
            mix, sr = librosa.load(path, sr=44100, mono=False)
        except Exception as e:
            print('Cannot read track: {}'.format(path))
            print('Error message: {}'.format(str(e)))
            continue

        # Convert mono to stereo if needed
        if len(mix.shape) == 1:
            mix = np.stack([mix, mix], axis=0)

        mix_orig = mix.copy()
        if 'normalize' in config.inference:
            if config.inference['normalize'] is True:
                mono = mix.mean(0)
                mean = mono.mean()
                std = mono.std()
                mix = (mix - mean) / std

        if args.use_tta:
            # orig, channel inverse, polarity inverse
            track_proc_list = [mix.copy(), mix[::-1].copy(), -1. * mix.copy()]
        else:
            track_proc_list = [mix.copy()]

        full_result = []
        for single_track in track_proc_list:
            mixture = torch.tensor(single_track, dtype=torch.float32)
            if args.model_type == 'htdemucs':
                waveforms = demix_track_demucs(config, model, mixture, device, pbar=detailed_pbar)
            else:
                waveforms = demix_track(config, model, mixture, device, pbar=detailed_pbar)
            full_result.append(waveforms)

        # Average all TTA variants into a single dict, undoing each augmentation first
        waveforms = full_result[0]
        for i in range(1, len(full_result)):
            d = full_result[i]
            for el in d:
                if i == 2:
                    waveforms[el] += -1.0 * d[el]
                elif i == 1:
                    waveforms[el] += d[el][::-1].copy()
                else:
                    waveforms[el] += d[el]
        for el in waveforms:
            waveforms[el] = waveforms[el] / len(full_result)

        file_name, _ = os.path.splitext(os.path.basename(path))
        song_dir = os.path.join(args.store_dir, file_name)
        if not os.path.exists(song_dir):
            os.makedirs(song_dir)

        model_dir = os.path.join(song_dir, args.model_type)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        for instr in instruments:
            estimates = waveforms[instr].T
            if 'normalize' in config.inference:
                if config.inference['normalize'] is True:
                    estimates = estimates * std + mean
            if args.flac_file:
                output_file = os.path.join(model_dir, f"{file_name}_{instr}.flac")
                subtype = 'PCM_16' if args.pcm_type == 'PCM_16' else 'PCM_24'
                sf.write(output_file, estimates, sr, subtype=subtype)
            else:
                output_file = os.path.join(model_dir, f"{file_name}_{instr}.wav")
                sf.write(output_file, estimates, sr, subtype='FLOAT')

        # Output "instrumental": the mixture minus 'vocals' (or minus the first stem in the list if 'vocals' is absent)
        if args.extract_instrumental:
            if 'vocals' in instruments:
                estimates = waveforms['vocals'].T
            else:
                estimates = waveforms[instruments[0]].T
            if 'normalize' in config.inference:
                if config.inference['normalize'] is True:
                    estimates = estimates * std + mean
            if args.flac_file:
                instrum_file_name = os.path.join(model_dir, f"{file_name}_instrumental.flac")
                subtype = 'PCM_16' if args.pcm_type == 'PCM_16' else 'PCM_24'
                sf.write(instrum_file_name, mix_orig.T - estimates, sr, subtype=subtype)
            else:
                instrum_file_name = os.path.join(model_dir, f"{file_name}_instrumental.wav")
                sf.write(instrum_file_name, mix_orig.T - estimates, sr, subtype='FLOAT')

    time.sleep(1)
    print("Elapsed time: {:.2f} sec".format(time.time() - start_time))


def proc_folder_direct(model_type, config_path, start_check_point, input_folder, store_dir, device_ids=[0], extract_instrumental=False, disable_detailed_pbar=False, force_cpu=False, flac_file=False, pcm_type='PCM_24', use_tta=False):
    device = "cpu"
    if force_cpu:
        device = "cpu"
    elif torch.cuda.is_available():
        print('CUDA is available, use --force_cpu to disable it.')
        device = f'cuda:{device_ids}' if type(device_ids) == int else f'cuda:{device_ids[0]}'
    elif torch.backends.mps.is_available():
        device = "mps"

    print("Using device: ", device)

    model_load_start_time = time.time()
    torch.backends.cudnn.benchmark = True

    model, config = get_model_from_config(model_type, config_path)
    if start_check_point != '':
        print('Start from checkpoint: {}'.format(start_check_point))
        if model_type == 'htdemucs':
            state_dict = torch.load(start_check_point, map_location=device, weights_only=False)
            if 'state' in state_dict:
                state_dict = state_dict['state']
        else:
            state_dict = torch.load(start_check_point, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
    print("Instruments: {}".format(config.training.instruments))

    if type(device_ids) != int:
        model = nn.DataParallel(model, device_ids=device_ids)

    model = model.to(device)

    print("Model load time: {:.2f} sec".format(time.time() - model_load_start_time))

    args = argparse.Namespace(
        model_type=model_type,
        config_path=config_path,
        start_check_point=start_check_point,
        input_folder=input_folder,
        store_dir=store_dir,
        device_ids=device_ids,
        extract_instrumental=extract_instrumental,
        disable_detailed_pbar=disable_detailed_pbar,
        force_cpu=force_cpu,
        flac_file=flac_file,
        pcm_type=pcm_type,
        use_tta=use_tta
    )

    run_folder(model, args, config, device, verbose=True)
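Usage note: a minimal sketch of how the proc_folder_direct entry point might be invoked; the checkpoint path, config path, and folder names below are hypothetical placeholders, not files shipped with this commit.

from inference import proc_folder_direct

# Separate every track in ./input_songs with a (hypothetical) BS-Roformer checkpoint,
# writing one stem per instrument plus an "instrumental" to ./separated/<track>/<model_type>/
proc_folder_direct(
    model_type='bs_roformer',
    config_path='configs/config_bs_roformer.yaml',     # hypothetical path
    start_check_point='checkpoints/bs_roformer.ckpt',  # hypothetical path
    input_folder='input_songs',
    store_dir='separated',
    extract_instrumental=True,
    flac_file=True,
    pcm_type='PCM_24',
    use_tta=False,
)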
utils.py
ADDED
@@ -0,0 +1,194 @@
import time
import numpy as np
import torch
import torch.nn as nn
import yaml
from ml_collections import ConfigDict
from omegaconf import OmegaConf
from tqdm import tqdm


def get_model_from_config(model_type, config_path):
    if model_type == 'htdemucs':
        config = OmegaConf.load(config_path)
    else:
        with open(config_path) as f:
            config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))

    if model_type == 'htdemucs':
        from models.demucs4ht import get_model
        model = get_model(config)
    elif model_type == 'mel_band_roformer':
        from models.bs_roformer import MelBandRoformer
        model = MelBandRoformer(**dict(config.model))
    elif model_type == 'bs_roformer':
        from models.bs_roformer import BSRoformer
        model = BSRoformer(**dict(config.model))
    elif model_type == 'scnet':
        from models.scnet import SCNet
        model = SCNet(**dict(config.model))
    else:
        print('Unknown model: {}'.format(model_type))
        model = None

    return model, config


def _getWindowingArray(window_size, fade_size):
    # Linear fade-in/fade-out window used to crossfade overlapping chunks
    fadein = torch.linspace(0, 1, fade_size)
    fadeout = torch.linspace(1, 0, fade_size)
    window = torch.ones(window_size)
    window[-fade_size:] *= fadeout
    window[:fade_size] *= fadein
    return window


def demix_track(config, model, mix, device, pbar=False):
    C = config.audio.chunk_size
    N = config.inference.num_overlap
    fade_size = C // 10
    step = int(C // N)
    border = C - step
    batch_size = config.inference.batch_size

    length_init = mix.shape[-1]

    # Pad at both ends so the sliding window handles the track borders better
    if length_init > 2 * border and (border > 0):
        mix = nn.functional.pad(mix, (border, border), mode='reflect')

    # windowingArray crossfades at segment boundaries to mitigate clicking artifacts
    windowingArray = _getWindowingArray(C, fade_size)

    with torch.cuda.amp.autocast(enabled=config.training.use_amp):
        with torch.inference_mode():
            if config.training.target_instrument is not None:
                req_shape = (1, ) + tuple(mix.shape)
            else:
                req_shape = (len(config.training.instruments),) + tuple(mix.shape)

            result = torch.zeros(req_shape, dtype=torch.float32)
            counter = torch.zeros(req_shape, dtype=torch.float32)
            i = 0
            batch_data = []
            batch_locations = []
            progress_bar = tqdm(total=mix.shape[1], desc="Processing audio chunks", leave=False) if pbar else None

            while i < mix.shape[1]:
                part = mix[:, i:i + C].to(device)
                length = part.shape[-1]
                if length < C:
                    if length > C // 2 + 1:
                        part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
                    else:
                        part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)
                batch_data.append(part)
                batch_locations.append((i, length))
                i += step

                if len(batch_data) >= batch_size or (i >= mix.shape[1]):
                    arr = torch.stack(batch_data, dim=0)
                    x = model(arr)

                    # Clone so the edge-chunk tweaks below do not mutate the shared window
                    window = windowingArray.clone()
                    if i - step == 0:  # First audio chunk, no fadein
                        window[:fade_size] = 1
                    elif i >= mix.shape[1]:  # Last audio chunk, no fadeout
                        window[-fade_size:] = 1

                    for j in range(len(batch_locations)):
                        start, l = batch_locations[j]
                        result[..., start:start+l] += x[j][..., :l].cpu() * window[..., :l]
                        counter[..., start:start+l] += window[..., :l]

                    batch_data = []
                    batch_locations = []

                if progress_bar:
                    progress_bar.update(step)

            if progress_bar:
                progress_bar.close()

            estimated_sources = result / counter
            estimated_sources = estimated_sources.cpu().numpy()
            np.nan_to_num(estimated_sources, copy=False, nan=0.0)

            if length_init > 2 * border and (border > 0):
                # Remove pad
                estimated_sources = estimated_sources[..., border:-border]

            if config.training.target_instrument is None:
                return {k: v for k, v in zip(config.training.instruments, estimated_sources)}
            else:
                return {k: v for k, v in zip([config.training.target_instrument], estimated_sources)}


def demix_track_demucs(config, model, mix, device, pbar=False):
    S = len(config.training.instruments)
    C = config.training.samplerate * config.training.segment
    N = config.inference.num_overlap
    batch_size = config.inference.batch_size
    step = C // N

    with torch.cuda.amp.autocast(enabled=config.training.use_amp):
        with torch.inference_mode():
            req_shape = (S, ) + tuple(mix.shape)
            result = torch.zeros(req_shape, dtype=torch.float32)
            counter = torch.zeros(req_shape, dtype=torch.float32)
            i = 0
            batch_data = []
            batch_locations = []
            progress_bar = tqdm(total=mix.shape[1], desc="Processing audio chunks", leave=False) if pbar else None

            while i < mix.shape[1]:
                part = mix[:, i:i + C].to(device)
                length = part.shape[-1]
                if length < C:
                    part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)
                batch_data.append(part)
                batch_locations.append((i, length))
                i += step

                if len(batch_data) >= batch_size or (i >= mix.shape[1]):
                    arr = torch.stack(batch_data, dim=0)
                    x = model(arr)
                    for j in range(len(batch_locations)):
                        start, l = batch_locations[j]
                        result[..., start:start+l] += x[j][..., :l].cpu()
                        counter[..., start:start+l] += 1.
                    batch_data = []
                    batch_locations = []

                if progress_bar:
                    progress_bar.update(step)

            if progress_bar:
                progress_bar.close()

            estimated_sources = result / counter
            estimated_sources = estimated_sources.cpu().numpy()
            np.nan_to_num(estimated_sources, copy=False, nan=0.0)

    if S > 1:
        return {k: v for k, v in zip(config.training.instruments, estimated_sources)}
    else:
        return estimated_sources


def sdr(references, estimates):
    # Compute SDR (signal-to-distortion ratio, in dB) for one song, one value per source
    delta = 1e-7  # avoid numerical errors
    num = np.sum(np.square(references), axis=(1, 2))
    den = np.sum(np.square(references - estimates), axis=(1, 2))
    num += delta
    den += delta
    return 10 * np.log10(num / den)
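Usage note: a small self-contained sketch of the sdr metric on toy arrays, to make the expected shapes concrete. References and estimates are stacked as (num_sources, channels, samples), and the function returns one SDR value in dB per source; the random data here is purely illustrative.

import numpy as np
from utils import sdr

rng = np.random.default_rng(0)
references = rng.standard_normal((2, 2, 44100))  # 2 sources, stereo, 1 s at 44.1 kHz
estimates = references + 0.1 * rng.standard_normal(references.shape)  # noisy estimates

print(sdr(references, estimates))  # array of 2 dB values; higher means a closer estimate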