kindahex
/

hexgen-rvc

Model card Files Files and versions Community

hexgen-rvc / programs /music_separation_code /inference.py

nevreal

Upload Completed files

ecfa0da verified about 2 months ago

raw

history blame contribute delete

8.01 kB

	# coding: utf-8
	__author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/"

	import argparse
	import time
	import librosa
	from tqdm import tqdm
	import sys
	import os
	import glob
	import torch
	import numpy as np
	import soundfile as sf
	import torch.nn as nn

	# Add this file's directory to sys.path so that `utils` can be imported even when running under an embedded Python distribution.
	current_dir = os.path.dirname(os.path.abspath(__file__))
	sys.path.append(current_dir)
	from utils import demix, get_model_from_config

	import warnings

	warnings.filterwarnings("ignore")


	class Args:
	    """Plain attribute container mirroring the inference options.

	    Every constructor argument is stored unchanged under an attribute of
	    the same name, so an instance can be passed wherever an argparse-style
	    namespace with these fields is expected.
	    """

	    def __init__(
	        self,
	        input_file,
	        store_dir,
	        model_type,
	        extract_instrumental,
	        disable_detailed_pbar,
	        flac_file,
	        pcm_type,
	        use_tta,
	    ):
	        # Input, model and output location.
	        self.input_file, self.model_type, self.store_dir = (
	            input_file,
	            model_type,
	            store_dir,
	        )
	        # Behaviour switches.
	        self.extract_instrumental = extract_instrumental
	        self.disable_detailed_pbar = disable_detailed_pbar
	        # Output encoding.
	        self.flac_file, self.pcm_type = flac_file, pcm_type
	        # Test-time augmentation toggle.
	        self.use_tta = use_tta


	def run_file(model, args, config, device, verbose=False):
	    """Separate one audio file into stems and write every stem to disk.

	    The track is loaded at 44.1 kHz (mono input is duplicated to two
	    channels), optionally normalized, passed through ``demix`` once -- or
	    three times when test-time augmentation is enabled -- and each stem is
	    written to ``args.store_dir`` as ``<input name>_<stem>.wav`` (float) or
	    ``<input name>_<stem>.flac`` (PCM_16 / PCM_24).

	    Args:
	        model: Model object exposing ``eval()``; forwarded to ``demix``.
	        args: Object with ``input_file``, ``store_dir``, ``model_type``,
	            ``extract_instrumental``, ``flac_file``, ``pcm_type`` and
	            ``use_tta`` attributes.
	        config: Config exposing ``training.instruments``,
	            ``training.target_instrument`` and ``inference``.
	        device: Device identifier forwarded to ``demix``.
	        verbose: Forwarded to ``demix`` as its ``pbar`` argument.

	    Returns:
	        None. A missing or unreadable input file is reported with ``print``
	        and the function returns early instead of raising.
	    """
	    start_time = time.time()
	    model.eval()

	    if not os.path.isfile(args.input_file):
	        print("File not found: {}".format(args.input_file))
	        return

	    # Work on a private list so appending "instrumental" never touches config.
	    instruments = list(config.training.instruments)
	    if config.training.target_instrument is not None:
	        instruments = [config.training.target_instrument]

	    # An empty store_dir means "current directory"; os.mkdir("") would raise.
	    # makedirs also covers nested output paths.
	    if args.store_dir:
	        os.makedirs(args.store_dir, exist_ok=True)

	    print("Starting processing track: ", args.input_file)
	    try:
	        mix, sr = librosa.load(args.input_file, sr=44100, mono=False)
	    except Exception as e:
	        print("Cannot read track: {}".format(args.input_file))
	        print("Error message: {}".format(str(e)))
	        return

	    # Convert mono to stereo if needed
	    if len(mix.shape) == 1:
	        mix = np.stack([mix, mix], axis=0)

	    mix_orig = mix.copy()

	    normalize = (
	        "normalize" in config.inference and config.inference["normalize"] is True
	    )
	    mean, std = 0.0, 1.0
	    if normalize:
	        mono = mix.mean(0)
	        mean = mono.mean()
	        std = mono.std()
	        if not std > 0:
	            # Silent or constant track: dividing by zero would fill the
	            # signal with NaN/inf, so only remove the offset.
	            std = 1.0
	        mix = (mix - mean) / std

	    # Each pass pairs an augmented mixture with the function that maps the
	    # model output back to the original orientation.
	    tta_passes = [(mix.copy(), lambda w: w)]
	    if args.use_tta:
	        # channel inverse, polarity inverse
	        tta_passes.append((mix[::-1].copy(), lambda w: w[::-1]))
	        tta_passes.append((-1.0 * mix, lambda w: -1.0 * w))

	    # Sum the restored predictions of every pass, then average them.
	    waveforms = {}
	    for track, undo in tta_passes:
	        result = demix(
	            config, model, track, device, pbar=verbose, model_type=args.model_type
	        )
	        for stem in result:
	            restored = undo(result[stem])
	            if stem in waveforms:
	                waveforms[stem] = waveforms[stem] + restored
	            else:
	                waveforms[stem] = restored
	    for stem in waveforms:
	        waveforms[stem] = waveforms[stem] / len(tta_passes)

	    # Bring the stems back to the scale of the input *before* deriving the
	    # instrumental: subtracting a still-normalized stem from the original
	    # mixture (and denormalizing the difference afterwards) gives a wrong
	    # signal whenever normalization is enabled.
	    if normalize:
	        waveforms = {stem: waveforms[stem] * std + mean for stem in waveforms}

	    if args.extract_instrumental:
	        # "instrumental" is the mixture minus 'vocals', or minus the first
	        # stem in the list if 'vocals' is absent.
	        instr = "vocals" if "vocals" in instruments else instruments[0]
	        waveforms["instrumental"] = mix_orig - waveforms[instr]
	        if "instrumental" not in instruments:
	            instruments.append("instrumental")

	    file_name, _ = os.path.splitext(os.path.basename(args.input_file))
	    for instr in instruments:
	        # Transposed before writing; assumes stems share the layout of
	        # `mix` (channels first) -- TODO confirm against demix.
	        estimates = waveforms[instr].T
	        if args.flac_file:
	            output_file = os.path.join(args.store_dir, f"{file_name}_{instr}.flac")
	            subtype = "PCM_16" if args.pcm_type == "PCM_16" else "PCM_24"
	            sf.write(output_file, estimates, sr, subtype=subtype)
	        else:
	            output_file = os.path.join(args.store_dir, f"{file_name}_{instr}.wav")
	            sf.write(output_file, estimates, sr, subtype="FLOAT")

	    # NOTE(review): pause kept from the original; its purpose is not visible
	    # here (possibly to let progress-bar output settle) and it is included
	    # in the elapsed time printed below.
	    time.sleep(1)
	    print("Elapsed time: {:.2f} sec".format(time.time() - start_time))


	def _build_arg_parser():
	    """Create the command-line parser for single-file inference."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--model_type",
	        type=str,
	        default="mdx23c",
	        help="One of bandit, bandit_v2, bs_roformer, htdemucs, mdx23c, mel_band_roformer, scnet, scnet_unofficial, segm_models, swin_upernet, torchseg",
	    )
	    parser.add_argument("--config_path", type=str, help="path to config file")
	    parser.add_argument(
	        "--start_check_point",
	        type=str,
	        default="",
	        help="Initial checkpoint to valid weights",
	    )
	    parser.add_argument(
	        "--input_file", type=str, help="path to the audio file (mixture) to process"
	    )
	    parser.add_argument(
	        "--store_dir", default="", type=str, help="path to store results as wav file"
	    )
	    parser.add_argument(
	        "--device_ids", nargs="+", type=int, default=0, help="list of gpu ids"
	    )
	    parser.add_argument(
	        "--extract_instrumental",
	        action="store_true",
	        help="invert vocals to get instrumental if provided",
	    )
	    parser.add_argument(
	        "--disable_detailed_pbar",
	        action="store_true",
	        help="disable detailed progress bar",
	    )
	    parser.add_argument(
	        "--force_cpu",
	        action="store_true",
	        help="Force the use of CPU even if CUDA is available",
	    )
	    parser.add_argument(
	        "--flac_file", action="store_true", help="Output flac file instead of wav"
	    )
	    parser.add_argument(
	        "--pcm_type",
	        type=str,
	        choices=["PCM_16", "PCM_24"],
	        default="PCM_24",
	        help="PCM type for FLAC files (PCM_16 or PCM_24)",
	    )
	    parser.add_argument(
	        "--use_tta",
	        action="store_true",
	        help="Flag adds test time augmentation during inference (polarity and channel inverse). While this triples the runtime, it reduces noise and slightly improves prediction quality.",
	    )
	    return parser


	def _select_device(args):
	    """Return the torch device string chosen from the parsed options."""
	    if args.force_cpu:
	        return "cpu"
	    if torch.cuda.is_available():
	        print("CUDA is available, use --force_cpu to disable it.")
	        # --device_ids is a list when given on the command line and the
	        # plain int default (0) otherwise.
	        if isinstance(args.device_ids, list):
	            return f"cuda:{args.device_ids[0]}"
	        return f"cuda:{args.device_ids}"
	    if torch.backends.mps.is_available():
	        return "mps"
	    return "cpu"


	def proc_file(args):
	    """Parse options, load the model and run separation on one file.

	    Args:
	        args: List of command-line tokens to parse, or ``None`` to read
	            them from ``sys.argv``.

	    Returns:
	        None. Results are written to disk by ``run_file``.
	    """
	    # parse_args(None) falls back to sys.argv, so no branching is needed.
	    args = _build_arg_parser().parse_args(args)

	    device = _select_device(args)
	    print("Using device: ", device)

	    model_load_start_time = time.time()
	    torch.backends.cudnn.benchmark = True

	    model, config = get_model_from_config(args.model_type, args.config_path)
	    if args.start_check_point != "":
	        print("Start from checkpoint: {}".format(args.start_check_point))
	        if args.model_type == "htdemucs":
	            # SECURITY NOTE: weights_only=False unpickles arbitrary objects;
	            # only load htdemucs checkpoints from a trusted source.
	            state_dict = torch.load(
	                args.start_check_point, map_location=device, weights_only=False
	            )
	            # Fix for htdemucs pretrained models
	            if "state" in state_dict:
	                state_dict = state_dict["state"]
	        else:
	            state_dict = torch.load(
	                args.start_check_point, map_location=device, weights_only=True
	            )
	        model.load_state_dict(state_dict)
	    print("Instruments: {}".format(config.training.instruments))

	    # In case multiple CUDA GPUs are used and --device_ids arg is passed.
	    # Only wrap when a CUDA device was actually selected.
	    if (
	        device.startswith("cuda")
	        and isinstance(args.device_ids, list)
	        and len(args.device_ids) > 1
	    ):
	        model = nn.DataParallel(model, device_ids=args.device_ids)

	    model = model.to(device)

	    print("Model load time: {:.2f} sec".format(time.time() - model_load_start_time))

	    # Honour --disable_detailed_pbar, which was previously parsed but ignored.
	    run_file(model, args, config, device, verbose=not args.disable_detailed_pbar)


	# Script entry point: passing None makes proc_file take its options from the command line.
	if __name__ == "__main__":
	    proc_file(args=None)