r3gm committed on
Commit
e86b53b
·
verified ·
1 Parent(s): b0967c6

Upload 7 files

Browse files
Files changed (7) hide show
  1. README.md +6 -5
  2. app.py +805 -0
  3. mdx_models/data.json +354 -0
  4. packages.txt +1 -0
  5. requirements.txt +3 -0
  6. test.mp3 +0 -0
  7. utils.py +142 -0
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Audio Separator
3
- emoji: 🌍
4
  colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.28.3
8
  app_file: app.py
9
- pinned: false
10
  license: mit
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Vocal-Instrumental Audio Separator
3
+ emoji: 🏃
4
  colorFrom: purple
5
+ colorTo: pink
6
  sdk: gradio
7
  sdk_version: 4.28.3
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
+ short_description: Vocal and background audio separator
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # os.system("pip install ./ort_nightly_gpu-1.17.0.dev20240118002-cp310-cp310-manylinux_2_28_x86_64.whl")
3
+ os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
4
+ import gc
5
+ import hashlib
6
+ import queue
7
+ import threading
8
+ import json
9
+ import shlex
10
+ import sys
11
+ import subprocess
12
+ import librosa
13
+ import numpy as np
14
+ import soundfile as sf
15
+ import torch
16
+ from tqdm import tqdm
17
+ from utils import (
18
+ remove_directory_contents,
19
+ create_directories,
20
+ download_manager,
21
+ )
22
+ import random
23
+ import spaces
24
+ from utils import logger
25
+ import onnxruntime as ort
26
+ import warnings
27
+ import spaces
28
+ import gradio as gr
29
+ import logging
30
+ import time
31
+
32
# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Static text and theme used when building the Gradio interface.
title = "<center><strong><font size='7'>Audio🔹separator</font></strong></center>"
description = "This demo uses the MDX-Net models for vocal and background sound separation."
theme = "NoCrypt/miku"

# Maps a model's primary stem name to the name given to the complementary
# (inverted) stem written alongside it.
stem_naming = dict(
    Vocals="Instrumental",
    Other="Instruments",
    Instrumental="Vocals",
    Drums="Drumless",
    Bass="Bassless",
)
45
+
46
+
47
class MDXModel:
    """STFT / inverse-STFT helper holding the spectrogram geometry of a model.

    Attributes set here: ``dim_f`` (frequency bins kept), ``dim_t`` (frames
    per chunk), ``dim_c`` (fixed at 4 channels), ``n_fft``, ``hop``,
    ``n_bins`` (``n_fft // 2 + 1``), ``chunk_size`` (``hop * (dim_t - 1)``
    samples), the Hann ``window`` and a zero ``freq_pad`` tensor used to
    restore the bins dropped by :meth:`stft`.
    """

    def __init__(
        self,
        device,
        dim_f,
        dim_t,
        n_fft,
        hop=1024,
        stem_name=None,
        compensation=1.000,
    ):
        self.stem_name = stem_name
        self.compensation = compensation

        self.dim_c = 4
        self.dim_f = dim_f
        self.dim_t = dim_t
        self.n_fft = n_fft
        self.hop = hop

        # Derived sizes.
        self.n_bins = n_fft // 2 + 1
        self.chunk_size = hop * (dim_t - 1)

        self.window = torch.hann_window(window_length=n_fft, periodic=True).to(
            device
        )

        # Zeros for the bins above dim_f, appended again in istft().
        pad_shape = [1, self.dim_c, self.n_bins - dim_f, dim_t]
        self.freq_pad = torch.zeros(pad_shape).to(device)

    def stft(self, x):
        """Return the 4-channel real-valued spectrogram of ``x``.

        ``x`` is flattened to rows of ``chunk_size`` samples, transformed
        with ``torch.stft`` and the real/imaginary parts are folded into the
        channel axis. Only the first ``dim_f`` frequency bins are returned.
        """
        rows = x.reshape([-1, self.chunk_size])
        spec = torch.stft(
            rows,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        spec = torch.view_as_real(spec).permute([0, 3, 1, 2])
        spec = spec.reshape([-1, 2, 2, self.n_bins, self.dim_t])
        spec = spec.reshape([-1, 4, self.n_bins, self.dim_t])
        return spec[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        """Invert :meth:`stft`, returning a ``[-1, 2, chunk_size]`` tensor.

        Args:
            x: Spectrogram limited to ``dim_f`` bins.
            freq_pad: Optional tensor concatenated along the frequency axis
                to get back to ``n_bins`` bins; defaults to the stored zero
                padding repeated once per batch item.
        """
        if freq_pad is None:
            freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1])

        full = torch.cat([x, freq_pad], -2)
        full = full.reshape([-1, 2, 2, self.n_bins, self.dim_t])
        full = full.reshape([-1, 2, self.n_bins, self.dim_t])
        # view_as_complex needs the (real, imag) pair last and contiguous.
        full = full.permute([0, 2, 3, 1]).contiguous()
        wave = torch.istft(
            torch.view_as_complex(full),
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
        )
        return wave.reshape([-1, 2, self.chunk_size])
117
+
118
+
119
class MDX:
    """Run an MDX-Net ONNX model over a waveform, chunk by chunk.

    The instance owns an ONNX Runtime session plus an ``MDXModel`` that
    supplies the STFT geometry. ``process_wave`` is the entry point.
    """

    DEFAULT_SR = 44100
    # Unit: seconds
    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR

    def __init__(self, model_path: str, params: "MDXModel", processor=0):
        """Load the ONNX model and warm it up.

        Args:
            model_path: Path of the ``.onnx`` file.
            params: ``MDXModel`` describing the spectrogram layout.
            processor: CUDA device index, or a negative value for CPU.
        """
        # Set the device and the provider (CPU or CUDA)
        use_cuda = processor >= 0
        self.device = (
            torch.device(f"cuda:{processor}")
            if use_cuda
            else torch.device("cpu")
        )
        self.provider = (
            ["CUDAExecutionProvider"] if use_cuda else ["CPUExecutionProvider"]
        )

        self.model = params

        # Load the ONNX model using ONNX Runtime
        self.ort = ort.InferenceSession(model_path, providers=self.provider)
        # Preload the model for faster performance
        self.ort.run(
            None,
            {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()},
        )
        self.process = lambda spec: self.ort.run(
            None, {"input": spec.cpu().numpy()}
        )[0]

        # tqdm progress bar, created per call in process_wave().
        self.prog = None

    @staticmethod
    def get_hash(model_path):
        """Return the MD5 hex digest identifying a model file.

        Only the last ``10000 * 1024`` bytes are hashed; files too small for
        that seek are hashed in full.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(model_path, "rb") as f:
            try:
                f.seek(-10000 * 1024, 2)
            except OSError:
                # File is shorter than the tail window: hash all of it.
                f.seek(0)
            return hashlib.md5(f.read()).hexdigest()

    @staticmethod
    def segment(
        wave,
        combine=True,
        chunk_size=DEFAULT_CHUNK_SIZE,
        margin_size=DEFAULT_MARGIN_SIZE,
    ):
        """
        Segment or join segmented wave array

        Args:
            wave: (np.array) Wave array to be segmented, or the list of
                segments to be joined
            combine: (bool) If True, combines segmented wave array.
                            If False, segments wave array.
            chunk_size: (int) Size of each segment (in samples); values
                <= 0 or larger than the wave mean "one single segment"
            margin_size: (int) Size of margin between segments (in samples)

        Returns:
            When splitting, a list of arrays; when combining, one array
            (``None`` if there were no segments to combine).
        """
        if combine:
            # Initializing as None instead of [] for later numpy array concatenation
            processed_wave = None
            last_index = len(wave) - 1
            for segment_count, segment in enumerate(wave):
                # Drop the overlap margins that were added while splitting.
                start = 0 if segment_count == 0 else margin_size
                end = None if segment_count == last_index else -margin_size
                if margin_size == 0:
                    end = None
                piece = segment[:, start:end]
                if processed_wave is None:  # Create array for first segment
                    processed_wave = piece
                else:  # Concatenate to existing array for subsequent segments
                    processed_wave = np.concatenate(
                        (processed_wave, piece), axis=-1
                    )
            return processed_wave

        processed_wave = []
        sample_count = wave.shape[-1]

        if chunk_size <= 0 or chunk_size > sample_count:
            chunk_size = sample_count

        if margin_size > chunk_size:
            margin_size = chunk_size

        for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):
            margin = 0 if segment_count == 0 else margin_size
            end = min(skip + chunk_size + margin_size, sample_count)
            start = skip - margin

            processed_wave.append(wave[:, start:end].copy())

            if end == sample_count:
                break

        return processed_wave

    def pad_wave(self, wave):
        """
        Pad the wave array to match the required chunk size

        Args:
            wave: (np.array) Wave array to be padded; two rows are expected
                because the padding blocks are created with two rows

        Returns:
            tuple: (mix_waves, pad, trim)
                - mix_waves: float32 tensor of overlapping windows of
                  ``model.chunk_size`` samples, placed on ``self.device``
                - pad: Number of zero samples appended after the signal
                - trim: Number of zero samples added on each side
                  (``n_fft // 2``)
        """
        n_sample = wave.shape[1]
        trim = self.model.n_fft // 2
        gen_size = self.model.chunk_size - 2 * trim
        pad = gen_size - n_sample % gen_size

        # Padded wave
        wave_p = np.concatenate(
            (
                np.zeros((2, trim)),
                wave,
                np.zeros((2, pad)),
                np.zeros((2, trim)),
            ),
            1,
        )

        windows = [
            wave_p[:, i:i + self.model.chunk_size]
            for i in range(0, n_sample + pad, gen_size)
        ]
        # Stack first: building a tensor straight from a list of ndarrays
        # goes through a slow element-wise conversion path.
        mix_waves = torch.tensor(np.stack(windows), dtype=torch.float32).to(
            self.device
        )

        return mix_waves, pad, trim

    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
        """
        Process each wave segment in a multi-threaded environment

        Args:
            mix_waves: (torch.Tensor) Wave segments to be processed
            trim: (int) Number of samples trimmed during padding
            pad: (int) Number of samples padded during padding
            q: (queue.Queue) Queue to hold the processed wave segments
            _id: (int) Identifier of the processed wave segment

        Returns:
            numpy array: Processed wave segment (also put on ``q`` as
            ``{_id: segment}``)
        """
        mix_waves = mix_waves.split(1)
        with torch.no_grad():
            pw = []
            for mix_wave in mix_waves:
                self.prog.update()
                spec = self.model.stft(mix_wave)
                processed_spec = torch.tensor(self.process(spec))
                processed_wav = self.model.istft(
                    processed_spec.to(self.device)
                )
                # Remove the n_fft // 2 guard samples on both sides.
                processed_wav = (
                    processed_wav[:, :, trim:-trim]
                    .transpose(0, 1)
                    .reshape(2, -1)
                    .cpu()
                    .numpy()
                )
                pw.append(processed_wav)
        # pad is always >= 1 (see pad_wave), so the slice never empties.
        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
        q.put({_id: processed_signal})
        return processed_signal

    def process_wave(self, wave: np.array, mt_threads=1):
        """
        Process the wave array in a multi-threaded environment

        Args:
            wave: (np.array) Wave array to be processed
            mt_threads: (int) Number of threads to be used for processing

        Returns:
            numpy array: Processed wave array
        """
        self.prog = tqdm(total=0)
        chunk = wave.shape[-1] // mt_threads
        waves = self.segment(wave, False, chunk)

        # Create a queue to hold the processed wave segments
        q = queue.Queue()
        threads = []
        for c, batch in enumerate(waves):
            mix_waves, pad, trim = self.pad_wave(batch)
            self.prog.total = len(mix_waves) * mt_threads
            thread = threading.Thread(
                target=self._process_wave, args=(mix_waves, trim, pad, q, c)
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        self.prog.close()

        processed_batches = []
        while not q.empty():
            processed_batches.append(q.get())
        # Threads finish in arbitrary order; restore segment order by id.
        processed_batches = [
            list(item.values())[0]
            for item in sorted(
                processed_batches, key=lambda d: list(d.keys())[0]
            )
        ]
        assert len(processed_batches) == len(
            waves
        ), "Incomplete processed batches, please reduce batch size!"
        return self.segment(processed_batches, True, chunk)
346
+
347
+
348
@spaces.GPU()
def run_mdx(
    model_params,
    output_dir,
    model_path,
    filename,
    exclude_main=False,
    exclude_inversion=False,
    suffix=None,
    invert_suffix=None,
    denoise=False,
    keep_orig=True,
    m_threads=2,
    device_base="cuda",
):
    """Separate one audio file with one MDX-Net model and write the stems.

    Args:
        model_params: Mapping of model hash -> parameter dict (the parsed
            ``data.json``).
        output_dir: Directory receiving the ``.wav`` results.
        model_path: Path of the ONNX model.
        filename: Audio file to process (loaded at 44100 Hz, not mixed down).
        exclude_main: Skip writing the model's primary stem.
        exclude_inversion: Skip writing the complementary stem.
        suffix: File-name suffix for the primary stem; defaults to the
            model's ``primary_stem``.
        invert_suffix: File-name suffix for the complementary stem; defaults
            to the ``stem_naming`` entry, else ``"<stem>_diff"``.
        denoise: Average the result with that of the sign-flipped input.
        keep_orig: When False, delete ``filename`` afterwards.
        m_threads: Ignored in practice: the value is recomputed from the
            detected VRAM (CUDA) or forced to 1 (CPU).
        device_base: ``"cuda"`` or anything else for CPU.

    Returns:
        tuple: ``(main_filepath, invert_filepath)``; an entry is ``None``
        when that stem was excluded.

    Raises:
        ValueError: If the model hash has no usable entry in
            ``model_params``.
    """
    if device_base == "cuda":
        device = torch.device("cuda:0")
        processor_num = 0
        device_properties = torch.cuda.get_device_properties(device)
        vram_gb = device_properties.total_memory / 1024**3
        m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
        logger.info(f"threads: {m_threads} vram: {vram_gb}")
    else:
        device = torch.device("cpu")
        processor_num = -1
        m_threads = 1

    model_hash = MDX.get_hash(model_path)
    mp = model_params.get(model_hash)
    # Entries that only carry "config_yaml" are not MDX-Net ONNX settings.
    if mp is None or "mdx_dim_f_set" not in mp:
        raise ValueError(
            f"No MDX parameters found for model {model_path!r} "
            f"(hash {model_hash})"
        )
    model = MDXModel(
        device,
        dim_f=mp["mdx_dim_f_set"],
        dim_t=2 ** mp["mdx_dim_t_set"],
        n_fft=mp["mdx_n_fft_scale_set"],
        stem_name=mp["primary_stem"],
        compensation=mp["compensate"],
    )

    mdx_sess = MDX(model_path, model, processor=processor_num)
    wave, sr = librosa.load(filename, mono=False, sr=44100)
    # normalizing input wave gives better output
    peak = max(np.max(wave), abs(np.min(wave)))
    if peak > 0:
        wave /= peak
    else:
        # Silent input: dividing would fill the signal with NaN.
        peak = 1.0
    if denoise:
        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
            mdx_sess.process_wave(wave, m_threads)
        )
        wave_processed *= 0.5
    else:
        wave_processed = mdx_sess.process_wave(wave, m_threads)
    # return to previous peak; the input is restored too so the inversion
    # below subtracts two signals that are on the same scale
    wave_processed *= peak
    wave *= peak

    stem_name = model.stem_name if suffix is None else suffix
    base_name = os.path.basename(os.path.splitext(filename)[0])

    main_filepath = None
    if not exclude_main:
        main_filepath = os.path.join(
            output_dir,
            f"{base_name}_{stem_name}.wav",
        )
        sf.write(main_filepath, wave_processed.T, sr)

    invert_filepath = None
    if not exclude_inversion:
        diff_stem_name = (
            stem_naming.get(stem_name)
            if invert_suffix is None
            else invert_suffix
        )
        stem_name = (
            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
        )
        invert_filepath = os.path.join(
            output_dir,
            f"{base_name}_{stem_name}.wav",
        )
        # Complementary stem = input minus the compensated primary stem.
        sf.write(
            invert_filepath,
            (-wave_processed.T * model.compensation) + wave.T,
            sr,
        )

    if not keep_orig:
        os.remove(filename)

    # Free the session and large arrays before the next model is loaded.
    del mdx_sess, wave_processed, wave
    gc.collect()
    torch.cuda.empty_cache()
    return main_filepath, invert_filepath
437
+
438
+
439
# Base URL the model files are downloaded from (kept with its trailing slash).
MDX_DOWNLOAD_LINK = (
    "https://github.com/TRvlvr/model_repo/releases/download/"
    "all_public_uvr_models/"
)

# File names of the ONNX models fetched at start-up.
UVR_MODELS = [
    "UVR-MDX-NET-Voc_FT.onnx",
    "UVR_MDXNET_KARA_2.onnx",
    "Reverb_HQ_By_FoxJoy.onnx",
    "UVR-MDX-NET-Inst_HQ_4.onnx",
]

# Paths are relative to the current working directory.
BASE_DIR = "."
mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
output_dir = os.path.join(BASE_DIR, "clean_song_output")
449
+
450
+
451
def convert_to_stereo_and_wav(audio_path):
    """Return a path to a stereo WAV version of ``audio_path``.

    The input is returned unchanged when it already loads with more than one
    channel and its name ends in ``.wav``. Otherwise ffmpeg writes
    ``<name>_stereo.wav`` (joined onto ``output_dir``) and that path is
    returned.

    Raises:
        Exception: If ffmpeg exits with a non-zero status or the output file
            is missing; ffmpeg's stderr is included in the message.
    """
    wave, _ = librosa.load(audio_path, mono=False, sr=44100)

    # A mono file loads as a 1-D array, so its first element is a scalar.
    is_mono = not isinstance(wave[0], np.ndarray)
    is_wav = audio_path.lower().endswith(".wav")
    if is_wav and not is_mono:
        return audio_path

    stereo_path = f"{os.path.splitext(audio_path)[0]}_stereo.wav"
    stereo_path = os.path.join(output_dir, stereo_path)

    # Pass an argument list so paths containing quotes or spaces survive
    # intact (no string building followed by shlex re-splitting).
    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", audio_path,
        "-ac", "2",
        "-f", "wav",
        stereo_path,
    ]
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        creationflags=(
            subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0
        ),
    )
    if result.returncode != 0 or not os.path.exists(stereo_path):
        detail = result.stderr.decode(errors="replace").strip()
        message = "Error processing audio to stereo wav"
        raise Exception(f"{message}: {detail}" if detail else message)

    return stereo_path
477
+
478
+
479
def get_hash(filepath):
    """Return the first 18 hex characters of the file's BLAKE2b digest.

    The file is read in 8192-byte blocks so it never has to fit in memory.
    """
    digest = hashlib.blake2b()
    with open(filepath, 'rb') as stream:
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)

    return digest.hexdigest()[:18]
486
+
487
def random_sleep():
    """Block for a random duration between 5.2 and 7.9 s (0.1 s steps)."""
    time.sleep(round(random.uniform(5.2, 7.9), 1))
490
+
491
def _run_mdx_with_quota_retry(*args, **kwargs):
    """Call ``run_mdx``; on a "0:00:" error wait about a minute and retry once.

    NOTE(review): the "0:00:" marker presumably matches the remaining-time
    text of a GPU quota error - confirm against the hosting platform.
    Any other exception is re-raised unchanged.
    """
    try:
        return run_mdx(*args, **kwargs)
    except Exception as e:
        if "0:00:" not in str(e):
            raise
        gr.Info("Waiting 60 seconds for GPU quota")
        time.sleep(56)
        random_sleep()
        return run_mdx(*args, **kwargs)


def process_uvr_task(
    orig_song_path: str = "aud_test.mp3",
    main_vocals: bool = False,
    dereverb: bool = True,
    song_id: str = "mdx",  # folder output name
    only_voiceless: bool = False,
    remove_files_output_dir: bool = False,
):
    """Run the separation pipeline on one song.

    Args:
        orig_song_path: Audio file to process.
        main_vocals: Also split the vocals into backup and main voices.
        dereverb: Also produce a de-reverberated vocal track.
        song_id: Name of the sub-folder of ``output_dir`` receiving results.
        only_voiceless: Only produce the voiceless track and return early.
        remove_files_output_dir: Empty ``output_dir`` before starting.

    Returns:
        When ``only_voiceless`` is True, the 2-tuple returned by ``run_mdx``
        (voiceless path, ``None``). Otherwise the 5-tuple
        ``(vocals_path, instrumentals_path, backup_vocals_path,
        main_vocals_path, vocals_dereverb_path)``; optional stages that were
        not requested fall back to the previous stage's path
        (``backup_vocals_path`` is ``None``).
    """
    device_base = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Device: {device_base}")

    if remove_files_output_dir:
        remove_directory_contents(output_dir)

    with open(os.path.join(mdxnet_models_dir, "data.json")) as infile:
        mdx_model_params = json.load(infile)

    song_output_dir = os.path.join(output_dir, song_id)
    create_directories(song_output_dir)
    orig_song_path = convert_to_stereo_and_wav(orig_song_path)

    logger.info(f"onnxruntime device >> {ort.get_device()}")

    if only_voiceless:
        logger.info("Voiceless Track Separation...")
        return run_mdx(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
            orig_song_path,
            suffix="Voiceless",
            denoise=False,
            keep_orig=True,
            exclude_inversion=True,
            device_base=device_base,
        )

    logger.info("Vocal Track Isolation...")
    vocals_path, instrumentals_path = run_mdx(
        mdx_model_params,
        song_output_dir,
        os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
        orig_song_path,
        denoise=True,
        keep_orig=True,
        device_base=device_base,
    )

    if main_vocals:
        # Pause between GPU calls before starting the next stage.
        random_sleep()
        msg_main = "Main Voice Separation from Supporting Vocals..."
        logger.info(msg_main)
        gr.Info(msg_main)
        backup_vocals_path, main_vocals_path = _run_mdx_with_quota_retry(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
            vocals_path,
            suffix="Backup",
            invert_suffix="Main",
            denoise=True,
            device_base=device_base,
        )
    else:
        backup_vocals_path, main_vocals_path = None, vocals_path

    if dereverb:
        random_sleep()
        msg_dereverb = "Vocal Clarity Enhancement through De-Reverberation..."
        logger.info(msg_dereverb)
        gr.Info(msg_dereverb)
        _, vocals_dereverb_path = _run_mdx_with_quota_retry(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
            main_vocals_path,
            invert_suffix="DeReverb",
            exclude_main=True,
            denoise=True,
            device_base=device_base,
        )
    else:
        vocals_dereverb_path = main_vocals_path

    return (
        vocals_path,
        instrumentals_path,
        backup_vocals_path,
        main_vocals_path,
        vocals_dereverb_path,
    )
619
+
620
+
621
def sound_separate(media_file, stem, main, dereverb):
    """Gradio callback: separate ``media_file`` and return the result paths.

    Args:
        media_file: Path of the uploaded audio.
        stem: ``"vocal"`` or ``"background"``.
        main: Forwarded as ``main_vocals`` for the vocal pipeline.
        dereverb: Forwarded as ``dereverb`` for the vocal pipeline.

    Returns:
        list: One output file path.

    Raises:
        ValueError: If no file or no stem was supplied.
        Exception: If no output was produced. Errors of the vocal pipeline
            are logged and shown through ``gr.Info`` rather than propagated.
    """
    if not media_file:
        raise ValueError("The audio pls")
    if not stem:
        raise ValueError("Select vocal or background...")

    # The content hash keeps outputs of different uploads in separate folders.
    audio_id = str(get_hash(media_file))
    results = []
    started = time.time()

    if stem == "vocal":
        try:
            _, _, _, _, vocal_track = process_uvr_task(
                orig_song_path=media_file,
                song_id=audio_id+"mdx",
                main_vocals=main,
                dereverb=dereverb,
                remove_files_output_dir=False,
            )
            results.append(vocal_track)
        except Exception as error:
            gr.Info(str(error))
            logger.error(str(error))
    elif stem == "background":
        background_track, _ = process_uvr_task(
            orig_song_path=media_file,
            song_id=audio_id+"voiceless",
            only_voiceless=True,
            remove_files_output_dir=False,
        )
        results.append(background_track)

    execution_time = time.time() - started
    logger.info(f"Execution time: {execution_time} seconds")

    if not results:
        raise Exception("Error in sound separate")

    return results
668
+
669
+
670
def audio_conf():
    """Create the file component used to upload the input audio."""
    options = {
        "label": "Audio file",
        "type": "filepath",
        "container": True,
    }
    return gr.File(**options)
677
+
678
+
679
def stem_conf():
    """Create the radio selector for the stem, defaulting to "vocal"."""
    stems = ["vocal", "background"]
    return gr.Radio(choices=stems, value="vocal", label="Vocal")
686
+
687
+
688
def main_conf():
    """Create the initially unchecked "Main" checkbox."""
    checkbox = gr.Checkbox(False, label="Main")
    return checkbox
694
+
695
+
696
def dereverb_conf():
    """Create the initially unchecked, visible "Dereverb" checkbox."""
    checkbox = gr.Checkbox(False, label="Dereverb", visible=True)
    return checkbox
703
+
704
+
705
def button_conf():
    """Create the primary-styled "Inference" button."""
    button = gr.Button("Inference", variant="primary")
    return button
710
+
711
+
712
def output_conf():
    """Create the read-only, multi-file component showing the results."""
    options = {
        "label": "Result",
        "file_count": "multiple",
        "interactive": False,
    }
    return gr.File(**options)
718
+
719
+
720
def show_vocal_components(input_bool):
    """Return two visibility updates, visible only when the value is "vocal".

    Despite its name, ``input_bool`` is compared against the string
    ``"vocal"``.
    """
    is_vocal = input_bool == "vocal"
    first_update = gr.update(visible=is_vocal)
    second_update = gr.update(visible=is_vocal)
    return first_update, second_update
725
+
726
+
727
def get_gui(theme):
    """Assemble the Gradio Blocks application and return it.

    Args:
        theme: Theme passed straight to ``gr.Blocks``.
    """
    with gr.Blocks(theme=theme) as app:
        gr.Markdown(title)
        gr.Markdown(description)

        aud = audio_conf()

        with gr.Column(), gr.Row():
            stem_gui = stem_conf()

        with gr.Column(), gr.Row():
            main_gui = main_conf()
            dereverb_gui = dereverb_conf()

            # The two vocal-only options follow the stem selection.
            stem_gui.change(
                show_vocal_components,
                [stem_gui],
                [main_gui, dereverb_gui],
            )

        button_base = button_conf()
        output_base = output_conf()

        button_base.click(
            sound_separate,
            inputs=[aud, stem_gui, main_gui, dereverb_gui],
            outputs=[output_base],
        )

        example_row = ["./test.mp3", "vocal", False, False]
        gr.Examples(
            examples=[example_row],
            fn=sound_separate,
            inputs=[aud, stem_gui, main_gui, dereverb_gui],
            outputs=[output_base],
            cache_examples=False,
        )

    return app
785
+
786
+
787
if __name__ == "__main__":

    # Fetch every listed model into the models directory before starting.
    for id_model in UVR_MODELS:
        model_url = os.path.join(MDX_DOWNLOAD_LINK, id_model)
        download_manager(model_url, mdxnet_models_dir)

    app = get_gui(theme)
    app.queue(default_concurrency_limit=40)

    launch_options = dict(
        max_threads=40,
        share=False,
        show_error=True,
        quiet=False,
        debug=False,
    )
    app.launch(**launch_options)
805
+
mdx_models/data.json ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.035,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "0f2a6bc5b49d87d64728ee40e23bceb1": {
290
+ "compensate": 1.019,
291
+ "mdx_dim_f_set": 2560,
292
+ "mdx_dim_t_set": 8,
293
+ "mdx_n_fft_scale_set": 5120,
294
+ "primary_stem": "Instrumental"
295
+ },
296
+ "b02be2d198d4968a121030cf8950b492": {
297
+ "compensate": 1.020,
298
+ "mdx_dim_f_set": 2560,
299
+ "mdx_dim_t_set": 8,
300
+ "mdx_n_fft_scale_set": 5120,
301
+ "primary_stem": "No Crowd"
302
+ },
303
+ "2154254ee89b2945b97a7efed6e88820": {
304
+ "config_yaml": "model_2_stem_061321.yaml"
305
+ },
306
+ "063aadd735d58150722926dcbf5852a9": {
307
+ "config_yaml": "model_2_stem_061321.yaml"
308
+ },
309
+ "fe96801369f6a148df2720f5ced88c19": {
310
+ "config_yaml": "model3.yaml"
311
+ },
312
+ "02e8b226f85fb566e5db894b9931c640": {
313
+ "config_yaml": "model2.yaml"
314
+ },
315
+ "e3de6d861635ab9c1d766149edd680d6": {
316
+ "config_yaml": "model1.yaml"
317
+ },
318
+ "3f2936c554ab73ce2e396d54636bd373": {
319
+ "config_yaml": "modelB.yaml"
320
+ },
321
+ "890d0f6f82d7574bca741a9e8bcb8168": {
322
+ "config_yaml": "modelB.yaml"
323
+ },
324
+ "63a3cb8c37c474681049be4ad1ba8815": {
325
+ "config_yaml": "modelB.yaml"
326
+ },
327
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
328
+ "config_yaml": "modelA.yaml"
329
+ },
330
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
331
+ "config_yaml": "modelA.yaml"
332
+ },
333
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
334
+ "config_yaml": "modelA.yaml"
335
+ },
336
+ "c9971a18da20911822593dc81caa8be9": {
337
+ "config_yaml": "sndfx.yaml"
338
+ },
339
+ "57d94d5ed705460d21c75a5ac829a605": {
340
+ "config_yaml": "sndfx.yaml"
341
+ },
342
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
343
+ "config_yaml": "sndfx.yaml"
344
+ },
345
+ "104081d24e37217086ce5fde09147ee1": {
346
+ "config_yaml": "model_2_stem_061321.yaml"
347
+ },
348
+ "1e6165b601539f38d0a9330f3facffeb": {
349
+ "config_yaml": "model_2_stem_061321.yaml"
350
+ },
351
+ "fe0108464ce0d8271be5ab810891bd7c": {
352
+ "config_yaml": "model_2_stem_full_band.yaml"
353
+ }
354
+ }
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ soundfile
2
+ librosa
3
+ torch==2.2.0
test.mp3 ADDED
Binary file (51.9 kB). View file
 
utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, zipfile, shutil, subprocess, shlex, sys # noqa
2
+ from urllib.parse import urlparse
3
+ import re
4
+ import logging
5
+
6
+
7
+ def load_file_from_url(
8
+ url: str,
9
+ model_dir: str,
10
+ file_name: str | None = None,
11
+ overwrite: bool = False,
12
+ progress: bool = True,
13
+ ) -> str:
14
+ """Download a file from `url` into `model_dir`,
15
+ using the file present if possible.
16
+
17
+ Returns the path to the downloaded file.
18
+ """
19
+ os.makedirs(model_dir, exist_ok=True)
20
+ if not file_name:
21
+ parts = urlparse(url)
22
+ file_name = os.path.basename(parts.path)
23
+ cached_file = os.path.abspath(os.path.join(model_dir, file_name))
24
+
25
+ # Overwrite
26
+ if os.path.exists(cached_file):
27
+ if overwrite or os.path.getsize(cached_file) == 0:
28
+ remove_files(cached_file)
29
+
30
+ # Download
31
+ if not os.path.exists(cached_file):
32
+ logger.info(f'Downloading: "{url}" to {cached_file}\n')
33
+ from torch.hub import download_url_to_file
34
+
35
+ download_url_to_file(url, cached_file, progress=progress)
36
+ else:
37
+ logger.debug(cached_file)
38
+
39
+ return cached_file
40
+
41
+
42
def friendly_name(file: str):
    """Split a file path or URL into its base name and extension.

    For inputs starting with "http" only the path component of the URL is
    considered, so query strings and fragments are ignored.

    Returns:
        A `(name, extension)` tuple as produced by `os.path.splitext`;
        the extension keeps its leading dot and may be empty.
    """
    location = urlparse(file).path if file.startswith("http") else file
    return os.path.splitext(os.path.basename(location))
49
+
50
+
51
def download_manager(
    url: str,
    path: str,
    extension: str = "",
    overwrite: bool = False,
    progress: bool = True,
):
    """Fetch `url` into the directory `path` and return the local file path.

    The stored file keeps the name found in the URL; when `extension` is
    given (without a leading dot) it replaces the URL's own extension.
    `overwrite` and `progress` are forwarded to `load_file_from_url`.

    Inputs that do not start with "http" are not downloaded and `path`
    is returned unchanged.
    """
    url = url.strip()

    stem, suffix = friendly_name(url)
    target_name = stem + (f".{extension}" if extension else suffix)

    if not url.startswith("http"):
        # NOTE(review): this returns `path`, not the local `url` that was
        # passed in - confirm this is what callers expect.
        return path

    return load_file_from_url(
        url=url,
        model_dir=path,
        file_name=target_name,
        overwrite=overwrite,
        progress=progress,
    )
75
+
76
+
77
def remove_files(file_list):
    """Delete the given file(s), ignoring paths that do not exist.

    `file_list` may be a single path string or an iterable of paths.
    """
    targets = [file_list] if isinstance(file_list, str) else file_list

    for target in targets:
        if not os.path.exists(target):
            continue
        os.remove(target)
84
+
85
+
86
def remove_directory_contents(directory_path):
    """
    Removes all files and subdirectories within a directory.

    The directory itself is kept. Entries that cannot be deleted are
    logged and skipped, so one failure does not stop the clean-up.
    Symbolic links are unlinked rather than followed: `shutil.rmtree`
    refuses to operate on a symlink, and a dangling link is neither a
    file nor a directory, so both would otherwise be left behind.

    Parameters:
    directory_path (str): Path to the directory whose
    contents need to be removed.
    """
    if not os.path.exists(directory_path):
        logger.error(f"Directory '{directory_path}' does not exist.")
        return

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the failure and continue with the rest.
            logger.error(f"Failed to delete {file_path}. Reason: {e}")
    logger.info(f"Content in '{directory_path}' removed.")
107
+
108
+
109
+ # Create directory if not exists
110
def create_directories(directory_path):
    """Create every directory in `directory_path` that does not exist yet.

    `directory_path` may be a single path string or an iterable of paths.
    Paths that already exist are left untouched.
    """
    targets = (
        [directory_path] if isinstance(directory_path, str) else directory_path
    )
    for target in targets:
        if os.path.exists(target):
            continue
        os.makedirs(target)
        logger.debug(f"Directory '{target}' created.")
117
+
118
+
119
def setup_logger(name_log):
    """Return the logger named `name_log`, configured to write to stderr.

    The logger is set to INFO level, does not propagate to ancestor
    loggers, and every handler attached to it formats records as
    "[LEVEL] >> message".

    Calling this repeatedly with the same name is safe: the stream
    handler is only attached when the logger has no handler yet, so
    messages are not emitted once per call.
    """
    logger = logging.getLogger(name_log)
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        default_handler = logging.StreamHandler()  # Defaults to sys.stderr.
        default_handler.flush = sys.stderr.flush
        logger.addHandler(default_handler)

    logger.propagate = False

    formatter = logging.Formatter("[%(levelname)s] >> %(message)s")
    for handler in logger.handlers:
        handler.setFormatter(formatter)

    return logger
138
+
139
+
140
# Module-wide logger used by the helper functions in this file.
# setup_logger() already sets the level to INFO, so no extra setLevel()
# call is needed here.
logger = setup_logger("ss")
142
+