Spaces:

Awell00
/

music_drums_separation

Running on Zero

App Files Files Community

Awell00 committed on Aug 25, 2024

Commit

10f7cab

verified ·

1 Parent(s): 46cbed5

feat: upload configs files for inference

Browse files

Files changed (5) hide show

configs/config_bs_roformer_instrumental.yaml +138 -0
configs/config_htdemucs_bass.yaml +119 -0
configs/config_mel_band_roformer_denoise.yaml +72 -0
configs/config_mel_band_roformer_vocals.yaml +72 -0
configs/config_scnet_other.yaml +83 -0

configs/config_bs_roformer_instrumental.yaml ADDED Viewed

	@@ -0,0 +1,138 @@

+# Audio settings for the BS-Roformer instrumental config.
+audio:
+  sample_rate: 44100
+  num_channels: 2
+  n_fft: 2048
+  hop_length: 512
+  chunk_size: 131584  # = 257 * 512 (257 hops of hop_length)
+  dim_f: 1024
+  dim_t: 256
+  min_mean_abs: 0.001
+# BS-Roformer network hyper-parameters.
+model:
+  dim: 384
+  depth: 12
+  num_stems: 1
+  stereo: true
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  linear_transformer_depth: 0
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0.1
+  ff_dropout: 0.1
+  flash_attn: true
+  mask_estimator_depth: 2
+  # STFT parameters.
+  stft_n_fft: 2048
+  stft_hop_length: 512
+  stft_win_length: 2048
+  stft_normalized: false
+  dim_freqs_in: 1025
+  # 62 band widths, listed in order and grouped by width below.
+  # They add up to 1025, the same number as dim_freqs_in.
+  freqs_per_bands: !!python/tuple [
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 24 bands of width 2 (this row and the next)
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  # 12 bands of width 4
+    12, 12, 12, 12, 12, 12, 12, 12,  # 8 bands of width 12
+    24, 24, 24, 24, 24, 24, 24, 24,  # 8 bands of width 24
+    48, 48, 48, 48, 48, 48, 48, 48,  # 8 bands of width 48
+    128, 129  # last two bands
+    ]
+  # Multi-resolution STFT loss settings.
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple [4096, 2048, 1024, 512, 256]
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+# Training hyper-parameters.
+training:
+  batch_size: 4
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  num_epochs: 1000
+  num_steps: 1000
+  instruments: [vocals, other]
+  target_instrument: other
+  optimizer: adam
+  lr: 5.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  # Needed for checking on the multisong dataset whether "other" is actually instrumental.
+  other_fix: false
+  # Enables or disables mixed precision (float16); usually it must be true.
+  use_amp: true
+# Data augmentation settings.
+augmentations:
+  # Master switch: enables or disables all augmentations at once.
+  enable: true
+  # Randomly change the loudness of each stem within (loudness_min; loudness_max).
+  loudness: true
+  loudness_min: 0.5
+  loudness_max: 1.5
+  # Mix several stems of the same type with some probability
+  # (only works for dataset types: 1, 2, 3).
+  mixup: true
+  # 2 additional stems of the same type: 1st with prob 0.2, 2nd with prob 0.02.
+  mixup_probs: !!python/tuple [0.2, 0.02]
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+# Inference settings for the BS-Roformer instrumental config.
+inference:
+  num_overlap: 2
+  dim_t: 512
+  batch_size: 8

configs/config_htdemucs_bass.yaml ADDED Viewed

	@@ -0,0 +1,119 @@

+# Audio settings for the HTDemucs config.
+audio:
+  hop_length: 1024
+  min_mean_abs: 0.001
+  # samplerate * segment (44100 * 11, both set in the training section).
+  chunk_size: 485100
+# Training hyper-parameters.
+training:
+  batch_size: 8
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  num_epochs: 1000
+  num_steps: 1000
+  # Audio framing used during training.
+  samplerate: 44100
+  channels: 2
+  segment: 11
+  shift: 1
+  normalize: true
+  instruments:
+    - drums
+    - bass
+    - other
+    - vocals
+  target_instrument: null
+  optimizer: adam
+  lr: 9.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  # Needed for checking on the multisong dataset whether "other" is actually instrumental.
+  other_fix: false
+  # Enables or disables mixed precision (float16); usually it must be true.
+  use_amp: true
+# Data augmentation settings.
+augmentations:
+  # Master switch: enables or disables all augmentations at once.
+  enable: true
+  # Randomly change the loudness of each stem within (loudness_min; loudness_max).
+  loudness: true
+  loudness_max: 1.5
+  loudness_min: 0.5
+# Inference settings for the HTDemucs config.
+inference:
+  batch_size: 8
+  num_overlap: 4
+# Model selector followed by the HTDemucs hyper-parameters.
+# Scalars here are written canonically so every YAML loader agrees on the type:
+# lowercase true/false, explicit null for unset values, and floats with a
+# mantissa dot.
+model: htdemucs
+htdemucs:  # see demucs/htdemucs.py for a detailed description
+  # Channels
+  channels: 48
+  channels_time: null  # explicit null; identical to the former bare "channels_time:"
+  growth: 2
+  # STFT
+  num_subbands: 1
+  nfft: 4096
+  wiener_iters: 0
+  end_iters: 0
+  wiener_residual: false
+  cac: true
+  # Main structure
+  depth: 4
+  rewrite: true
+  # Frequency Branch
+  multi_freqs: []
+  multi_freqs_depth: 3
+  freq_emb: 0.2
+  emb_scale: 10
+  emb_smooth: true
+  # Convolutions
+  kernel_size: 8
+  stride: 4
+  time_stride: 2
+  context: 1
+  context_enc: 0
+  # normalization
+  norm_starts: 4
+  norm_groups: 4
+  # DConv residual branch
+  dconv_mode: 3
+  dconv_depth: 2
+  dconv_comp: 8
+  # Written as 1.0e-3 rather than 1e-3: YAML 1.1 loaders such as PyYAML only
+  # resolve a float when the mantissa has a dot, and read a bare 1e-3 as a string.
+  dconv_init: 1.0e-3
+  # Before the Transformer
+  bottom_channels: 512
+  # CrossTransformer
+  # ------ Common to all
+  # Regular parameters
+  t_layers: 5
+  t_hidden_scale: 4.0
+  t_heads: 8
+  t_dropout: 0.0
+  t_layer_scale: true
+  t_gelu: true
+  # ------------- Positional Embedding
+  t_emb: sin
+  t_max_positions: 10000  # for the scaled embedding
+  t_max_period: 10000.0
+  t_weight_pos_embed: 1.0
+  t_cape_mean_normalize: true
+  t_cape_augment: true
+  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
+  t_sin_random_shift: 0
+  # ------------- norm before a transformer encoder
+  t_norm_in: true
+  t_norm_in_group: false
+  # ------------- norm inside the encoder
+  t_group_norm: false
+  t_norm_first: true
+  t_norm_out: true
+  # ------------- optim
+  t_weight_decay: 0.0
+  t_lr: null  # explicit null; identical to the former bare "t_lr:"
+  # ------------- sparsity
+  t_sparse_self_attn: false
+  t_sparse_cross_attn: false
+  t_mask_type: diag
+  t_mask_random_seed: 42
+  t_sparse_attn_window: 400
+  t_global_window: 100
+  t_sparsity: 0.95
+  t_auto_sparsity: false
+  # Cross Encoder First (False)
+  t_cross_first: false
+  # Weight init
+  rescale: 0.1

configs/config_mel_band_roformer_denoise.yaml ADDED Viewed

	@@ -0,0 +1,72 @@

+# Audio settings for the Mel-Band-Roformer denoise config.
+audio:
+  chunk_size: 352800  # = 8 * 44100 = 800 * 441 (sample_rate and hop_length below)
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 441
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  # Was "000": a leading-zero integer, which YAML 1.1 loaders treat as octal.
+  # The value is still zero, now written as a float like the sibling
+  # vocals and scnet configs.
+  min_mean_abs: 0.000
+# Mel-Band-Roformer network hyper-parameters (denoise config).
+model:
+  dim: 384
+  depth: 6
+  num_stems: 1
+  stereo: true
+  num_bands: 60
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  mask_estimator_depth: 2
+  dim_freqs_in: 1025
+  # Needed for the mel filter bank from librosa.
+  sample_rate: 44100
+  # STFT parameters.
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  # Multi-resolution STFT loss settings.
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple [4096, 2048, 1024, 512, 256]
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+# Training hyper-parameters (denoise config).
+training:
+  batch_size: 2
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  num_epochs: 1000
+  num_steps: 4032
+  instruments: [dry, other]
+  target_instrument: dry
+  optimizer: adam
+  lr: 1.0e-05
+  patience: 8
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: false
+  ema_momentum: 0.999
+  # Enable augmentations by audiomentations and pedalboard.
+  augmentation: false
+  augmentation_type: null
+  # Deprecated.
+  use_mp3_compress: false
+  # Mix several stems of the same type with some probability.
+  augmentation_mix: false
+  # Randomly change the loudness of each stem.
+  augmentation_loudness: false
+  # Type 1 or 2.
+  augmentation_loudness_type: 1
+  augmentation_loudness_min: 0
+  augmentation_loudness_max: 0
+  # Needed for checking on the multisong dataset whether "other" is actually instrumental.
+  other_fix: false
+  use_amp: true
+# Inference settings for the denoise config.
+inference:
+  num_overlap: 4
+  dim_t: 256
+  batch_size: 2

configs/config_mel_band_roformer_vocals.yaml ADDED Viewed

	@@ -0,0 +1,72 @@

+# Audio settings for the Mel-Band-Roformer vocals config.
+audio:
+  sample_rate: 44100
+  num_channels: 2
+  n_fft: 2048
+  hop_length: 441
+  chunk_size: 352800  # = 8 * 44100 = 800 * 441
+  dim_f: 1024
+  dim_t: 256
+  min_mean_abs: 0.000
+# Mel-Band-Roformer network hyper-parameters (vocals config).
+model:
+  dim: 384
+  depth: 6
+  num_stems: 1
+  stereo: true
+  num_bands: 60
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  mask_estimator_depth: 2
+  dim_freqs_in: 1025
+  # Needed for the mel filter bank from librosa.
+  sample_rate: 44100
+  # STFT parameters.
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  # Multi-resolution STFT loss settings.
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple [4096, 2048, 1024, 512, 256]
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+# Training hyper-parameters (vocals config).
+training:
+  batch_size: 4
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  num_epochs: 1000
+  num_steps: 1000
+  instruments: [vocals, other]
+  target_instrument: vocals
+  optimizer: adam
+  lr: 1.0e-05
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: false
+  ema_momentum: 0.999
+  # Enable augmentations by audiomentations and pedalboard.
+  augmentation: false
+  augmentation_type: null
+  # Deprecated.
+  use_mp3_compress: false
+  # Mix several stems of the same type with some probability.
+  augmentation_mix: false
+  # Randomly change the loudness of each stem.
+  augmentation_loudness: false
+  # Type 1 or 2.
+  augmentation_loudness_type: 1
+  augmentation_loudness_min: 0
+  augmentation_loudness_max: 0
+  # Needed for checking on the multisong dataset whether "other" is actually instrumental.
+  other_fix: true
+  # Enables or disables mixed precision (float16); usually it must be true.
+  use_amp: true
+# Inference settings for the vocals config.
+inference:
+  num_overlap: 2
+  dim_t: 256
+  batch_size: 4

configs/config_scnet_other.yaml ADDED Viewed

	@@ -0,0 +1,83 @@

+# Audio settings for the SCNet config.
+audio:
+  sample_rate: 44100
+  num_channels: 2
+  chunk_size: 485100  # 44100 * 11
+  min_mean_abs: 0.000
+# SCNet network hyper-parameters.
+model:
+  sources: [drums, bass, other, vocals]
+  audio_channels: 2
+  dims: [4, 32, 64, 128]
+  # STFT parameters.
+  nfft: 4096
+  win_size: 4096
+  hop_size: 1024
+  normalized: true
+  # Per-band settings; each list below has three entries.
+  band_SR: [0.175, 0.392, 0.433]  # the three values sum to 1.0
+  band_stride: [1, 4, 16]
+  band_kernel: [3, 4, 16]
+  conv_depths: [3, 2, 1]
+  compress: 4
+  conv_kernel: 3
+  num_dplayer: 6
+  expand: 1
+# Training hyper-parameters (SCNet config).
+training:
+  batch_size: 10
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  num_epochs: 1000
+  num_steps: 1000
+  instruments: [drums, bass, other, vocals]
+  target_instrument: null
+  optimizer: adam
+  lr: 5.0e-04
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  # Needed for checking on the multisong dataset whether "other" is actually instrumental.
+  other_fix: false
+  # Enables or disables mixed precision (float16); usually it must be true.
+  use_amp: true
+# Data augmentation settings (SCNet config).
+augmentations:
+  # Master switch: enables or disables all augmentations at once.
+  enable: true
+  # Randomly change the loudness of each stem within (loudness_min; loudness_max).
+  loudness: true
+  loudness_min: 0.5
+  loudness_max: 1.5
+  # Mix several stems of the same type with some probability
+  # (only works for dataset types: 1, 2, 3).
+  mixup: true
+  # 2 additional stems of the same type: 1st with prob 0.2, 2nd with prob 0.02.
+  mixup_probs: !!python/tuple [0.2, 0.02]
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+# Inference settings for the SCNet config.
+inference:
+  normalize: true
+  num_overlap: 4
+  dim_t: 256
+  batch_size: 8