Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,670 Bytes
10f7cab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# Audio settings. The keys below are nested under `audio` so they do not
# leak into the top-level mapping.
audio:
  chunk_size: 485100  # samplerate * segment (44100 * 11)
  min_mean_abs: 0.001
  hop_length: 1024
# Training settings. Nested under `training` so that generic keys such as
# `batch_size` and `channels` do not clash with same-named keys elsewhere
# in this file.
training:
  batch_size: 8
  gradient_accumulation_steps: 1
  grad_clip: 0
  segment: 11
  shift: 1
  samplerate: 44100
  channels: 2
  normalize: true
  instruments: ['drums', 'bass', 'other', 'vocals']
  target_instrument: null
  num_epochs: 1000
  num_steps: 1000
  optimizer: adam
  lr: 9.0e-05
  patience: 2
  reduce_factor: 0.95
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  # Needed when checking on a multisong dataset whether "other" is actually
  # instrumental.
  other_fix: false
  # Enable or disable mixed precision (float16); usually this must be true.
  use_amp: true
# Augmentation settings, nested under `augmentations`.
augmentations:
  # Master switch for all augmentations (quick way to turn them all off).
  enable: true
  # Randomly change the loudness of each stem within
  # (loudness_min; loudness_max).
  loudness: true
  loudness_min: 0.5
  loudness_max: 1.5
# Inference settings. `batch_size` here is nested so it stays separate from
# the `batch_size` listed in the training section.
inference:
  num_overlap: 4
  batch_size: 8
# Model name; the value matches the `htdemucs` section that follows.
model: htdemucs
# Hyperparameters for the htdemucs model; see demucs/htdemucs.py for a
# detailed description. Everything below is nested under `htdemucs` so that
# `channels` here does not clash with the `channels` key listed in the
# training section.
htdemucs:
  # Channels
  channels: 48
  channels_time: null  # left unset
  growth: 2
  # STFT
  num_subbands: 1
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 4
  rewrite: true
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 3
  dconv_depth: 2
  dconv_comp: 8
  # Written with a decimal point so YAML 1.1 loaders read a float rather
  # than a string.
  dconv_init: 1.0e-3
  # Before the Transformer
  bottom_channels: 512
  # CrossTransformer
  # ------ Common to all
  # Regular parameters
  t_layers: 5
  t_hidden_scale: 4.0
  t_heads: 8
  t_dropout: 0.0
  t_layer_scale: true
  t_gelu: true
  # ------------- Positional Embedding
  t_emb: sin
  t_max_positions: 10000  # for the scaled embedding
  t_max_period: 10000.0
  t_weight_pos_embed: 1.0
  t_cape_mean_normalize: true
  t_cape_augment: true
  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
  t_sin_random_shift: 0
  # ------------- norm before a transformer encoder
  t_norm_in: true
  t_norm_in_group: false
  # ------------- norm inside the encoder
  t_group_norm: false
  t_norm_first: true
  t_norm_out: true
  # ------------- optim
  t_weight_decay: 0.0
  t_lr: null  # left unset
  # ------------- sparsity
  t_sparse_self_attn: false
  t_sparse_cross_attn: false
  t_mask_type: diag
  t_mask_random_seed: 42
  t_sparse_attn_window: 400
  t_global_window: 100
  t_sparsity: 0.95
  t_auto_sparsity: false
  # Cross Encoder First (False)
  t_cross_first: false
  # Weight init
  rescale: 0.1
|