"""
DiffSVC architecture with WaveNet denoiser and NSF-HiFiGAN vocoder.

Compared to v1, this version:
- No longer needs spec stats.
- Adds a dilation cycle to the WaveNet denoiser.
- Uses the log10 mel spectrogram.
- Better matches the DiffSinger architecture.
"""
from fish_diffusion.utils.pitch import pitch_to_scale | |
# Shared hyper-parameters, referenced by several sub-configs of `model` below.

# Audio sampling rate handed to the vocoder (presumably in Hz).
sampling_rate = 44_100
# Number of mel bins; used by the diffusion wrapper, its denoiser and the vocoder.
mel_channels = 128
# Output width of every feature encoder and the denoiser's `d_encoder`.
hidden_size = 256
# Top-level model description. Every nested dict carries a `type` key naming
# the component to build; the remaining keys are that component's arguments.
model = dict(
    type="DiffSVC",
    # Diffusion process wrapped around the mel-spectrogram denoiser.
    diffusion=dict(
        type="GaussianDiffusion",
        mel_channels=mel_channels,
        noise_schedule="linear",
        timesteps=1000,
        max_beta=0.01,
        # NOTE(review): `s` looks like a cosine-schedule offset and is
        # presumably unused with the linear schedule -- confirm.
        s=0.008,
        noise_loss="l1",
        denoiser=dict(
            type="WaveNetDenoiser",
            mel_channels=mel_channels,
            # Must match the encoders' `output_size` (all use `hidden_size`).
            d_encoder=hidden_size,
            residual_channels=512,
            residual_layers=20,
            dilation_cycle=4,
            use_linear_bias=True,
        ),
        sampler_interval=10,
        # Spectrogram value range; presumably in log10-mel units, in line
        # with the module docstring -- verify against the dataset.
        spec_min=[-5],
        spec_max=[0],
    ),
    # Projects content features (256-dim input) to `hidden_size`.
    text_encoder=dict(
        type="NaiveProjectionEncoder",
        input_size=256,
        output_size=hidden_size,
    ),
    # Embedding-based encoder; `input_size=10` is presumably the number of
    # supported speaker ids -- TODO confirm.
    speaker_encoder=dict(
        type="NaiveProjectionEncoder",
        input_size=10,
        output_size=hidden_size,
        use_embedding=True,
    ),
    # Scalar pitch input, run through `pitch_to_scale` before projection.
    pitch_encoder=dict(
        type="NaiveProjectionEncoder",
        input_size=1,
        output_size=hidden_size,
        use_embedding=False,
        preprocessing=pitch_to_scale,
    ),
    # Vocoder loaded from a local checkpoint path; natural-log mels are
    # disabled here.
    vocoder=dict(
        type="NsfHifiGAN",
        checkpoint_path="checkpoints/nsf_hifigan/model",
        sampling_rate=sampling_rate,
        mel_channels=mel_channels,
        use_natural_log=False,
    ),
)