|
import json |
|
from tqdm import tqdm |
|
from copy import deepcopy |
|
|
|
import soundfile as sf |
|
import numpy as np |
|
import gradio as gr |
|
import torch |
|
|
|
import random |
|
# Seed the Python, PyTorch and NumPy random number generators with the same
# fixed value so that repeated runs of the app start from the same RNG state.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)
|
|
|
from util import print_size, sampling |
|
from network import CleanUNet |
|
import torchaudio |
|
import torchaudio.transforms as T |
|
|
|
# Sample rate (Hz) that input audio is resampled to and that the denoised
# output file is written with.
SAMPLE_RATE = 22050


def load_simple(filename):
    """Load an audio file and resample it to ``SAMPLE_RATE``.

    Parameters:
    filename: audio source handed straight to ``torchaudio.load``

    Returns:
    the loaded waveform after passing through a ``Resample`` transform
    configured from the file's own sample rate to ``SAMPLE_RATE``
    """
    waveform, source_rate = torchaudio.load(filename)
    # Build the resampler with the waveform's dtype so the transform and the
    # data it is applied to agree.
    to_target_rate = T.Resample(source_rate, SAMPLE_RATE, dtype=waveform.dtype)
    return to_target_rate(waveform)
|
|
|
# Location of the JSON experiment configuration and of the default checkpoint
# used by denoise().
CONFIG = "configs/DNS-large-full.json"
CHECKPOINT = "./exp/DNS-large-full/checkpoint/pretrained.pkl"

# Parse the configuration once, at import time. JSON text is UTF-8, so the
# encoding is given explicitly instead of depending on the platform locale.
with open(CONFIG, encoding="utf-8") as f:
    data = f.read()
config = json.loads(data)

# These are ordinary module-level names; a `global` declaration at module
# scope has no effect, so none is needed for functions below to read them.
gen_config = config["gen_config"]
network_config = config["network_config"]      # keyword arguments for CleanUNet
train_config = config["train_config"]          # provides "exp_path"
trainset_config = config["trainset_config"]
|
|
|
def denoise(filename, ckpt_path=CHECKPOINT, out="out.wav"):
    """
    Denoise one audio file and write the enhanced audio to disk

    Parameters:
    filename (str):   path of the noisy audio file to load
    ckpt_path (str):  checkpoint file to restore; it must contain a
                      'model_state_dict' entry
    out (str):        path the denoised audio is written to

    Returns:
    str: ``out``, the path of the file that was written
    """

    exp_path = train_config["exp_path"]
    print('exp_path:', exp_path)

    # Build the network from the parsed configuration and report its size.
    net = CleanUNet(**network_config)
    print_size(net)

    # Restore the weights onto the CPU and switch to inference mode.
    # NOTE(review): torch.load unpickles the file, so ckpt_path must only
    # ever point at a trusted checkpoint.
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    net.load_state_dict(checkpoint['model_state_dict'])
    net.eval()

    noisy_audio = load_simple(filename)

    # Inference only: no gradients are needed.
    with torch.no_grad(), torch.cuda.amp.autocast():
        generated_audio = sampling(net, noisy_audio)

    # Keep the first item of the result and flatten it to a 1-D array before
    # writing it out at the rate the input was resampled to.
    generated_audio = generated_audio[0].squeeze().cpu().numpy()
    sf.write(out, np.ravel(generated_audio), SAMPLE_RATE)

    return out
|
|
|
# Input widget: hands the uploaded/recorded audio to denoise() as a file path.
audio = gr.inputs.Audio(label="Audio to denoise", type='filepath')
inputs = [audio]

# Output widget: plays back the file whose path denoise() returns.
outputs = gr.outputs.Audio(label="Denoised audio", type='filepath')

title = "Speech Denoising in the Waveform Domain with Self-Attention from Nvidia"

# Assemble the web UI around denoise() and start serving it.
gr.Interface(
    fn=denoise,
    inputs=inputs,
    outputs=outputs,
    title=title,
    enable_queue=True,
).launch()