import torch import torchaudio.functional as F import matplotlib.pyplot as plt from pesq import pesq from pystoi import stoi import mir_eval REFERENCE_CHANNEL = 0 def plot_spectrogram(stft, title="Spectrogram", xlim=None): magnitude = stft.abs() spectrogram = 20 * torch.log10(magnitude + 1e-8).numpy() # figure, axis = plt.subplots(1, 1) # img = axis.imshow(spectrogram, cmap="viridis", vmin=-100, vmax=0, origin="lower", aspect="auto") # figure.suptitle(title) # plt.colorbar(img, ax=axis) # plt.show() def plot_mask(mask, title="Mask", xlim=None): mask = mask.numpy() figure, axis = plt.subplots(1, 1) img = axis.imshow(mask, cmap="viridis", origin="lower", aspect="auto") figure.suptitle(title) plt.colorbar(img, ax=axis) plt.show() def si_snr(estimate, reference, epsilon=1e-8): estimate = estimate - estimate.mean() reference = reference - reference.mean() reference_pow = reference.pow(2).mean(axis=1, keepdim=True) mix_pow = (estimate * reference).mean(axis=1, keepdim=True) scale = mix_pow / (reference_pow + epsilon) reference = scale * reference error = estimate - reference reference_pow = reference.pow(2) error_pow = error.pow(2) reference_pow = reference_pow.mean(axis=1) error_pow = error_pow.mean(axis=1) si_snr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow) return si_snr.item() # def generate_mixture(waveform_clean, waveform_noise, target_snr): # power_clean_signal = waveform_clean.pow(2).mean() # power_noise_signal = waveform_noise.pow(2).mean() # current_snr = 10 * torch.log10(power_clean_signal / power_noise_signal) # waveform_noise *= 10 ** (-(target_snr - current_snr) / 20) # return waveform_clean + waveform_noise def generate_mixture(waveform_clean, waveform_noise, target_snr): if waveform_clean.size(1) > waveform_noise.size(1): waveform_noise = F.pad(waveform_noise, (0, waveform_clean.size(1) - waveform_noise.size(1))) elif waveform_noise.size(1) > waveform_clean.size(1): waveform_clean = F.pad(waveform_clean, (0, waveform_noise.size(1) - waveform_clean.size(1))) power_clean_signal = waveform_clean.pow(2).mean() power_noise_signal = waveform_noise.pow(2).mean() current_snr = 10 * torch.log10(power_clean_signal / power_noise_signal) waveform_noise *= 10 ** (-(target_snr - current_snr) / 20) return waveform_clean + waveform_noise def evaluate(estimate, reference): si_snr_score = si_snr(estimate, reference) ( sdr, _, _, _, ) = mir_eval.separation.bss_eval_sources(reference.numpy(), estimate.numpy(), False) pesq_mix = pesq(SAMPLE_RATE, estimate[0].numpy(), reference[0].numpy(), "wb") stoi_mix = stoi(reference[0].numpy(), estimate[0].numpy(), SAMPLE_RATE, extended=False) print(f"SDR score: {sdr[0]}") print(f"Si-SNR score: {si_snr_score}") print(f"PESQ score: {pesq_mix}") print(f"STOI score: {stoi_mix}") def get_irms(stft_clean, stft_noise): mag_clean = stft_clean.abs() ** 2 mag_noise = stft_noise.abs() ** 2 irm_speech = mag_clean / (mag_clean + mag_noise) irm_noise = mag_noise / (mag_clean + mag_noise) return irm_speech[REFERENCE_CHANNEL], irm_noise[REFERENCE_CHANNEL]