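# NOTE(review): the original listing began with the text
# "Spaces: Runtime error / Runtime error", which appears to be page-status
# residue captured with the code rather than part of the program.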
import torch | |
import torchaudio | |
import torch.nn.functional as F | |
from torchaudio.utils import download_asset | |
from pesq import pesq | |
from pystoi import stoi | |
import mir_eval | |
from pydub import AudioSegment | |
import matplotlib.pyplot as plt | |
import streamlit as st | |
from helper import plot_spectrogram,plot_mask,si_snr,generate_mixture,evaluate,get_irms | |
# SNR passed to generate_mixture when combining the uploaded speech with noise.
# NOTE(review): presumably expressed in dB -- confirm against helper.generate_mixture.
target_snr = 3

# STFT analysis parameters: FFT size and hop length (in samples).
N_FFT = 1024
N_HOP = 256

# Forward and inverse STFT. power=None makes Spectrogram return the complex
# spectrum, which InverseSpectrogram needs for reconstruction.
stft = torchaudio.transforms.Spectrogram(n_fft=N_FFT, hop_length=N_HOP, power=None)
istft = torchaudio.transforms.InverseSpectrogram(
    n_fft=N_FFT,
    hop_length=N_HOP,
)

# Power-spectral-density estimator and the Souden MVDR beamformer.
psd_transform = torchaudio.transforms.PSD()
mvdr_transform = torchaudio.transforms.SoudenMVDR()

# Index of the microphone channel used as the beamformer reference.
REFERENCE_CHANNEL = 0

# Noise recording fetched from the torchaudio tutorial assets; it is mixed
# into the uploaded speech and also used to derive the noise mask.
SAMPLE_NOISE = download_asset("tutorial-assets/mvdr/noise.wav")
waveform_noise, sr2 = torchaudio.load(SAMPLE_NOISE)
waveform_noise = waveform_noise.to(torch.double)
stft_noise = stft(waveform_noise)
def ui():
    """Render the Streamlit speech-enhancement page.

    Flow:
      1. Let the user upload a WAV file and play it back.
      2. Mix it with the module-level noise waveform at ``target_snr`` via
         ``generate_mixture``; show the mixture's spectrogram and play it.
      3. Derive speech/noise masks with ``get_irms``, estimate the PSD
         matrices, apply the Souden MVDR beamformer, and show/play the
         enhanced result.

    Returns nothing. Side effects: writes ``./waveform_mix.wav`` and
    ``./waveform_souden.wav`` in the current working directory.
    """
    st.title("Speech Enhancer")
    st.markdown("Made by Vageesh")

    # Audio uploader; nothing else is rendered until a file is provided.
    audio_file = st.file_uploader("Upload an audio file in wav format", type=["wav"])
    if audio_file is None:
        return

    waveform_clean, sr = torchaudio.load(audio_file)
    waveform_clean = waveform_clean.to(torch.double)
    stft_clean = stft(waveform_clean)
    st.text("Your uploaded audio")
    st.audio(audio_file)

    # Create the noisy mixture of the uploaded audio and the noise sample.
    waveform_mix = generate_mixture(waveform_clean, waveform_noise, target_snr)
    waveform_mix = waveform_mix.to(torch.double)
    stft_mix = stft(waveform_mix)

    # Show the mixture spectrogram. The keyword is `caption`; the previous
    # `captions=` is not a parameter of st.image and raised a TypeError.
    spec_img = plot_spectrogram(stft_mix)
    st.image(spec_img, caption='Spectrogram of Mixture Speech (dB)')

    # NOTE(review): cast to float32 only for writing -- not every
    # torchaudio.save backend accepts float64 tensors; confirm against the
    # backend used in deployment.
    torchaudio.save("./waveform_mix.wav", waveform_mix.to(torch.float32), sr)
    st.audio("./waveform_mix.wav")

    # Speech and noise masks, then the corresponding PSD matrices.
    irm_speech, irm_noise = get_irms(stft_clean, stft_noise)
    psd_speech = psd_transform(stft_mix, irm_speech)
    psd_noise = psd_transform(stft_mix, irm_noise)

    # Beamform and go back to the time domain at the mixture's length.
    stft_souden = mvdr_transform(
        stft_mix, psd_speech, psd_noise, reference_channel=REFERENCE_CHANNEL
    )
    waveform_souden = istft(stft_souden, length=waveform_mix.shape[-1])

    # Show and play the enhanced audio. torchaudio.save expects a 2D tensor,
    # hence the reshape to a single row.
    spec_clean_img = plot_spectrogram(stft_souden)
    waveform_souden = waveform_souden.reshape(1, -1)
    # This image is the beamformer output, so it is labelled as enhanced
    # speech rather than reusing the mixture caption.
    st.image(spec_clean_img, caption='Spectrogram of Enhanced Speech (dB)')
    torchaudio.save("./waveform_souden.wav", waveform_souden.to(torch.float32), sr)
    st.audio("./waveform_souden.wav")
# Build the page only when this file is run as the entry script.
if __name__ == "__main__":
    ui()