# NOTE: the lines below were non-Python page residue (Hugging Face Spaces
# metadata: "Spaces: / Runtime error / File size: 2,760 Bytes / 1f9348b /
# line-number gutter") captured by the scrape; commented out so the module parses.
import torch
import torchaudio
import torchaudio.functional as F
from torchaudio.utils import download_asset
from pesq import pesq
from pystoi import stoi
import mir_eval
from pydub import AudioSegment
import matplotlib.pyplot as plt
import streamlit as st
from helper import plot_spectrogram,plot_mask,si_snr,generate_mixture,evaluate,get_irms
# SNR (in dB) at which the uploaded clean speech is mixed with the noise sample.
target_snr=3
#parameters for STFT
N_FFT = 1024
N_HOP = 256
# Forward STFT; power=None keeps the complex spectrogram (magnitude + phase),
# which the MVDR beamformer and the inverse STFT below both require.
stft = torchaudio.transforms.Spectrogram(
    n_fft=N_FFT,
    hop_length=N_HOP,
    power=None,
)
# Inverse STFT with matching n_fft/hop_length to reconstruct time-domain audio.
istft = torchaudio.transforms.InverseSpectrogram(n_fft=N_FFT, hop_length=N_HOP)
#defining a psd transform
# Power-spectral-density matrix estimator; fed with time-frequency masks in ui().
psd_transform = torchaudio.transforms.PSD()
# Souden-variant MVDR beamformer (takes speech/noise PSD matrices).
mvdr_transform = torchaudio.transforms.SoudenMVDR()
#defining the reference microphone
REFERENCE_CHANNEL = 0
#creating a random noise for better calculations
# Download a fixed noise recording once at import time; presumably multi-channel
# (MVDR needs multiple mics) — TODO confirm against the tutorial asset.
SAMPLE_NOISE = download_asset("tutorial-assets/mvdr/noise.wav")
waveform_noise, sr2 = torchaudio.load(SAMPLE_NOISE)
# double precision to match the PSD/MVDR math applied in ui()
waveform_noise = waveform_noise.to(torch.double)
stft_noise = stft(waveform_noise)
def ui():
    """Render the Streamlit speech-enhancer page.

    Flow: the user uploads a clean wav file; it is mixed with the
    module-level noise sample at ``target_snr`` dB; the mixture is
    enhanced with Souden MVDR beamforming; spectrograms and audio
    players are shown for the mixture and the enhanced result.

    Uses module-level state: ``stft``, ``istft``, ``psd_transform``,
    ``mvdr_transform``, ``REFERENCE_CHANNEL``, ``waveform_noise``,
    ``stft_noise``, ``target_snr``.
    """
    st.title("Speech Enhancer")
    st.markdown("Made by Vageesh")
    #making an audio developer uploader:
    audio_file = st.file_uploader("Upload an audio file in wav format", type=["wav"])
    if audio_file is not None:
        waveform_clean, sr = torchaudio.load(audio_file)
        # FIX: original converted `waveform_mix`, which is not defined until
        # later in this function — that raised NameError on every upload.
        waveform_clean = waveform_clean.to(torch.double)
        stft_clean = stft(waveform_clean)
        st.text("Your uploaded audio")
        # FIX: st.audio requires sample_rate when given a raw array; pass a
        # numpy array plus the file's sample rate instead of a bare tensor.
        st.audio(waveform_clean.numpy(), sample_rate=sr)
        #creating a mixture of our audio file and the noise file
        waveform_mix = generate_mixture(waveform_clean, waveform_noise, target_snr)
        #making the files into torch double format
        waveform_mix = waveform_mix.to(torch.double)
        #computing STFT
        stft_mix = stft(waveform_mix)
        #plotting the spectogram
        spec_img = plot_spectrogram(stft_mix)
        # FIX: the st.image keyword is `caption`, not `captions`.
        st.image(spec_img, caption='Spectrogram of Mixture Speech (dB)')
        #showing mixed audio in streamlit
        st.audio(waveform_mix.numpy(), sample_rate=sr)
        # ideal ratio masks separating speech bins from noise bins
        irm_speech, irm_noise = get_irms(stft_clean, stft_noise)
        # masked PSD matrices for speech and noise drive the MVDR solution
        psd_speech = psd_transform(stft_mix, irm_speech)
        psd_noise = psd_transform(stft_mix, irm_noise)
        stft_souden = mvdr_transform(stft_mix, psd_speech, psd_noise,
                                     reference_channel=REFERENCE_CHANNEL)
        # reconstruct to the exact length of the mixture
        waveform_souden = istft(stft_souden, length=waveform_mix.shape[-1])
        #plotting the cleaned audio and hearing it
        spec_clean_img = plot_spectrogram(stft_souden)
        waveform_souden = waveform_souden.reshape(1, -1)
        # FIX: this spectrogram is the enhanced output (caption said "Mixture"),
        # and the keyword is `caption`, not `captions`.
        st.image(spec_clean_img, caption='Spectrogram of Enhanced Speech (dB)')
        st.audio(waveform_souden.numpy(), sample_rate=sr)
# (stray line-gutter character from the page scrape removed)