"""Gradio demo for speaker verification: spafe acoustic features + a pickled LDA embedder."""

import pickle

import gradio as gr
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from spafe.frequencies import dominant_frequencies
from spafe.features.bfcc import bfcc
from spafe.features.cqcc import cqcc
from spafe.features.gfcc import erb_spectrogram
from spafe.features.lfcc import linear_spectrogram
from spafe.features.mfcc import mfcc
from spafe.features.msrcc import msrcc
from spafe.features.ngcc import ngcc
from spafe.utils.preprocessing import SlidingWindow


def dominant_freq_density(min_dom_freq, max_dom_freq, signal, sr):
    """Histogram density of the signal's dominant frequencies within [min_dom_freq, max_dom_freq]."""
    dom_f = dominant_frequencies.get_dominant_frequencies(signal, sr, nfft=512, butter_filter=True)
    dom_f = dom_f[(dom_f > min_dom_freq) & (dom_f < max_dom_freq)]
    # 100 Hz-wide bins; density=True normalizes the counts to a probability density.
    hist, _ = np.histogram(dom_f, bins=range(min_dom_freq, max_dom_freq, 100), density=True)
    return hist


def dominant_freq(x):
    """Dominant-frequency density between 100 Hz and 1 kHz."""
    return dominant_freq_density(100, 1000, x['y'], x['sr'])


def apply_mfcc(x):
    """Time-averaged MFCCs; NaN/inf frames are zeroed before averaging."""
    feats = mfcc(
        x['y'], fs=x['sr'], pre_emph=1, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=128, nfft=512, low_freq=50, high_freq=4000, normalize="mvn",
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_bfcc(x):
    """Time-averaged bark-frequency cepstral coefficients."""
    feats = bfcc(
        x['y'], fs=x['sr'], pre_emph=1, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=128, nfft=512, low_freq=50, high_freq=4000, normalize="mvn",
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_cqcc(x):
    """Time-averaged constant-Q cepstral coefficients."""
    feats = cqcc(
        x['y'], fs=x['sr'], pre_emph=True, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfft=512, low_freq=0, high_freq=None,
        number_of_octaves=7, number_of_bins_per_octave=24,
        spectral_threshold=0.005, f0=120, q_rate=1.0,
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_gfcc(x):
    """Time-averaged gammatone/ERB spectrogram (not cepstra, despite the name)."""
    # erb_spectrogram returns (features, fourier_transform); keep the features.
    feats, _ = erb_spectrogram(
        x['y'], fs=x['sr'], pre_emph=True, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=24, nfft=512, low_freq=0, high_freq=None,
        scale='constant', fbanks=None, conversion_approach='Glasberg',
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_lfcc(x):
    """Time-averaged linear-filter-bank spectrogram (not cepstra, despite the name)."""
    feats, _ = linear_spectrogram(
        x['y'], fs=x['sr'], pre_emph=True, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=24, nfft=512, low_freq=0, high_freq=None,
        scale='constant', fbanks=None,
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_msrcc(x):
    """Time-averaged magnitude-based spectral root cepstral coefficients."""
    feats = msrcc(
        x['y'], fs=x['sr'], num_ceps=13, pre_emph=True, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=24, nfft=512, low_freq=0, high_freq=None,
        scale='ascendant', gamma=-0.14285714285714285,  # i.e. -1/7
        dct_type=2, use_energy=False, lifter=None, normalize=None,
        fbanks=None, conversion_approach='Oshaghnessy',
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def apply_ngcc(x):
    """Time-averaged normalized gammachirp cepstral coefficients."""
    feats = ngcc(
        x['y'], fs=x['sr'], num_ceps=13, pre_emph=True, pre_emph_coeff=0.97,
        window=SlidingWindow(0.03, 0.015, "hamming"),
        nfilts=24, nfft=512, low_freq=0, high_freq=None,
        scale='constant', dct_type=2, use_energy=False,
        lifter=None, normalize=None, fbanks=None, conversion_approach='Glasberg',
    )
    return np.mean(np.nan_to_num(feats, posinf=0, neginf=0), axis=0)


def load_model(checkpoint):
    """Deserialize a pickled model from disk."""
    with open(checkpoint, 'rb') as f:
        return pickle.load(f)
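
# NOTE: `lda.pkl` is not shipped with this script; it is assumed to be a fitted
# scikit-learn estimator exposing `.transform` (e.g. LinearDiscriminantAnalysis).
# Only unpickle checkpoints you trust: pickle.load can execute arbitrary code.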


def extract_features(audio):
    """Load an audio file and build one fixed-length feature vector.

    librosa.load resamples to 22050 Hz mono by default, so every extractor
    sees the same sampling rate. Each extractor returns a 1-D vector, and the
    results are concatenated into a single vector for the embedder.
    """
    y, sr = librosa.load(audio)
    x = {'y': y, 'sr': sr}
    extractors = [
        dominant_freq,
        apply_mfcc,
        apply_bfcc,
        apply_cqcc,
        apply_gfcc,
        apply_lfcc,
        apply_msrcc,
        apply_ngcc,
    ]
    features = [extract(x) for extract in extractors]
    return np.concatenate(features).flatten()
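

# A minimal sketch of how a compatible `lda.pkl` could have been produced.
# `train_paths` and `speaker_labels` are hypothetical names, not part of this
# script; the actual training recipe for the shipped checkpoint is unknown.
#
#   from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#
#   X = np.stack([extract_features(path) for path in train_paths])
#   lda = LinearDiscriminantAnalysis().fit(X, speaker_labels)
#   with open('lda.pkl', 'wb') as f:
#       pickle.dump(lda, f)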


def inference_Verification(audio_1, audio_2):
    """Score how likely two recordings come from the same speaker."""
    model = load_model('lda.pkl')

    features1 = extract_features(audio_1)
    features2 = extract_features(audio_2)

    # Project both feature vectors into the LDA space, then compare directions.
    embed1 = model.transform([features1])
    embed2 = model.transform([features2])
    return cosine_similarity(embed1, embed2).flatten()[0].round(4)
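
# Example, assuming two local recordings and `lda.pkl` on disk:
#   score = inference_Verification('speaker_a.wav', 'speaker_b.wav')
#   print(score)  # cosine similarity in [-1, 1]; higher means more alike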


audio_1 = gr.Audio(sources="upload", type="filepath", label="Audio 1")
audio_2 = gr.Audio(sources="upload", type="filepath", label="Audio 2")
text_output = gr.Textbox(label="Similarity Score")

demo = gr.Interface(
    fn=inference_Verification,
    inputs=[audio_1, audio_2],
    outputs=text_output,
    title="Speaker Verification",
    description="Speaker verification on a multilingual dataset.",
)

if __name__ == "__main__":
    demo.launch()