"""Gradio demo that labels an uploaded audio clip as 'Real' or 'Fake'."""
from typing import Tuple

import gradio as gr
import librosa
import numpy as np
import torch
from torch import Tensor
import torch.nn as nn

from model import Model

model_path = 'final_model.pth'


def load_data(path, max_len: int = 64600) -> Tuple[Tensor, float]:
    """Load an audio file and turn it into a fixed-length model input.

    Args:
        path: Location of the audio file, passed straight to ``librosa.load``.
        max_len: Number of samples the waveform is cropped/padded to.

    Returns:
        A ``(x_inp, fs)`` pair: ``x_inp`` is the waveform as a tensor with a
        leading batch dimension of size 1, ``fs`` is the sampling rate
        reported by ``librosa.load``.

    Raises:
        ValueError: If the decoded audio contains no samples (from ``pad``).
    """
    # NOTE(review): librosa.load is called without ``sr``, so librosa applies
    # its own default sampling rate. Confirm this matches the rate the
    # checkpoint was trained with.
    X, fs = librosa.load(path)
    X_pad = pad(X, max_len)
    x_inp = Tensor(X_pad).unsqueeze(0)
    return x_inp, fs


def pad(x: np.ndarray, max_len: int = 64600) -> np.ndarray:
    """Crop or repeat-pad a 1-D signal to exactly ``max_len`` samples.

    Signals that are already long enough are truncated; shorter signals are
    repeated end-to-end and then cut to ``max_len``.

    Args:
        x: 1-D array of audio samples.
        max_len: Target length in samples.

    Returns:
        An array of ``max_len`` samples (or ``x[:max_len]`` when cropping).

    Raises:
        ValueError: If ``x`` is empty; there is nothing to repeat.
    """
    x_len = x.shape[0]
    if x_len == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError("cannot pad an empty audio signal")
    if x_len >= max_len:
        return x[:max_len]
    # One repeat more than the integer quotient always covers max_len.
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]


device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ``Model`` comes from the project-local ``model`` module.
model = Model(None, device)
nb_params = sum(param.numel() for param in model.parameters())
# The model is wrapped in DataParallel before the state dict is loaded, so
# the checkpoint keys must match the wrapped module's parameter names.
model = nn.DataParallel(model).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
print(f"Model loaded : {model_path}")
model.eval()

# Maps the index of the highest-scoring model output to a display label.
prediction_dict = {0: 'Fake', 1: 'Real'}


def Detection(audio):
    """Classify an audio file as 'Fake' or 'Real'.

    Args:
        audio: Path to the audio file supplied by the Gradio ``Audio``
            component (``type="filepath"``); ``None`` when nothing was
            submitted.

    Returns:
        The label from ``prediction_dict`` for the highest-scoring class.

    Raises:
        gr.Error: If no audio was provided.
    """
    if audio is None:
        raise gr.Error("Please provide an audio file.")

    x_inp, _ = load_data(audio)
    x_inp = x_inp.to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(x_inp)
        validity_probs = torch.nn.functional.softmax(logits, dim=1)

    # Batch size is 1; take the class index of that single row.
    predicted_idx = torch.argmax(validity_probs, dim=1)[0].item()
    return prediction_dict[predicted_idx]


audio = gr.Audio(type="filepath", label="Audio")
text_output = gr.Textbox(label="Real Or Fake")

demo = gr.Interface(
    fn=Detection,
    inputs=audio,
    outputs=text_output,
    title="Audio Deepfake Detection",
    description="Audio Deepfake Detection.",
)
# Launched at module level, exactly as before, so running the script starts
# the web UI.
demo.launch()