from flask import Flask, render_template, request, jsonify
from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForXVector
import torchaudio
import torch
import io
import librosa
from scipy.spatial.distance import cosine
import numpy as np
import os
import time

# brew install ffmpeg
# pip install flask transformers librosa torch torchaudio

app = Flask(__name__, static_url_path='/static')

# https://www.youtube.com/watch?v=NjR6TyHgAho first 30s
mp3_file_path = "arnold.mp3"
# https://neets.ai/ "With great power comes great responsibility"
mp3_file_path2 = 'arnold2.wav'

flag1 = ""
flag2 = ""
with open("flag1.txt") as f:
    flag1 = f.read()
with open("flag2.txt") as f:
    flag2 = f.read()

# Load the feature extractor and model; prefer a local copy if one exists
themodel = "microsoft/unispeech-sat-large-sv"
if os.path.exists("model"):
    themodel = "model"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(themodel)
model = UniSpeechSatForXVector.from_pretrained(themodel)

# Preprocess audio: convert to mono 16 kHz and return a numpy waveform
def preprocess_audio(audio_data):
    waveform, sample_rate = torchaudio.load(audio_data)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    return waveform.squeeze().numpy()

# Preload the reference audio and compute its normalized x-vector embedding once at startup
mp3_audio = preprocess_audio(mp3_file_path)
mp3_inputs = feature_extractor(mp3_audio, return_tensors="pt")
mp3_embeddings = model(**mp3_inputs).embeddings
mp3_embeddings_normalized = torch.nn.functional.normalize(mp3_embeddings, dim=-1).cpu()

with open(mp3_file_path2, 'rb') as f:
    mp3_audio2 = f.read()

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/chal2')
def chal2():
    return render_template('chal2.html')

# Hugging Face doesn't run an API for us anymore, so data has to be processed
# against the model locally now.
# https://www.ktskumar.com/2021/12/introduction-to-voice-authentication-using-javascript/
# https://hf.space/gradioiframe/microsoft/unispeech-speaker-verification/api/predict
# You can buy something similar through Azure; perhaps Microsoft just wanted to commercialize this.
@app.route('/compare_audio', methods=['POST'])
def compare_audio():
    try:
        start_time = time.time()
        # Get the recorded audio file from the frontend
        recorded_audio = request.files['audio_data']
        # Preprocess the recorded audio and compute its normalized embedding
        audio_data = preprocess_audio(recorded_audio)
        inputs = feature_extractor(audio_data, return_tensors="pt")
        embeddings = model(**inputs).embeddings
        embeddings_normalized = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
        # Compare against the reference embedding preloaded at startup
        global mp3_embeddings_normalized
        # Calculate cosine similarity
        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        similarity = cosine_sim(embeddings_normalized, mp3_embeddings_normalized).item()
        similarity = round(similarity, 3)
        end = time.time() - start_time
        threshold = 0.89  # Adjust the threshold as needed
        if similarity < threshold:
            result = ("Authorization Failed! " + str(similarity) + " < 0.890"
                      + " in " + str(round(end, 3)) + "s"
                      + "<br>Do your best Terminator impression")
        else:
            result = ("Good job! Match: " + str(similarity)
                      + "<br>" + flag1
                      + "<br>Click here to open the next challenge"
                      + "<br>processed in " + str(round(end, 3)) + "s")
        return jsonify({'result': result})
    except Exception as e:
        print("Caught: " + str(e))
        return jsonify({'error': "An error occurred during audio comparison. I'm fragile, please don't abuse."})
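# A minimal client-side sketch for exercising /compare_audio locally. This is
# an illustrative assumption, not part of the challenge: it presumes the app
# is running on localhost:8080, a "sample.wav" file exists next to this
# script, and the requests package is installed (pip install requests).
def _example_compare_audio_client(path="sample.wav"):
    import requests  # assumed available; not imported at the top of the app
    with open(path, "rb") as f:
        resp = requests.post(
            "http://localhost:8080/compare_audio",
            files={"audio_data": (path, f, "audio/wav")},
        )
    return resp.json()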
def extract_mfcc(audio_bytes):
    # Preprocess audio
    waveform = preprocess_audio2(audio_bytes)
    # Extract MFCC coefficients
    mfcc = librosa.feature.mfcc(y=waveform, sr=16000, n_mfcc=13)
    return mfcc

def preprocess_audio2(audio_bytes):
    # Load the raw audio bytes into a torchaudio waveform
    waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))
    # Ensure the audio has a single channel (mono)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    # Resample the audio to 16 kHz if needed
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    # librosa expects a numpy array, so convert before trimming
    waveform = waveform.squeeze().numpy()
    # Trim silence at the beginning and end
    waveform, _ = librosa.effects.trim(waveform, top_db=20)
    return waveform

# Preload the reference MFCCs for the second challenge
mfcc2 = extract_mfcc(mp3_audio2)
@app.route('/compare_audio2', methods=['POST'])
def compare_audio2():
    try:
        recorded_audio = request.files['audio_data'].read()
        # Compare the recording against the reference MFCCs preloaded at startup
        mfcc1 = extract_mfcc(recorded_audio)
        global mfcc2
        similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))
        similarity = round(similarity, 3)
        if similarity < 0.940:
            result = ("Authorization Failed! " + str(similarity) + " < 0.940"
                      + "<br>Say: 'With great power comes great responsibility'"
                      + " as Arnold Schwarzenegger")
        else:
            result = "Good job! Match: " + str(similarity) + "<br>" + flag2
        return jsonify({'result': result})
    except Exception as e:
        print("Caught: " + str(e))
        return jsonify({'error': "An error occurred during audio comparison. I'm fragile, please don't abuse."})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=8080, debug=True)