File size: 5,672 Bytes
45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 45a4975 868f0a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from flask import Flask, render_template, request, jsonify
from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForXVector
import torchaudio
import torch
import io
import librosa
from scipy.spatial.distance import cosine
import numpy as np
import os
import time
# brew install ffmpeg
# pip install flask transformers librosa torch torchaudio
# Flask application; static assets are served under /static.
app = Flask(__name__, static_url_path='/static')

# Reference recordings the uploads are compared against.
# https://www.youtube.com/watch?v=NjR6TyHgAho first 30s
mp3_file_path = "arnold.mp3"
# https://neets.ai/ "With great power comes great responsibility"
mp3_file_path2 = 'arnold2.wav'

# Flags handed out by the two challenges, read once at import time.
with open("flag1.txt") as flag_file:
    flag1 = flag_file.read()
with open("flag2.txt") as flag_file:
    flag2 = flag_file.read()

# Load feature extractor and model, preferring a local "model" directory
# over the hub checkpoint when one exists.
themodel = "model" if os.path.exists("model") else "microsoft/unispeech-sat-large-sv"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(themodel)
model = UniSpeechSatForXVector.from_pretrained(themodel)
def preprocess_audio(audio_data):
    """Decode audio and return it as a mono, 16 kHz NumPy array.

    Args:
        audio_data: Passed straight to ``torchaudio.load``. Within this file
            it is called with a file path and with an uploaded file object.

    Returns:
        The decoded samples after down-mixing, resampling and ``squeeze()``.
    """
    signal, source_rate = torchaudio.load(audio_data)

    channel_count = signal.shape[0]
    if channel_count > 1:
        # Down-mix by averaging the channels while keeping the channel axis.
        signal = signal.mean(dim=0, keepdim=True)

    if source_rate != 16000:
        resampler = torchaudio.transforms.Resample(source_rate, 16000)
        signal = resampler(signal)

    return signal.squeeze().numpy()
# Preload expected audio: compute the reference speaker embedding once at
# import time so each request only has to embed the uploaded recording.
mp3_audio = preprocess_audio(mp3_file_path)
mp3_inputs = feature_extractor(mp3_audio, return_tensors="pt")
# Inference only: without no_grad() the long-lived reference embedding would
# keep its whole autograd graph alive for the lifetime of the process.
with torch.no_grad():
    mp3_embeddings = model(**mp3_inputs).embeddings
mp3_embeddings_normalized = torch.nn.functional.normalize(mp3_embeddings, dim=-1).cpu()

# Raw bytes of the second reference recording; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open(mp3_file_path2, 'rb') as reference_file:
    mp3_audio2 = reference_file.read()
@app.route('/')
def index():
    """Render the ``index.html`` template for the site root."""
    page = render_template('index.html')
    return page
@app.route('/chal2')
def chal2():
    """Render the ``chal2.html`` template for the ``/chal2`` page."""
    page = render_template('chal2.html')
    return page
# Hugging Face no longer hosts an API for this model, so audio must now be processed against the model locally
# https://www.ktskumar.com/2021/12/introduction-to-voice-authentication-using-javascript/
# https://hf.space/gradioiframe/microsoft/unispeech-speaker-verification/api/predict
# You can buy something similar through Azure; perhaps Microsoft just wanted to commercialize this
@app.route('/compare_audio', methods=['POST'])
def compare_audio():
    """Score an uploaded recording against the preloaded reference voice.

    Reads the multipart file field ``audio_data``, computes its speaker
    embedding with the module-level ``feature_extractor`` and ``model``, and
    takes the cosine similarity with ``mp3_embeddings_normalized``.

    Returns:
        A JSON response. On normal completion it has a ``result`` key holding
        an HTML snippet: a failure message with the score, or a success
        message that includes ``flag1`` and a link to ``/chal2``. If anything
        in the pipeline raises, it has an ``error`` key instead.
    """
    threshold = 0.89  # Adjust the threshold as needed
    try:
        start_time = time.time()

        # Get the recorded audio file from the frontend and preprocess it.
        recorded_audio = request.files['audio_data']
        audio_data = preprocess_audio(recorded_audio)
        inputs = feature_extractor(audio_data, return_tensors="pt")

        # Inference only: skip autograd bookkeeping for every request.
        with torch.no_grad():
            embeddings = model(**inputs).embeddings
        embeddings_normalized = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

        # Cosine similarity against the reference embedding computed at startup.
        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        similarity = cosine_sim(embeddings_normalized, mp3_embeddings_normalized).item()
        similarity = round(similarity, 3)

        end = time.time() - start_time
        elapsed = str(round(end, 3))

        # Fail closed: success requires an explicit ``>=``. A NaN similarity
        # (e.g. from an upload containing NaN samples) compares False against
        # everything, so testing ``similarity < threshold`` for failure would
        # fall through to the success branch and disclose the flag.
        if similarity >= threshold:
            result = (
                "Good job! Match: " + str(similarity) + "<br>" + flag1
                + "<br><a href='/chal2'>Click here to open the next challenge</a>"
                + "<br>processed in " + elapsed + "s"
            )
        else:
            result = (
                "Authorization Failed! " + str(similarity)
                + f" < {threshold:.3f}" + " in " + elapsed + "s"
                + "<br>Do your best Terminator impression"
            )
        return jsonify({'result': result})
    except Exception as e:
        # Top-level request boundary: report a generic error to the client
        # and keep the detail in the server output only.
        print("Caught: " + str(e))
        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.' })
def extract_mfcc(audio_bytes):
    """Return the MFCC matrix for an encoded audio clip.

    Args:
        audio_bytes: Encoded audio, forwarded to ``preprocess_audio2``.

    Returns:
        The result of ``librosa.feature.mfcc`` with 13 coefficients,
        computed on the preprocessed samples at a 16 kHz sample rate.
    """
    samples = preprocess_audio2(audio_bytes)
    return librosa.feature.mfcc(y=samples, sr=16000, n_mfcc=13)
def preprocess_audio2(audio_bytes):
    """Decode audio bytes into a silence-trimmed, mono, 16 kHz NumPy array.

    Args:
        audio_bytes: Encoded audio as a ``bytes`` object; it is wrapped in
            ``io.BytesIO`` and decoded by ``torchaudio.load``.

    Returns:
        One-dimensional array of samples with leading and trailing audio
        quieter than 20 dB below the peak removed.
    """
    # Load the audio bytes into a torchaudio waveform of shape (channels, n).
    waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))

    # Ensure the audio has a single channel (mono).
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample the audio to 16kHz if needed.
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

    # librosa is a NumPy API: hand it a 1-D ndarray rather than a torch
    # tensor, which only works by accident on some librosa versions.
    # squeeze(0) drops just the channel axis so the result stays 1-D.
    samples = waveform.squeeze(0).numpy()

    # Trim silence at beginning and end.
    trimmed, _ = librosa.effects.trim(samples, top_db=20)
    return trimmed
# MFCCs of the second reference recording, computed once at import time and
# reused by compare_audio2 for every request.
mfcc2 = extract_mfcc(mp3_audio2)
@app.route('/compare_audio2', methods=['POST'])
def compare_audio2():
    """Score an uploaded recording against the second reference by MFCC.

    Reads the multipart file field ``audio_data``, extracts its MFCCs and
    compares the per-coefficient means with those of the module-level
    ``mfcc2`` using cosine similarity.

    Returns:
        A JSON response. On normal completion it has a ``result`` key holding
        an HTML snippet: a failure message with the score, or a success
        message that includes ``flag2``. If anything in the pipeline raises,
        it has an ``error`` key instead.
    """
    threshold = 0.940
    try:
        recorded_audio = request.files['audio_data'].read()

        # Compare similarity between the upload and the preloaded reference.
        mfcc1 = extract_mfcc(recorded_audio)
        similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))
        similarity = round(similarity, 3)

        # Fail closed: success requires an explicit ``>=``. A NaN similarity
        # compares False against everything, so testing
        # ``similarity < threshold`` for failure would fall through to the
        # success branch and disclose the flag.
        if similarity >= threshold:
            result = "Good job! Match: " + str(similarity) + "<br>" + flag2
        else:
            result = (
                "Authorization Failed! " + str(similarity)
                + f" < {threshold:.3f}"
                + "<br>Say: 'With great power comes great responsibility' as Arnold Schwarzenegger"
            )
        return jsonify({'result': result})
    except Exception as e:
        # Top-level request boundary: report a generic error to the client
        # and keep the detail in the server output only.
        print("Caught: " + str(e))
        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.'})
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it allows executing arbitrary Python from the
    # browser. Debug mode also enables the reloader, which imports this
    # module (and loads the model) a second time. Opt in locally with
    # FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    app.run(host="0.0.0.0", port=8080, debug=debug_enabled)
|