from flask import Flask, render_template, request, jsonify
from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForXVector
import torchaudio
import torch
import io
import librosa
from scipy.spatial.distance import cosine
import numpy as np
import os
import time
# System dependency: ffmpeg (e.g. `brew install ffmpeg`)
# Python dependencies: pip install flask transformers librosa torch torchaudio

app = Flask(__name__, static_url_path='/static')

# Reference voice: first 30 s of https://www.youtube.com/watch?v=NjR6TyHgAho
mp3_file_path = "arnold.mp3"

# From https://neets.ai/: "With great power comes great responsibility"
mp3_file_path2 = 'arnold2.wav'

flag1 = ""
flag2 = ""

with open("flag1.txt") as f:
    flag1 = f.read()
with open("flag2.txt") as f:
    flag2 = f.read()

# Load the feature extractor and speaker-verification model,
# preferring a locally downloaded copy in ./model if present
themodel = "microsoft/unispeech-sat-large-sv"
if os.path.exists("model"):
    themodel = "model"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(themodel)
model = UniSpeechSatForXVector.from_pretrained(themodel)
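# UniSpeechSatForXVector returns a fixed-size speaker embedding (x-vector) per
# utterance; recordings from the same speaker should produce embeddings with
# high cosine similarity.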

# Preprocess audio: downmix to mono and resample to 16 kHz
def preprocess_audio(audio_data):
    waveform, sample_rate = torchaudio.load(audio_data)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    waveform = waveform.squeeze().numpy()
    return waveform
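
# Note: the model's feature extractor expects 16 kHz mono input, which is why
# preprocess_audio() both downmixes and resamples before feature extraction.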


# Precompute the normalized embedding of the reference audio at startup
mp3_audio = preprocess_audio(mp3_file_path)
mp3_inputs = feature_extractor(mp3_audio, return_tensors="pt")
mp3_embeddings = model(**mp3_inputs).embeddings
mp3_embeddings_normalized = torch.nn.functional.normalize(mp3_embeddings, dim=-1).cpu()
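# Recordings posted to /compare_audio go through the same pipeline (preprocess ->
# feature extractor -> model -> L2-normalize) and are scored against this
# reference embedding with cosine similarity.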

with open(mp3_file_path2, 'rb') as f:
    mp3_audio2 = f.read()

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/chal2')
def chal2():
    return render_template('chal2.html')

# Hugging Face no longer runs a hosted inference API for this model, so the audio
# has to be processed against the model locally now.
# https://www.ktskumar.com/2021/12/introduction-to-voice-authentication-using-javascript/
# https://hf.space/gradioiframe/microsoft/unispeech-speaker-verification/api/predict
# You can buy something similar through Azure; perhaps Microsoft just wanted to commercialize this.
@app.route('/compare_audio', methods=['POST'])
def compare_audio():
    try:
        start_time = time.time()
        # Get the recorded audio file from the frontend
        recorded_audio = request.files['audio_data']

        # Preprocess recorded audio
        audio_data = preprocess_audio(recorded_audio)
        inputs = feature_extractor(audio_data, return_tensors="pt")
        embeddings = model(**inputs).embeddings
        embeddings_normalized = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

        # Compare against the reference embedding precomputed at startup
        global mp3_embeddings_normalized

        # Calculate cosine similarity
        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        similarity = cosine_sim(embeddings_normalized, mp3_embeddings_normalized).item()

        similarity = round(similarity, 3)
        elapsed = time.time() - start_time

        threshold = 0.89  # Adjust the threshold as needed
        if similarity < threshold:
            result = ("Authorization Failed! " + str(similarity) + " < 0.890 in "
                      + str(round(elapsed, 3)) + "s<br>Do your best Terminator impression")
        else:
            result = ("Good job! Match: " + str(similarity) + "<br>" + flag1
                      + "<br><a href='/chal2'>Click here to open the next challenge</a>"
                      + "<br>processed in " + str(round(elapsed, 3)) + "s")

        return jsonify({'result': result})
    except Exception as e:
        print("Caught: "+str(e))
        return jsonify({'error': "An error occurred during audio comparison. I'm fragile, please don't abuse."})
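
# A hypothetical way to exercise /compare_audio from the command line (not part
# of the challenge frontend; assumes a local recording named attempt.wav exists):
#   curl -F "audio_data=@attempt.wav" http://localhost:8080/compare_audio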

def extract_mfcc(audio_bytes):
    # Preprocess audio
    waveform = preprocess_audio2(audio_bytes)
    
    # Extract MFCC coefficients
    mfcc = librosa.feature.mfcc(y=waveform, sr=16000, n_mfcc=13)

    return mfcc

def preprocess_audio2(audio_bytes):
    # Load the audio bytes into torchaudio waveform
    waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))
    
    # Ensure the audio has a single channel (mono)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample the audio to 16kHz if needed
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

    # Collapse to a 1-D NumPy array, since librosa operates on NumPy arrays
    waveform = waveform.squeeze().numpy()

    # Trim silence at the beginning and end
    waveform, _ = librosa.effects.trim(waveform, top_db=20)

    return waveform

mfcc2 = extract_mfcc(mp3_audio2)
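# mfcc2 has shape (13, n_frames); /compare_audio2 averages each coefficient over
# time and compares the resulting 13-dimensional vectors. scipy's cosine() returns
# a distance, so 1 - cosine(...) is the cosine similarity.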

@app.route('/compare_audio2', methods=['POST'])
def compare_audio2():
    try:
        recorded_audio = request.files['audio_data'].read()

        # Compare the recording's mean MFCC vector against the precomputed reference
        mfcc1 = extract_mfcc(recorded_audio)
        global mfcc2
        similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))
        similarity = round(similarity, 3)
        if similarity < 0.940:
            result = "Authorization Failed! " + str(similarity) + " < 0.940<br>Say: 'With great power comes great responsibility' as Arnold Schwarzenegger"
        else:
            result = "Good job! Match: " + str(similarity) + "<br>" + flag2

        return jsonify({'result': result})
    except Exception as e:
        print("Caught: "+str(e))
        return jsonify({'error': "An error occurred during audio comparison. I'm fragile, please don't abuse."})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=8080, debug=True)