aliasgerovs committed on
Commit
ee2ac99
·
verified ·
1 Parent(s): c32ead2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -0
app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ from speechbrain.pretrained import SpeakerRecognition
5
+ import torch.nn as nn
6
+ from transformers import AutoModel
7
+ import os
8
+ from huggingface_hub import hf_hub_download
9
+
10
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained SpeechBrain speaker model used to turn audio into embeddings.
# `savedir` is the local directory handed to SpeechBrain for the fetched files.
speaker_model = SpeakerRecognition.from_hparams(
    run_opts={"device": device},
    savedir="tmp",
    source="speechbrain/spkrec-ecapa-voxceleb",
)
16
+
17
class PretrainedTransformerClassifier(nn.Module):
    """Classify scalar features with a mostly frozen DistilRoBERTa backbone.

    Every scalar in the input is projected to a 768-dimensional vector, the
    projected sequence is fed to the transformer through ``inputs_embeds``,
    and the hidden state at position 0 is mapped to ``num_classes`` logits.

    Attribute names and layer order are part of the checkpoint format
    (they determine the ``state_dict`` keys) and must not be renamed.
    """

    def __init__(self, num_classes=3):
        """Build the backbone, the scalar projection and the classifier head.

        Args:
            num_classes: Number of output logits produced by the head.
        """
        super().__init__()
        self.transformer = AutoModel.from_pretrained('distilbert/distilroberta-base')

        # Freeze the whole backbone first, then re-enable gradients for its
        # last two encoder layers only.
        self.transformer.requires_grad_(False)
        for encoder_layer in self.transformer.encoder.layer[-2:]:
            encoder_layer.requires_grad_(True)

        # Lifts each scalar feature into the transformer's embedding space.
        self.embed_projection = nn.Sequential(
            nn.Linear(1, 768),
            nn.LayerNorm(768),
            nn.Dropout(0.1),
        )

        # Maps the pooled 768-d state to class logits.
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class logits for ``x``.

        Args:
            x: Tensor of scalar features; a trailing axis of size 1 is added
                before projection. Assumed to be ``(batch, seq_len)`` --
                TODO confirm against callers.

        Returns:
            Tensor of logits with ``num_classes`` entries per batch item.
        """
        embedded = self.embed_projection(x.unsqueeze(-1))
        if embedded.dim() == 2:
            # A single un-batched sequence: add the batch axis.
            embedded = embedded.unsqueeze(0)

        batch_size, seq_len = embedded.shape[0], embedded.shape[1]
        # Every position is real data, so attend to all of them.
        mask = torch.ones((batch_size, seq_len)).to(embedded.device)
        outputs = self.transformer(
            inputs_embeds=embedded,
            attention_mask=mask,
            return_dict=True,
        )
        first_token_state = outputs.last_hidden_state[:, 0, :]
        return self.classifier(first_token_state)
53
+
54
+ # Load the model from Huggingface Hub
55
def load_model():
    """Download the trained classifier and its config from the Hugging Face Hub.

    Returns:
        tuple: ``(classifier, config)``. ``classifier`` is a
        ``PretrainedTransformerClassifier`` moved to ``device`` and switched
        to eval mode; ``config`` is the object stored in ``config.pth``
        (this function reads its ``'num_classes'`` entry).
    """
    repo_id = "polygraf-ai/vexon-voice-authentication"
    model_path = hf_hub_download(repo_id=repo_id, filename="model.pth")
    config_path = hf_hub_download(repo_id=repo_id, filename="config.pth")

    # NOTE(security): torch.load unpickles the downloaded files, so this is
    # only safe while the Hub repository above is trusted.
    # Map to CPU so a config saved on a GPU machine still loads on a
    # CPU-only host (the weights below are already mapped explicitly).
    config = torch.load(config_path, map_location="cpu")

    classifier = PretrainedTransformerClassifier(num_classes=config['num_classes']).to(device)
    classifier.load_state_dict(torch.load(model_path, map_location=device))
    classifier.eval()  # inference only: disables dropout

    return classifier, config
65
+
66
# Load the classifier and its config once at import time; process_audio reads
# both as module-level globals.
classifier, model_config = load_model()
67
+
68
def extract_embedding(audio_path):
    """Compute a speaker embedding for one audio file.

    The waveform is down-mixed to a single channel and resampled to 16 kHz
    before it is handed to ``speaker_model.encode_batch``. ``encode_batch``
    treats the first axis as the batch, so without the down-mix a stereo file
    would produce one embedding per channel and a flattened vector of a
    different length than a mono file.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        A flat NumPy array holding the embedding, or ``None`` if the file
        could not be loaded or encoded (the error is printed).
    """
    target_sample_rate = 16000  # rate the pretrained speaker model was trained on

    try:
        signal, sample_rate = torchaudio.load(audio_path)

        # (channels, time) -> (1, time): one utterance, one embedding.
        if signal.shape[0] > 1:
            signal = signal.mean(dim=0, keepdim=True)

        if sample_rate != target_sample_rate:
            signal = torchaudio.functional.resample(
                signal, sample_rate, target_sample_rate
            )

        signal = signal.to(device)
        with torch.no_grad():  # inference only; no autograd graph needed
            embedding = speaker_model.encode_batch(signal)
        return embedding.cpu().detach().numpy().flatten()
    except Exception as e:
        # Best-effort contract relied on by verify_speaker: report and
        # signal failure with None instead of raising.
        print(f"Error processing {audio_path}: {e}")
        return None
77
+
78
def verify_speaker(audio_path1, audio_path2):
    """Return the cosine similarity between the embeddings of two audio files.

    Args:
        audio_path1: Path of the first audio file.
        audio_path2: Path of the second audio file.

    Returns:
        The similarity as a Python float, or ``None`` when an embedding
        could not be extracted for either file.
    """
    # Both files are always processed, in order, before the failure check.
    embeddings = [extract_embedding(path) for path in (audio_path1, audio_path2)]
    if any(embedding is None for embedding in embeddings):
        return None

    first, second = (torch.tensor(embedding).to(device) for embedding in embeddings)
    similarity = torch.nn.functional.cosine_similarity(first, second, dim=0)
    return similarity.cpu().item()
90
+
91
def process_audio(audio1, audio2):
    """
    Process two audio files and return authentication results using the pretrained transformer classifier

    Each input is decoded once and re-saved as a WAV file inside a
    per-request temporary directory, so concurrent requests cannot overwrite
    or delete each other's working files.

    Args:
        audio1: Path of the reference recording, or ``None`` if missing.
        audio2: Path of the recording to verify, or ``None`` if missing.

    Returns:
        str: A formatted report (similarity score, predicted label and
        confidence), or a human-readable error message.
    """
    import tempfile  # function-scope import keeps this block self-contained

    if audio1 is None or audio2 is None:
        return "Please upload both audio files."

    try:
        # The directory and everything in it is removed when the block exits,
        # including on error.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_path1 = os.path.join(work_dir, "temp_audio1.wav")
            temp_path2 = os.path.join(work_dir, "temp_audio2.wav")

            for source_path, wav_path in ((audio1, temp_path1), (audio2, temp_path2)):
                waveform, sample_rate = torchaudio.load(source_path)
                torchaudio.save(wav_path, waveform, sample_rate)

            score = verify_speaker(temp_path1, temp_path2)

        if score is None:
            return "Error processing audio files. Please ensure they are valid audio recordings."

        with torch.no_grad():
            # The classifier takes the similarity score as a 1x1 batch.
            score_tensor = torch.tensor([[score]], dtype=torch.float32, device=device)
            output = classifier(score_tensor)
            prediction = torch.argmax(output, dim=1).item()
            probabilities = torch.softmax(output, dim=1)[0]
            confidence = probabilities[prediction].item()

        # Class index 0 is reported as an authentic match; any other class
        # triggers the warning.
        verdict = (
            '⚠️ Potential Voice Impersonation Detected!'
            if prediction > 0
            else '✅ Authentic Voice Match'
        )

        result = f"""
📊 Authentication Results:

🔹 Similarity Score: {score:.4f}
🔹 Classification: {model_config['labels'][prediction]}
🔹 Confidence: {confidence:.4f}

Analysis Details:
- A similarity score closer to 1.0 indicates higher voice similarity
- Classification indicates the model's assessment of the voice comparison
- Confidence shows how certain the model is about its classification

{verdict}
"""

        return result

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"An error occurred: {str(e)}"
147
+
148
# Custom stylesheet passed to the Gradio interface: sets the container font,
# styles buttons (including their hover state) and defines a footer rule.
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-radius: 8px;
background: linear-gradient(to right, #2125ff, #4146ff);
border: none;
cursor: pointer;
}
.gr-button:hover {
background: linear-gradient(to right, #1f23e6, #3b40e6);
}
.footer {
margin-top: 20px;
text-align: center;
border-top: 1px solid #ccc;
padding-top: 10px;
}
"""
169
+
170
# Offer only the example pairs whose audio files are actually present next to
# the app, so a missing sample file cannot break interface construction.
_example_pairs = [
    pair
    for pair in (
        ["reference_sample1.wav", "verify_sample1.wav"],
        ["reference_sample2.wav", "verify_sample2.wav"],
    )
    if all(os.path.exists(path) for path in pair)
]

# Gradio UI: two audio inputs (delivered to process_audio as file paths) and a
# single text box for the formatted report.
demo = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(label="Reference Voice Recording", type="filepath"),
        gr.Audio(label="Voice Recording to Verify", type="filepath"),
    ],
    outputs=gr.Textbox(label="Authentication Results"),
    title="Vexon Voice Authentication",
    description="""
Upload two voice recordings to verify if they are from the same person and detect potential voice impersonation attempts.

The system uses a pretrained transformer model fine-tuned on voice similarity scores to:
1. Calculate a similarity score between the recordings
2. Classify the comparison as Real-Real, Real-Fake, or Impersonation
3. Provide a confidence score for the classification

Note: For best results, ensure recordings are clear and contain speech content.
""",
    css=css,
    # None (no examples section) when no sample files are available.
    examples=_example_pairs or None,
)
194
+
195
# Start the Gradio server only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()