Spaces:

polygraf-ai
/

vexon-audio-authentication

Running

App Files Files Community

aliasgerovs commited on 25 days ago

Commit

ee2ac99

verified ·

1 Parent(s): c32ead2

Create app.py

Browse files

Files changed (1) hide show

app.py +196 -0

app.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import gradio as gr
+import torch
+import torchaudio
+from speechbrain.pretrained import SpeakerRecognition
+import torch.nn as nn
+from transformers import AutoModel
+import os
+from huggingface_hub import hf_hub_download
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+speaker_model = SpeakerRecognition.from_hparams(
+    source="speechbrain/spkrec-ecapa-voxceleb",
+    savedir="tmp",
+    run_opts={"device": device}
+)
+class PretrainedTransformerClassifier(nn.Module):
+    def __init__(self, num_classes=3):
+        super().__init__()
+        self.transformer = AutoModel.from_pretrained('distilbert/distilroberta-base')
+        for param in self.transformer.parameters():
+            param.requires_grad = False
+        for param in self.transformer.encoder.layer[-2:].parameters():
+            param.requires_grad = True
+        self.embed_projection = nn.Sequential(
+            nn.Linear(1, 768),
+            nn.LayerNorm(768),
+            nn.Dropout(0.1)
+        )
+        self.classifier = nn.Sequential(
+            nn.Linear(768, 256),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(256, num_classes)
+        )
+    def forward(self, x):
+        x = self.embed_projection(x.unsqueeze(-1))
+        if len(x.shape) == 2:
+            x = x.unsqueeze(0)
+        attention_mask = torch.ones((x.shape[0], x.shape[1])).to(x.device)
+        transformer_output = self.transformer(
+            inputs_embeds=x,
+            attention_mask=attention_mask,
+            return_dict=True
+        )
+        pooled_output = transformer_output.last_hidden_state[:, 0, :]
+        return self.classifier(pooled_output)
+# Load the model from Huggingface Hub
+def load_model():
+    model_path = hf_hub_download(repo_id="polygraf-ai/vexon-voice-authentication", filename="model.pth")
+    config_path = hf_hub_download(repo_id="polygraf-ai/vexon-voice-authentication", filename="config.pth")
+    config = torch.load(config_path)
+    classifier = PretrainedTransformerClassifier(num_classes=config['num_classes']).to(device)
+    classifier.load_state_dict(torch.load(model_path, map_location=device))
+    classifier.eval()
+    return classifier, config
+classifier, model_config = load_model()
+def extract_embedding(audio_path):
+    try:
+        signal, fs = torchaudio.load(audio_path)
+        signal = signal.to(device)
+        embedding = speaker_model.encode_batch(signal)
+        return embedding.cpu().detach().numpy().flatten()
+    except Exception as e:
+        print(f"Error processing {audio_path}: {e}")
+        return None
+def verify_speaker(audio_path1, audio_path2):
+    emb1 = extract_embedding(audio_path1)
+    emb2 = extract_embedding(audio_path2)
+    if emb1 is None or emb2 is None:
+        return None
+    tensor1 = torch.tensor(emb1).to(device)
+    tensor2 = torch.tensor(emb2).to(device)
+    similarity_score = torch.nn.functional.cosine_similarity(
+        tensor1, tensor2, dim=0
+    ).cpu().item()
+    return similarity_score
+def process_audio(audio1, audio2):
+    """
+    Process two audio files and return authentication results using the pretrained transformer classifier
+    """
+    if audio1 is None or audio2 is None:
+        return "Please upload both audio files."
+    temp_path1 = "temp_audio1.wav"
+    temp_path2 = "temp_audio2.wav"
+    try:
+        torchaudio.save(temp_path1,
+                       torchaudio.load(audio1)[0],
+                       torchaudio.load(audio1)[1])
+        torchaudio.save(temp_path2,
+                       torchaudio.load(audio2)[0],
+                       torchaudio.load(audio2)[1])
+        score = verify_speaker(temp_path1, temp_path2)
+        if score is None:
+            return "Error processing audio files. Please ensure they are valid audio recordings."
+        with torch.no_grad():
+            score_tensor = torch.FloatTensor([[score]]).to(device)
+            output = classifier(score_tensor)
+            prediction = torch.argmax(output, dim=1).item()
+        probabilities = torch.softmax(output, dim=1)[0]
+        confidence = probabilities[prediction].item()
+        result = f"""
+        📊 Authentication Results:
+        🔹 Similarity Score: {score:.4f}
+        🔹 Classification: {model_config['labels'][prediction]}
+        🔹 Confidence: {confidence:.4f}
+        Analysis Details:
+        - A similarity score closer to 1.0 indicates higher voice similarity
+        - Classification indicates the model's assessment of the voice comparison
+        - Confidence shows how certain the model is about its classification
+        {'⚠️ Potential Voice Impersonation Detected!' if prediction > 0 else '✅ Authentic Voice Match'}
+        """
+        return result
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+    finally:
+        # Clean up temporary files
+        if os.path.exists(temp_path1):
+            os.remove(temp_path1)
+        if os.path.exists(temp_path2):
+            os.remove(temp_path2)
+css = """
+.gradio-container {
+    font-family: 'IBM Plex Sans', sans-serif;
+}
+.gr-button {
+    color: white;
+    border-radius: 8px;
+    background: linear-gradient(to right, #2125ff, #4146ff);
+    border: none;
+    cursor: pointer;
+}
+.gr-button:hover {
+    background: linear-gradient(to right, #1f23e6, #3b40e6);
+}
+.footer {
+    margin-top: 20px;
+    text-align: center;
+    border-top: 1px solid #ccc;
+    padding-top: 10px;
+}
+"""
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(label="Reference Voice Recording", type="filepath"),
+        gr.Audio(label="Voice Recording to Verify", type="filepath")
+    ],
+    outputs=gr.Textbox(label="Authentication Results"),
+    title="Vexon Voice Authentication",
+    description="""
+    Upload two voice recordings to verify if they are from the same person and detect potential voice impersonation attempts.
+    The system uses a pretrained transformer model fine-tuned on voice similarity scores to:
+    1. Calculate a similarity score between the recordings
+    2. Classify the comparison as Real-Real, Real-Fake, or Impersonation
+    3. Provide a confidence score for the classification
+    Note: For best results, ensure recordings are clear and contain speech content.
+    """,
+    css=css,
+    examples=[
+        ["reference_sample1.wav", "verify_sample1.wav"],
+        ["reference_sample2.wav", "verify_sample2.wav"]
+    ]
+)
+if __name__ == "__main__":
+    demo.launch()