# Spaces: Running
import io
import os

import gradio as gr
import torch
import torchaudio
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.responses import JSONResponse
from speechbrain.inference import SpeakerRecognition
# Speaker-verification model shared by every comparison; it is created once,
# when this module is imported.
_MODEL_SOURCE = "speechbrain/spkrec-ecapa-voxceleb"  # pretrained model identifier
_MODEL_SAVEDIR = "tmp_model"  # handed to from_hparams as `savedir`
speaker_verification = SpeakerRecognition.from_hparams(
    source=_MODEL_SOURCE,
    savedir=_MODEL_SAVEDIR,
)
def _as_batch_tensor(audio, sample_rate, target_rate=16000):
    """Return *audio* as a float32 tensor shaped (batch, samples) at *target_rate*."""
    wave = torch.as_tensor(audio)
    if wave.ndim not in (1, 2) or wave.numel() == 0:
        raise ValueError(
            f"expected non-empty 1-D or 2-D audio, got shape {tuple(wave.shape)}"
        )
    if wave.is_floating_point():
        wave = wave.to(torch.float32)
    else:
        # Integer PCM (e.g. int16) is rescaled to roughly [-1.0, 1.0].
        full_scale = float(torch.iinfo(wave.dtype).max)
        wave = wave.to(torch.float32) / full_scale
    if wave.ndim == 2 and wave.shape[0] > wave.shape[1]:
        # More rows than columns is read as (samples, channels): downmix to
        # mono so the channel axis is not mistaken for the time axis.
        wave = wave.mean(dim=1)
    if wave.ndim == 1:
        wave = wave.unsqueeze(0)
    if int(sample_rate) != target_rate:
        wave = torchaudio.functional.resample(wave, int(sample_rate), target_rate)
    return wave


# Function to calculate similarity score
def get_similarity(audio1, audio2, sample_rate=16000):
    """Score how likely two recordings come from the same speaker.

    Args:
        audio1: First recording as an array or tensor. Accepted layouts are
            1-D ``(samples,)``, ``(samples, channels)`` (downmixed to mono),
            or an already batched ``(1, samples)``.
        audio2: Second recording, same accepted layouts as ``audio1``.
        sample_rate: Sample rate of BOTH recordings in Hz. Anything other
            than 16000 is resampled to 16 kHz before verification.

    Returns:
        ``(score, "Yes" | "No")`` on success. On any failure the function
        does not raise; it returns ``(None, error_message)`` so callers can
        test ``score is None``.
    """
    try:
        batch1 = _as_batch_tensor(audio1, sample_rate)
        batch2 = _as_batch_tensor(audio2, sample_rate)
        # Get similarity score and prediction
        score, prediction = speaker_verification.verify_batch(batch1, batch2)
        return float(score), "Yes" if prediction else "No"
    except Exception as e:
        # Deliberate best-effort contract: report the failure as text.
        return None, str(e)
def _clip_to_mono_16k(clip, target_rate=16000):
    """Convert one Gradio audio value to mono float32 samples at *target_rate*."""
    first, second = clip
    if hasattr(first, "__len__"):
        # Legacy (data, sample_rate) order, kept working for direct callers.
        samples, rate = first, second
    else:
        # Gradio's documented order for type="numpy": (sample_rate, data).
        rate, samples = first, second
    wave = torch.as_tensor(samples)
    if wave.is_floating_point():
        wave = wave.to(torch.float32)
    else:
        # Integer PCM (e.g. int16) is rescaled to roughly [-1.0, 1.0].
        full_scale = float(torch.iinfo(wave.dtype).max)
        wave = wave.to(torch.float32) / full_scale
    if wave.ndim == 2:
        # Multi-channel audio arrives as (samples, channels); downmix to mono.
        wave = wave.mean(dim=1)
    if int(rate) != target_rate:
        wave = torchaudio.functional.resample(wave, int(rate), target_rate)
    return wave.numpy()


# API function to compare voices
def compare_voices(file1, file2):
    """Compare two Gradio audio inputs and report whether the speaker matches.

    Args:
        file1: First clip as delivered by ``gr.Audio(type="numpy")``, i.e. a
            ``(sample_rate, data)`` tuple, or ``None`` when nothing was given.
        file2: Second clip, same format as ``file1``.

    Returns:
        ``{"Similarity Score": "<4-decimal score>", "Same User Prediction":
        "Yes" | "No"}`` on success, otherwise ``{"error": "<message>"}``.
        The function never raises.
    """
    if file1 is None or file2 is None:
        return {"error": "Both audio inputs are required."}
    try:
        # Each clip is brought to mono 16 kHz on its own, because the two
        # inputs may have been recorded at different sample rates.
        audio1 = _clip_to_mono_16k(file1)
        audio2 = _clip_to_mono_16k(file2)
        score, is_same_user = get_similarity(audio1, audio2)
    except Exception as e:
        # Handle unexpected errors
        return {"error": str(e)}
    if score is None:
        # get_similarity reports failures as (None, message).
        return {"error": is_same_user}
    return {"Similarity Score": f"{score:.4f}", "Same User Prediction": is_same_user}
# FastAPI application instance; this is the object passed to uvicorn.run()
# in the __main__ guard at the bottom of the file.
app = FastAPI()
async def compare_voices_api(file1: UploadFile = File(...), file2: UploadFile = File(...)):
    """
    Compare two uploaded audio files and return the similarity score and prediction.

    Both uploads are decoded in memory (nothing is written to disk), downmixed
    to mono, resampled to 16 kHz and passed to ``get_similarity``.

    Returns:
        ``{"Similarity Score": "<4-decimal score>", "Same User Prediction":
        "Yes" | "No"}``.

    Raises:
        HTTPException: status 400 with the failure text as ``detail`` when an
            upload is empty, cannot be decoded, or verification fails.

    NOTE(review): no route decorator is visible on this function in the
    reviewed text, so as shown it is not registered on ``app`` -- confirm
    against the real file.
    """
    try:
        waveforms = []
        for upload in (file1, file2):
            payload = await upload.read()
            if not payload:
                raise ValueError(f"Uploaded file {upload.filename!r} is empty.")
            # torchaudio.load returns a (channels, samples) tensor and the
            # file's own sample rate.
            wave, rate = torchaudio.load(io.BytesIO(payload))
            wave = wave.mean(dim=0)
            if rate != 16000:
                wave = torchaudio.functional.resample(wave, rate, 16000)
            waveforms.append(wave.numpy())
        score, is_same_user = get_similarity(waveforms[0], waveforms[1])
        if score is None:
            # get_similarity reports failures as (None, message).
            raise ValueError(is_same_user)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    return {"Similarity Score": f"{score:.4f}", "Same User Prediction": is_same_user}
# Gradio interface function
def gradio_interface():
    """Build (but do not launch) the Gradio UI around ``compare_voices``.

    Returns:
        A ``gr.Interface`` with two numpy-typed audio inputs and a JSON output.
    """
    clip_labels = ("First Audio File", "Second Audio File")
    audio_inputs = [gr.Audio(type="numpy", label=text) for text in clip_labels]
    return gr.Interface(
        fn=compare_voices,
        inputs=audio_inputs,
        outputs="json",  # results are shown as JSON
        live=False,
    )
# Launch Gradio as a web interface
async def startup():
    """Launch the Gradio comparison UI with ``share=True`` and ``inline=True``.

    NOTE(review): nothing in the reviewed text calls this coroutine or
    registers it as a startup hook -- confirm how it is meant to run.
    NOTE(review): ``launch()`` is called without ``prevent_thread_lock``;
    check the Gradio docs on whether that blocks the caller, which would
    matter inside a coroutine.
    """
    # Reuse the shared builder rather than spelling out the Interface again.
    gradio_interface().launch(share=True, inline=True)
# Script entry point: serve the FastAPI app with uvicorn.
if __name__ == "__main__":
    import uvicorn

    # Listen on every interface, port 5000.
    bind_options = {"host": "0.0.0.0", "port": 5000}
    uvicorn.run(app, **bind_options)