import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import torchaudio
# Load the model from the Hugging Face Model Hub
model_name = "Mrkomiljon/voiceGUARD"  # Repo id of the model on the Hugging Face Model Hub
# If the checkpoint lives in a subfolder of the repo (e.g. "wav2vec2_finetuned_model"),
# pass subfolder="wav2vec2_finetuned_model" to both from_pretrained calls.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define label mapping
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavnet",
    6: "wavernn",
}
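# Note: the indices and label strings above follow the fine-tuned model's config.
# "Real" (index 3) marks genuine speech; the remaining classes name the neural
# vocoders assumed to have generated the synthetic audio.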
# Define the prediction function
def predict_audio(file_path):
    target_sample_rate = 16000  # Model's expected sample rate
    max_length = target_sample_rate * 10  # 10 seconds in samples
    try:
        # Load the audio file (Gradio passes a temp-file path when type="filepath")
        waveform, sample_rate = torchaudio.load(file_path)
        # Resample if the sample rate doesn't match the model's expected rate
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        # Truncate or pad the waveform to ensure consistent input length
        if waveform.size(1) > max_length:
            waveform = waveform[:, :max_length]  # Truncate
        elif waveform.size(1) < max_length:
            waveform = torch.nn.functional.pad(waveform, (0, max_length - waveform.size(1)))  # Pad
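        # Collapse multi-channel audio to mono by keeping the first channel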
        if waveform.ndim > 1:
            waveform = waveform[0]

        # Process the audio file
        inputs = processor(
            waveform.squeeze().numpy(),
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs["input_values"].to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()
        # Map label to class name
        class_name = id2label.get(predicted_label, "Unknown Class")
        # Return one value per output component: class name and confidence
        return class_name, f"{confidence * 100:.2f}%"
    except Exception as e:
        return "Error", f"Error processing the audio file: {e}"
# Create the Gradio interface
iface = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Label(label="Predicted Class"),
        gr.Label(label="Confidence"),
    ],
    title="Audio Classification with Wav2Vec2",
    description="Upload an audio file to classify it into one of the predefined categories.",
)
# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
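
# Minimal local smoke test (a sketch; "sample.wav" is a placeholder path, not a
# file shipped with this Space). It bypasses the Gradio UI and calls the
# prediction function directly:
#
#     label, confidence = predict_audio("sample.wav")
#     print(label, confidence)  # e.g. "Real 97.42%"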