import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import torchaudio

# Load the fine-tuned model from the Hugging Face Model Hub.
# A Hub repo ID may contain at most one "/", so the trailing path
# segment is passed as a subfolder within the repo.
model_name = "Mrkomiljon/voiceGUARD"
subfolder = "wav2vec2_finetuned_model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, subfolder=subfolder)
processor = Wav2Vec2Processor.from_pretrained(model_name, subfolder=subfolder)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Map class indices to human-readable source names
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavenet",
    6: "wavernn",
}

# Define the prediction function
def predict_audio(file_path):
    target_sample_rate = 16000            # Model's expected sample rate
    max_length = target_sample_rate * 10  # 10 seconds in samples
    try:
        # Load the audio file from the path Gradio provides
        waveform, sample_rate = torchaudio.load(file_path)

        # Resample if the sample rate doesn't match the model's expected rate
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_sample_rate
            )
            waveform = resampler(waveform)

        # Truncate or pad the waveform to a fixed 10-second input length
        if waveform.size(1) > max_length:
            waveform = waveform[:, :max_length]  # Truncate
        elif waveform.size(1) < max_length:
            waveform = torch.nn.functional.pad(
                waveform, (0, max_length - waveform.size(1))
            )  # Pad with zeros

        # Keep only the first channel if the audio is multi-channel
        if waveform.ndim > 1:
            waveform = waveform[0]

        # Convert the waveform to model inputs
        inputs = processor(
            waveform.numpy(),
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs["input_values"].to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()

        # Map the predicted index to its class name
        class_name = id2label.get(predicted_label, "Unknown Class")
        # Return one value per Gradio output component
        return class_name, f"{confidence * 100:.2f}%"

    except Exception as e:
        return "Error", f"Error processing the audio file: {e}"

# Create the Gradio interface
iface = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(type="filepath"),  # "file" is not a valid type; Gradio passes a filepath string
    outputs=[
        gr.Label(label="Predicted Class"),
        gr.Label(label="Confidence"),
    ],
    title="Audio Classification with Wav2Vec2",
    description="Upload an audio file to classify it into one of the predefined categories.",
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
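
# --- Usage sketch (not part of the original app) ---
# A minimal way to exercise predict_audio without the Gradio UI, assuming a
# local audio file exists; "sample.wav" is a hypothetical placeholder path:
#
#     class_name, confidence = predict_audio("sample.wav")
#     print(class_name, confidence)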