import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import torchaudio

# Checkpoint identifier handed to `from_pretrained` for both the classifier
# and its processor.
model_name = "Mrkomiljon/voiceGUARD"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the classifier and its input processor, then move the classifier to
# the selected device and switch it to evaluation mode.
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model.to(device)
model.eval()

# Class mapping: predicted class index -> class name shown to the user.
# NOTE(review): presumably this order must match the label order the
# checkpoint was trained with -- confirm against the model configuration.
id2label = dict(
    enumerate(
        (
            "diffwave",
            "melgan",
            "parallel_wave_gan",
            "Real",
            "wavegrad",
            "wavnet",
            "wavernn",
        )
    )
)

# Prediction function
def predict_audio(file_path):
    """Classify an audio file into one of the classes listed in ``id2label``.

    The first channel of the audio is resampled to 16 kHz if needed, then
    truncated or zero-padded to exactly 10 seconds before being passed to
    the processor and the classifier.

    Args:
        file_path: Path of the audio file to classify. ``None`` or an empty
            path (nothing was uploaded) is reported as an error.

    Returns:
        A ``(class_name, confidence)`` tuple, where ``confidence`` is the
        softmax probability of the predicted class as a ``float``. On any
        failure the function does not raise; it returns
        ``("Error", message)`` so the message can be displayed in the UI.
    """
    target_sample_rate = 16000
    max_length = target_sample_rate * 10  # 10 seconds worth of samples

    # Nothing to classify: give a clear message instead of surfacing the
    # audio loader's internal error for a missing path.
    if not file_path:
        return "Error", "No audio file provided."

    try:
        # Load the audio file
        waveform, sample_rate = torchaudio.load(file_path)

        # Keep only the first channel. Doing this before resampling and
        # padding avoids processing channels that are discarded anyway.
        if waveform.ndim > 1:
            waveform = waveform[0]

        # Resample if the file's sample rate differs from the target rate
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=target_sample_rate,
            )
            waveform = resampler(waveform)

        # Truncate to at most max_length samples, then zero-pad on the
        # right if the clip is shorter than that.
        waveform = waveform[:max_length]
        missing = max_length - waveform.size(0)
        if missing > 0:
            waveform = torch.nn.functional.pad(waveform, (0, missing))

        # Preprocess input
        inputs = processor(
            waveform.numpy(),
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True,
        )
        input_values = inputs["input_values"].to(device)

        # Inference (no gradients needed)
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()

        class_name = id2label.get(predicted_label, "Unknown Class")

        # Return the two values separately, one per output component
        return class_name, float(confidence)

    except Exception as e:
        # UI boundary: report the failure as output values rather than
        # letting the exception propagate.
        return "Error", str(e)

# Gradio interface: one audio input and two label outputs (predicted class
# and confidence), wired to `predict_audio`.
# type="filepath" makes the callback receive a path to the audio file.
_audio_input = gr.Audio(type="filepath")
_result_outputs = [
    gr.Label(label="Predicted Class"),
    gr.Label(label="Confidence"),
]

iface = gr.Interface(
    title="Human or AI-generated voice classification",
    description="Upload an audio file to classify it into one of the predefined categories.",
    fn=predict_audio,
    inputs=_audio_input,
    outputs=_result_outputs,
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()