"""Gradio app: transcribe uploaded audio files with the Quantum_STT model.

Accepts a single uploaded audio file, converts it to WAV when needed
(via pydub/ffmpeg), and runs it through a Hugging Face ASR pipeline.
"""

import os
import tempfile

import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# Model ID from Hugging Face
model_id = "Quantamhash/Quantum_STT"

# Formats accepted by the UI (anything pydub/ffmpeg can decode here).
SUPPORTED_EXTENSIONS = (
    ".caf", ".au", ".opus", ".amr", ".alac", ".aiff", ".wma",
    ".m4a", ".ogg", ".aac", ".flac", ".wav", ".mp3",
)

# Load the speech recognition pipeline once at startup (model download may
# take a while on first run).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    generate_kwargs={"language": "en", "task": "transcribe"},
    tokenizer=model_id,
)


def convert_to_wav(input_path):
    """Convert *input_path* to a temporary ``.wav`` file and return its path.

    The caller is responsible for deleting the returned file when done.

    Raises whatever ``pydub.AudioSegment.from_file`` raises on an
    unreadable/corrupt input (typically ``pydub.exceptions`` / ``OSError``).
    """
    audio = AudioSegment.from_file(input_path)
    # delete=False so the file outlives this context manager — the ASR
    # pipeline reads it afterwards; transcribe() removes it in a finally.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        audio.export(temp_wav.name, format="wav")
        return temp_wav.name


def transcribe(audio):
    """Transcribe an uploaded audio file.

    Parameters
    ----------
    audio : str | None
        Filesystem path supplied by ``gr.Audio(type="filepath")``, or
        ``None`` when nothing was uploaded.

    Returns
    -------
    str
        The transcription text, or a human-readable error message.
    """
    if audio is None:
        return "Please upload an audio file."

    # Validate file extension before handing the file to ffmpeg.
    ext = os.path.splitext(audio)[1].lower()
    if ext not in SUPPORTED_EXTENSIONS:
        return f"❌ Unsupported file format: {ext}. Please upload .caf, .au, .opus, .amr, .alac, .aiff, .wma, .m4a, .ogg, .aac, .flac, .wav or .mp3 files."

    # Convert to .wav if necessary; track the temp path so it can be removed.
    temp_path = None
    if ext != ".wav":
        try:
            audio = temp_path = convert_to_wav(audio)
        except Exception as e:
            # BUG FIX: conversion errors previously escaped the handler and
            # crashed the request instead of returning a message.
            return f"Error processing audio file: {str(e)}"

    try:
        result = pipe(audio)
        return result["text"]
    except ValueError as e:
        return f"Error processing audio file: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
    finally:
        # BUG FIX: the original leaked one temporary .wav per converted
        # upload; clean it up regardless of success or failure.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; nothing useful to report to the user


# Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        type="filepath",      # hand the handler a file path, not raw samples
        sources=["upload"],   # restrict to file upload (not mic)
        label="🎵 Upload Audio File",
    ),
    outputs=gr.Textbox(label="📝 Transcription"),
    title="🎙️ Quantum Speech Recognizer",
    description=(
        "Upload an audio file (.caf, .au, .opus, .amr, .alac, .aiff, .wma, "
        ".m4a, .ogg, .aac, .flac, .wav, .mp3)\n"
        "***to transcribe it using the Quantum_STT model***."
    ),
)

# Launch the interface (only when run as a script, not on import).
if __name__ == "__main__":
    interface.launch()