Spaces:

Quantamhash
/

Quantum_STT-V1

Running on Zero

File size: 2,038 Bytes

f1b1f12
6c21bf5
fea2ffd
f1b1f12
 
676da5b
ee04627
676da5b
f1b1f12
676da5b
ee04627
676da5b
 
 
 
ee04627
676da5b
 
f1b1f12
 
 
 
 
 
676da5b
 
 
 
 
 
 
b9b4dea
 
676da5b
f1b1f12
 
 
 
 
 
 
 
 
 
 
676da5b
ee04627
676da5b
 
 
 
 
 
 
 
 
 
b9b4dea
676da5b
 
 
ee04627

import gradio as gr
from transformers import pipeline
import os
from pydub import AudioSegment
import tempfile


# Hugging Face model identifier; used for both the model and the tokenizer.
model_id = "Quantamhash/Quantum_STT"

# Build the speech-recognition pipeline once at import time so that every
# request handled by this module reuses the same loaded pipeline object.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
    tokenizer=model_id,
    generate_kwargs=dict(language="en", task="transcribe"),
)

def convert_to_wav(input_path):
    """Decode an audio file with pydub and re-encode it as a temporary WAV.

    Args:
        input_path: Path of the source audio file, in any format that
            ``AudioSegment.from_file`` is able to decode.

    Returns:
        Path of a newly created ``.wav`` file. The file is created with
        ``delete=False``, so it is NOT removed automatically; the caller is
        responsible for deleting it once it is no longer needed.

    Raises:
        Exception: Any error raised while decoding or exporting is
            propagated unchanged. If the export step fails, the temporary
            file is removed before the error is re-raised.
    """
    audio = AudioSegment.from_file(input_path)

    # Reserve a unique path, then release our own handle *before* exporting.
    # Re-opening a NamedTemporaryFile by name while it is still open is not
    # portable (it fails on Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        wav_path = temp_wav.name

    try:
        # export() returns the open output file object; close it explicitly
        # instead of leaving the handle for the garbage collector.
        audio.export(wav_path, format="wav").close()
    except Exception:
        # Do not leave a half-written temporary file behind on failure.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path

# Transcription function with format check
def transcribe(audio):
    """Transcribe an uploaded audio file and return the recognized text.

    Args:
        audio: Filesystem path of the uploaded audio file, or ``None`` when
            nothing was uploaded.

    Returns:
        The ``"text"`` field of the pipeline result on success; otherwise a
        human-readable message describing why transcription was not possible
        (no file, unsupported extension, or a processing error).
    """
    if audio is None:
        return "Please upload an audio file."

    supported_exts = (
        ".caf", ".au", ".opus", ".amr", ".alac", ".aiff", ".wma",
        ".m4a", ".ogg", ".aac", ".flac", ".wav", ".mp3",
    )

    # Validate by file extension only; the content itself is not inspected.
    ext = os.path.splitext(audio)[1].lower()
    if ext not in supported_exts:
        return f"❌ Unsupported file format: {ext}. Please upload .caf, .au, .opus, .amr, .alac, .aiff, .wma, .m4a, .ogg, .aac, .flac, .wav or .mp3 files."

    # Path of the temporary WAV we create (if any), so it can be cleaned up.
    temp_wav = None
    try:
        # Conversion happens inside the try so that a decoding failure is
        # reported as a message, like every other failure in this function,
        # instead of escaping as a raw exception.
        if ext != ".wav":
            temp_wav = convert_to_wav(audio)
            audio = temp_wav

        result = pipe(audio)
        return result["text"]
    except ValueError as e:
        return f"Error processing audio file: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
    finally:
        # Remove the converted copy so temp files do not pile up per request.
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # transcription result or error message being returned.
                pass


# Input widget: upload-only audio (no microphone), passed to the handler as a
# file path.
audio_input = gr.Audio(
    label="🎵 Upload Audio File",
    sources=["upload"],
    type="filepath",
)

# Output widget: plain text box that shows the transcription result.
transcript_output = gr.Textbox(label="📝 Transcription")

# Wire the transcription function to the input and output widgets.
interface = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=transcript_output,
    title="🎙️ Quantum Speech Recognizer",
    description="Upload an audio file (.caf, .au, .opus, .amr, .alac, .aiff, .wma, .m4a, .ogg, .aac, .flac, .wav, .mp3)<br>***to transcribe it using the Quantum_STT model***.",
)

# Start serving the app.
interface.launch()