"""Gradio app: transcribe uploaded audio files with the Quantum_STT model.

Accepts a single uploaded audio file, converts it to WAV when needed
(via pydub/ffmpeg), and runs it through a Hugging Face ASR pipeline.
"""

import os
import tempfile

import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# Model ID from Hugging Face
model_id = "Quantamhash/Quantum_STT"

# Formats accepted by the UI (anything pydub/ffmpeg can decode here).
SUPPORTED_EXTENSIONS = (
    ".caf", ".au", ".opus", ".amr", ".alac", ".aiff", ".wma",
    ".m4a", ".ogg", ".aac", ".flac", ".wav", ".mp3",
)

# Load the speech recognition pipeline once at startup (model download may
# take a while on first run).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    generate_kwargs={"language": "en", "task": "transcribe"},
    tokenizer=model_id,
)


def convert_to_wav(input_path):
    """Convert *input_path* to a temporary ``.wav`` file and return its path.

    The caller is responsible for deleting the returned file when done.

    Raises whatever ``pydub.AudioSegment.from_file`` raises on an
    unreadable/corrupt input (typically ``pydub.exceptions`` / ``OSError``).
    """
    audio = AudioSegment.from_file(input_path)
    # delete=False so the file outlives this context manager — the ASR
    # pipeline reads it afterwards; transcribe() removes it in a finally.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
        audio.export(temp_wav.name, format="wav")
        return temp_wav.name


def transcribe(audio):
    """Transcribe an uploaded audio file.

    Parameters
    ----------
    audio : str | None
        Filesystem path supplied by ``gr.Audio(type="filepath")``, or
        ``None`` when nothing was uploaded.

    Returns
    -------
    str
        The transcription text, or a human-readable error message.
    """
    if audio is None:
        return "Please upload an audio file."

    # Validate file extension before handing the file to ffmpeg.
    ext = os.path.splitext(audio)[1].lower()
    if ext not in SUPPORTED_EXTENSIONS:
        return f"❌ Unsupported file format: {ext}. Please upload .caf, .au, .opus, .amr, .alac, .aiff, .wma, .m4a, .ogg, .aac, .flac, .wav or .mp3 files."

    # Convert to .wav if necessary; track the temp path so it can be removed.
    temp_path = None
    if ext != ".wav":
        try:
            audio = temp_path = convert_to_wav(audio)
        except Exception as e:
            # BUG FIX: conversion errors previously escaped the handler and
            # crashed the request instead of returning a message.
            return f"Error processing audio file: {str(e)}"

    try:
        result = pipe(audio)
        return result["text"]
    except ValueError as e:
        return f"Error processing audio file: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
    finally:
        # BUG FIX: the original leaked one temporary .wav per converted
        # upload; clean it up regardless of success or failure.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; nothing useful to report to the user


# Gradio interface
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        type="filepath",      # hand the handler a file path, not raw samples
        sources=["upload"],   # restrict to file upload (not mic)
        label="🎵 Upload Audio File",
    ),
    outputs=gr.Textbox(label="📝 Transcription"),
    title="🎙️ Quantum Speech Recognizer",
    description=(
        "Upload an audio file (.caf, .au, .opus, .amr, .alac, .aiff, .wma, "
        ".m4a, .ogg, .aac, .flac, .wav, .mp3)\n"
        "***to transcribe it using the Quantum_STT model***."
    ),
)

# Launch the interface (only when run as a script, not on import).
if __name__ == "__main__":
    interface.launch()