File size: 2,038 Bytes
f1b1f12
6c21bf5
fea2ffd
f1b1f12
 
676da5b
ee04627
676da5b
f1b1f12
676da5b
ee04627
676da5b
 
 
 
ee04627
676da5b
 
f1b1f12
 
 
 
 
 
676da5b
 
 
 
 
 
 
b9b4dea
 
676da5b
f1b1f12
 
 
 
 
 
 
 
 
 
 
676da5b
ee04627
676da5b
 
 
 
 
 
 
 
 
 
b9b4dea
676da5b
 
 
ee04627
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from transformers import pipeline
import os
from pydub import AudioSegment
import tempfile


# Hugging Face Hub identifier, used below for both the model and the tokenizer.
model_id = "Quantamhash/Quantum_STT"

# Build the speech-recognition pipeline once at module load; `transcribe`
# calls this object for each uploaded file.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
    tokenizer=model_id,
    # Passed through to generation: English-language transcription.
    generate_kwargs={"language": "en", "task": "transcribe"},
)

def convert_to_wav(input_path):
    """Decode an audio file with pydub and write it to a new temporary WAV file.

    Args:
        input_path: Path of the audio file to read with
            ``AudioSegment.from_file``.

    Returns:
        Path of the newly created ``.wav`` file. The file is NOT removed
        automatically; the caller is responsible for deleting it.

    Raises:
        Any exception raised by ``AudioSegment.from_file`` or
        ``AudioSegment.export`` is propagated. If the export fails, the
        temporary file is removed before re-raising.
    """
    audio = AudioSegment.from_file(input_path)

    # Reserve a unique path and release our handle *before* exporting:
    # re-opening a still-open temporary file by name is platform-dependent.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        # export() returns the file object it wrote to; close it explicitly
        # rather than leaving that to the garbage collector.
        exported = audio.export(wav_path, format="wav")
        exported.close()
    except Exception:
        # Do not leave an empty or partial file behind when export fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path

# Extensions accepted for upload. Kept in one place so the membership check
# and the error message cannot drift apart; order is the order shown to users.
_SUPPORTED_EXTENSIONS = (
    ".caf", ".au", ".opus", ".amr", ".alac", ".aiff", ".wma",
    ".m4a", ".ogg", ".aac", ".flac", ".wav", ".mp3",
)


# Transcription function with format check
def transcribe(audio):
    """Transcribe an uploaded audio file and return the text.

    Args:
        audio: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        The transcription text on success; otherwise a human-readable
        message describing the problem (no upload, unsupported extension,
        or an error raised during conversion or recognition). Errors are
        reported as return values rather than raised.
    """
    if audio is None:
        return "Please upload an audio file."

    # Validate by (case-insensitive) file extension before doing any work.
    ext = os.path.splitext(audio)[1].lower()
    if ext not in _SUPPORTED_EXTENSIONS:
        accepted = (
            ", ".join(_SUPPORTED_EXTENSIONS[:-1])
            + " or "
            + _SUPPORTED_EXTENSIONS[-1]
        )
        return f"❌ Unsupported file format: {ext}. Please upload {accepted} files."

    converted_path = None
    try:
        # Conversion lives inside the try so that decode failures are
        # reported through the same messages as recognition failures.
        if ext != ".wav":
            converted_path = convert_to_wav(audio)
            audio = converted_path

        result = pipe(audio)
        return result["text"]
    except ValueError as e:
        return f"Error processing audio file: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
    finally:
        # Remove the temporary WAV we created (never the user's upload) so
        # repeated requests do not accumulate files on disk.
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not replace the
                # transcription result or the error message being returned.
                pass


# Gradio interface: a single audio-upload input wired to `transcribe`, with the
# result shown in a text box. The emoji in the labels/title are written as real
# Unicode characters (they were previously stored as mis-decoded byte
# sequences and rendered as garbage in the UI).
interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        type="filepath",           # hand `transcribe` a file path
        sources=["upload"],        # restrict to file upload (not mic)
        label="🎵 Upload Audio File"
    ),
    outputs=gr.Textbox(label="📝 Transcription"),
    title="🎙️ Quantum Speech Recognizer",
    description="Upload an audio file (.caf, .au, .opus, .amr, .alac, .aiff, .wma, .m4a, .ogg, .aac, .flac, .wav, .mp3)<br>***to transcribe it using the Quantum_STT model***."
)

# Launch the interface. This call sits at module top level (no
# `if __name__ == "__main__"` guard), so it runs whenever the file is executed
# or imported.
interface.launch()