import gradio as gr
import librosa
import openai

# NOTE(review): star-import is expected to supply OPENAI_API_KEY, MODEL,
# id2label, model, processor, text_respnses and resoponses — the (misspelled)
# names must match constants.py exactly, so they are not renamed here.
from constants import *

openai.api_key = OPENAI_API_KEY


def get_command(command, model, id2label):
    """Classify a user command with a fine-tuned OpenAI completion model.

    Parameters
    ----------
    command : str
        The raw user command to classify.
    model : str
        Name of the fine-tuned OpenAI model to query.
    id2label : dict
        Mapping from predicted class id (int) to label name (str).

    Returns
    -------
    str
        The predicted label, or ``"unknown"`` when the model emits an id
        that is not present in ``id2label``.
    """
    completion = openai.Completion.create(
        model=model,
        prompt=f"{command}->",  # fine-tune prompt format: "<text>->"
        max_tokens=1,  # the class id is a single token
        temperature=0,  # deterministic classification
    )
    # The completion text is the numeric class id; avoid shadowing builtin `id`.
    label_id = int(completion["choices"][0]["text"].strip())
    # Fall back to "unknown" for out-of-vocabulary ids instead of raising.
    return id2label.get(label_id, "unknown")


def transcribe(audio, text):
    """Classify a spoken or typed command and build the machine response.

    If ``text`` is non-empty it is classified directly; otherwise ``audio``
    is transcribed by the speech model first and the transcription is
    classified.

    Parameters
    ----------
    audio : str or None
        Path to the recorded audio file (Gradio ``type="filepath"``).
    text : str
        Optional typed command; takes precedence over the audio input.

    Returns
    -------
    tuple
        ``(transcription, text_response, audio_response_or_None)``.
    """
    if text:
        result = get_command(text, MODEL, id2label)
        return "Text provided by the user", text_respnses[result], None

    # Resample the recording to the 16 kHz rate the speech model expects.
    # `waveform` instead of `input` — don't shadow the builtin.
    waveform, rate = librosa.load(audio, sr=16000)

    # Run the seq2seq speech model to get the text transcription.
    inputs = processor(waveform, sampling_rate=rate, return_tensors="pt")
    generated_ids = model.generate(
        inputs["input_features"], attention_mask=inputs["attention_mask"]
    )
    # BUGFIX: batch_decode returns a *list* of strings; previously the list
    # itself was passed to get_command, producing a prompt like "['hi']->"
    # and showing a list repr in the UI. We sent one example, so take [0].
    transcription = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    result = get_command(transcription, MODEL, id2label)
    # BUGFIX: guard against labels with no registered audio responder —
    # `resoponses.get(result)()` raised TypeError when the key was missing.
    audio_factory = resoponses.get(result)
    audio_res = audio_factory() if audio_factory is not None else None
    return transcription, text_respnses[result], audio_res


if __name__ == "__main__":
    gr.Interface(
        fn=transcribe,
        inputs=[
            gr.Audio(label="", source="microphone", type="filepath"),
            gr.Textbox(label="If you prefer type your command (more accurate)"),
        ],
        outputs=[
            gr.Textbox(
                label="Input Transcription (Please check that this matches what you've said)"
            ),
            gr.Textbox(label="Machine Response (Text Version)"),
            gr.Audio(label="Machine Response (Audio Version)"),
        ],
        allow_flagging="auto",
    ).launch()