import gradio as gr
# from gradio import ChatMessage
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa
import json
import os
from huggingface_hub import InferenceClient

hf_token = os.getenv("HF_Token")
# def get_token():
#     with open("credentials.json", "r") as f:
#         credentials = json.load(f)
#     return credentials['token']
# hf_token = get_token()

client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token)


def chat(audio, chat: list, asr_model: str):
    if asr_model == "openai/whisper-large-v2":
        transcription = transcribe_whisper_large_v2(audio)
    elif asr_model == "openai/whisper-tiny.en":
        transcription = transcribe_whisper_tiny_en(audio)
    else:
        raise ValueError(f"No model found for the given choice: {asr_model}")
    chat.append({'role': 'user', 'content': transcription})
    response = client.chat_completion(
        messages=chat,
        max_tokens=500,
        stream=False,
    ).choices[0].message.content
    chat.append({'role': 'assistant', 'content': response})
    return chat


def transcribe_whisper_large_v2(audio):
    sr, audio = audio
    audio = audio.astype(np.float32)
    # Gradio delivers raw int16 samples; rescale to the [-1, 1] float range
    # that librosa and the Whisper feature extractor expect.
    if np.abs(audio).max() > 1.0:
        audio /= 32768.0
    # Downmix stereo to mono (a stereo clip has shape (n_samples, 2)).
    if audio.ndim > 1 and audio.shape[1] > 1:
        audio = np.mean(audio, axis=1)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    # Skip special tokens so the normalizer receives plain text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription


def transcribe_whisper_tiny_en(audio):
    sr, audio = audio
    audio = audio.astype(np.float32)
    if np.abs(audio).max() > 1.0:
        audio /= 32768.0
    if audio.ndim > 1 and audio.shape[1] > 1:
        audio = np.mean(audio, axis=1)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = processor.tokenizer.normalize(transcription[0])
    return transcription


def load_model(asr_model_choice: str):
    global processor
    global model
    global model_flag
    if asr_model_choice == "openai/whisper-large-v2":
        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
        model.config.forced_decoder_ids = None
        model_flag = "openai/whisper-large-v2"
    elif asr_model_choice == "openai/whisper-tiny.en":
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
        model_flag = "openai/whisper-tiny.en"
    print("Model loaded:", model_flag)


# Load the default model once at startup; otherwise `processor` and `model`
# are undefined until the user changes the radio selection in Settings.
load_model("openai/whisper-tiny.en")
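# For reference: Gradio's microphone component hands `chat()` a
# (sample_rate, samples) tuple of raw numpy audio. An illustrative example
# (shape and dtype only, not real speech) of what the helpers above accept:
#   one_second_of_silence = (48000, np.zeros(48000, dtype=np.int16))
# The helpers downmix to mono, rescale to [-1, 1], and resample to the
# 16 kHz audio that Whisper models are trained on.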
with gr.Blocks() as app:
    gr.Markdown("# VoiceBot")
    gr.Markdown("Welcome to VoiceBot 👋, here is how it works")
    gr.Markdown("This bot can only be interacted with through your voice. Press record and say something; after you stop recording, your audio is processed automatically. You can choose between different models. The model you choose affects how well the bot understands what you said. Better performance also comes with a longer wait. 😕")
    gr.Markdown("Have fun playing around 🎉")
    gr.Markdown("If you have any wishes for models or an idea, feel free to let me know 🙌")
    chatbot = gr.Chatbot(
        value=[{
            'role': 'system',
            'content': 'You are a helpful assistant for an audio-based chatbot. You are helping users to order their notes and thoughts.'
        }],
        bubble_full_width=False,
        type="messages"
    )
    with gr.Row():
        audio_input = gr.Audio(
            sources=['microphone'],
            interactive=True,
            scale=8
        )
    with gr.Accordion(label="Settings", open=False):
        asr_model_choice = gr.Radio(
            label="Select ASR Model",
            choices=["openai/whisper-large-v2", "openai/whisper-tiny.en"],
            value="openai/whisper-tiny.en"
        )
        asr_model_choice.change(load_model, asr_model_choice)
    # Event listener for when the audio recording stops
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=chatbot)

app.launch()
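# A minimal sketch of how to run this app locally (assumes the HF_Token
# environment variable holds a Hugging Face access token with access to
# meta-llama/Meta-Llama-3-8B-Instruct, and that this file is saved as
# app.py; the package names below are the usual ones, verify them against
# your environment):
#   pip install gradio transformers librosa huggingface_hub torch
#   export HF_Token=hf_...   # your token
#   python app.py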