"""Audio chatbot: records speech, transcribes it with Whisper large-v2,
and replies via the hosted Meta-Llama-3-8B-Instruct chat-completion API."""

import os

import gradio as gr
import librosa
import numpy as np
from huggingface_hub import InferenceClient
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Hugging Face access token, read from the environment (never hard-coded).
hf_token = os.getenv("HF_Token")

# Hosted LLM used to generate the assistant's replies.
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token,
)

# Whisper large-v2 for speech-to-text.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
# Let generate() choose task/language instead of forcing decoder ids.
model.config.forced_decoder_ids = None


def chat(audio, chat: list):
    """Transcribe one audio clip, append it as a user turn, and fetch a reply.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` pair as produced by ``gr.Audio``.
    chat : list
        Chat history as OpenAI-style message dicts (``role``/``content``).
        Mutated in place.

    Returns
    -------
    list
        The updated history including the new user and assistant turns.
    """
    transcription = transcribe(audio)
    chat.append({"role": "user", "content": transcription})
    response = client.chat_completion(
        messages=chat,
        max_tokens=500,
        stream=False,
    ).choices[0].message.content
    chat.append({"role": "assistant", "content": response})
    return chat


def transcribe(audio):
    """Convert a ``(sample_rate, samples)`` pair to normalized text.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        Raw recording from ``gr.Audio``: the sample rate and a PCM array,
        either 1-D mono or 2-D ``(n_samples, n_channels)``.

    Returns
    -------
    str
        The transcription, lower-cased/cleaned by the Whisper normalizer.
    """
    sr, samples = audio
    # Gradio delivers integer PCM (typically int16); Whisper/librosa expect
    # float samples in [-1, 1], so scale by the dtype's full range.
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    # Down-mix multi-channel (n_samples, n_channels) recordings to mono.
    # (The original checked ndim > 2, which a 2-D stereo array never hits.)
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)
    # Whisper requires 16 kHz input; skip the resample when already there.
    if sr != 16000:
        samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
    input_features = processor(
        samples, sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    # skip_special_tokens=True drops markers like <|startoftranscript|>
    # that would otherwise pollute the chat history.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return processor.tokenizer.normalize(transcription[0])


with gr.Blocks() as app:
    chatbot = gr.Chatbot(
        value=[{
            # Roles must be lowercase for the chat-completion API.
            "role": "system",
            "content": (
                "You are a helpful assistant for an Audio based Chatbot. "
                "You are helping Users to order their notes and thoughts."
            ),
        }],
        bubble_full_width=False,
        type="messages",
    )
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            interactive=True,
            scale=8,
        )
    # When the user stops recording, transcribe and extend the conversation.
    audio_input.stop_recording(
        fn=chat,
        inputs=[audio_input, chatbot],
        outputs=chatbot,
    )

app.launch()