# VoiceBot / app.py
# Author: j-tobias — initial commit e1e27eb
import gradio as gr
# from gradio import ChatMessage
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa
import json
import os
from huggingface_hub import InferenceClient
# Hugging Face API token, read from the "HF_Token" environment variable
# (e.g. a Spaces secret). NOTE(review): env vars are conventionally
# upper-case ("HF_TOKEN") — confirm this matches the deployment config.
hf_token = os.getenv("HF_Token")

# Remote LLM used to generate the assistant replies.
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token)

# Local Whisper model used for speech-to-text transcription.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
# Let generate() choose language/task instead of forcing decoder ids.
model.config.forced_decoder_ids = None
def chat(audio, chat:list):
    """Handle one voice turn: transcribe the recording, append it as a
    user message, query the LLM, and return the extended message list.
    """
    user_text = transcribe(audio)
    chat.append({'role': 'user', 'content': user_text})

    # Single (non-streaming) completion; cap the reply length.
    completion = client.chat_completion(
        messages=chat,
        max_tokens=500,
        stream=False,
    )
    assistant_text = completion.choices[0].message.content
    chat.append({'role': 'assistant', 'content': assistant_text})
    return chat
def transcribe(audio):
    """Transcribe a Gradio audio tuple ``(sample_rate, samples)`` with the
    module-level Whisper model.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        Sample rate and raw samples as produced by ``gr.Audio``. Samples
        may be integer PCM (e.g. int16) and mono ``(n,)`` or
        multi-channel ``(n, channels)``.

    Returns
    -------
    str
        The normalized transcription text.
    """
    sr, samples = audio
    samples = samples.astype(np.float32)
    # Down-mix multi-channel audio to mono.
    # BUG FIX: the original guard was ``len(audio.shape) > 2``, which is
    # never true for stereo (n, 2) arrays (their ndim is 2), so stereo
    # input was passed through unmixed.
    if samples.ndim > 1 and samples.shape[1] > 1:
        samples = np.mean(samples, axis=1)
    # BUG FIX: Gradio microphone input is integer PCM; Whisper expects
    # floats in [-1, 1], so rescale when the signal is outside that range.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 1.0:
        samples = samples / peak
    # Whisper models operate on 16 kHz audio.
    samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
    input_features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
    # The tokenizer normalizer strips special tokens and standardizes text.
    return processor.tokenizer.normalize(transcription[0])
# --- UI --------------------------------------------------------------------
with gr.Blocks() as app:
    chatbot = gr.Chatbot(
        value=[{
            # BUG FIX: the role must be lower-case "system" — the
            # OpenAI-style messages format used by type="messages" and by
            # client.chat_completion only accepts system/user/assistant.
            'role': 'system',
            # Typos fixed: "helpfull assitant" -> "helpful assistant".
            'content': 'You are a helpful assistant for an Audio based Chatbot. You are helping Users to order their notes and thoughts.'
        }],
        bubble_full_width=False,
        type="messages"
    )

    with gr.Row():
        audio_input = gr.Audio(
            sources=['microphone'],
            interactive=True,
            scale=8
        )

    # When a recording stops, transcribe it and extend the conversation.
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)

app.launch()