from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
from fastrtc import ReplyOnPause, Stream, get_stt_model, get_tts_model
import numpy as np
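
# Voice chatbot demo: microphone audio is transcribed with fastrtc's STT
# model, answered by a DialoGPT text-generation pipeline, synthesized back to
# speech with fastrtc's TTS model, and served through a Gradio interface.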
# Alternative: a larger instruction-tuned model (gated; requires access to
# meta-llama/Llama-3.2-3B-Instruct on the Hugging Face Hub):
# pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-3B-Instruct")
# pipe([{"role": "user", "content": "Who are you?"}])
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Lightweight conversational model that generates the chatbot's replies.
chatbot = pipeline("text-generation", model="microsoft/DialoGPT-medium")
# Default speech-to-text and text-to-speech models bundled with fastrtc.
stt_model = get_stt_model()
tts_model = get_tts_model()
def echo(audio):
    # Transcribe the incoming (sample_rate, numpy array) audio to text.
    prompt = stt_model.stt(audio)
    # Alternative: route the transcript through a hosted LLM instead
    # (requires a configured sambanova_client):
    # response = sambanova_client.chat.completions.create(
    #     model="Meta-Llama-3.2-3B-Instruct",
    #     messages=[{"role": "user", "content": prompt}],
    #     max_tokens=200,
    # )
    # bot_response = response.choices[0].message.content
    bot_response = chat_with_bot(prompt)
    # stream_tts_sync yields (sample_rate, chunk) tuples; collect every chunk
    # (not just the first) and join them into a single reply clip.
    sample_rate = None
    chunks = []
    for chunk_rate, chunk in tts_model.stream_tts_sync(bot_response):
        sample_rate = chunk_rate
        chunks.append(chunk)
    return sample_rate, np.concatenate(chunks, axis=-1)
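
# Alternative serving path (a sketch): fastrtc's Stream can host the handler
# directly, with ReplyOnPause invoking it whenever the speaker pauses. A
# generator handler lets TTS chunks stream out as they are produced:
# def echo_streaming(audio):
#     prompt = stt_model.stt(audio)
#     for audio_chunk in tts_model.stream_tts_sync(chat_with_bot(prompt)):
#         yield audio_chunk
# stream = Stream(ReplyOnPause(echo_streaming), modality="audio", mode="send-receive")
# stream.ui.launch()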
def process_audio(audio_input):
    # Gradio delivers microphone audio as a (sample_rate, numpy array) tuple.
    sample_rate, audio_data = audio_input
    # Run the full STT -> chatbot -> TTS pipeline; echo returns its own
    # (sample_rate, numpy array) tuple, which is what gr.Audio expects back.
    out_rate, out_audio = echo((sample_rate, audio_data))
    return out_rate, out_audio
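
# Hypothetical smoke test without the UI (one second of silence at 16 kHz;
# real speech input is needed for a meaningful transcript):
# sr, reply = process_audio((16000, np.zeros(16000, dtype=np.int16)))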
def chat_with_bot(user_input):
    # Generate a reply; return_full_text=False strips the echoed prompt so
    # only the newly generated text gets spoken.
    response = chatbot(user_input, max_new_tokens=100, return_full_text=False)
    return response[0]['generated_text']
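
# For reference, the text-generation pipeline returns a list of dicts, e.g.
# (illustrative output):
# chatbot("Hello!", max_new_tokens=100, return_full_text=False)
# -> [{'generated_text': "Hi! How are you doing today?"}]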
# Alternative text-only UI, handy for debugging the chatbot without audio:
# interface = gr.Interface(
#     fn=chat_with_bot,                               # Function to call for processing the input
#     inputs=gr.Textbox(label="Enter your message"),  # User input (text)
#     outputs=gr.Textbox(label="Chatbot Response"),   # Model output (text)
#     title="Chat with DialoGPT",
#     description="Chat with an AI model powered by DialoGPT!",
# )
interface = gr.Interface(
    fn=process_audio,                # Function that processes each recording
    inputs=gr.Audio(type="numpy"),   # Audio input as (sample_rate, numpy array)
    outputs=gr.Audio(type="numpy"),  # Synthesized reply audio
    live=True,                       # Re-run automatically as the input changes
)

interface.launch()
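
# Run with `python app.py` (assumed filename) and open the printed local URL;
# the first run downloads the DialoGPT, STT, and TTS model weights.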