# butttler / app.py
# David1717 — initial commit (08c317d, verified)
# This Gradio app creates a conversation pipeline that includes speech-to-text using the Whisper model,
# GPT response generation, and text-to-speech using the Google Text-to-Speech API.
# The app uses the microphone input to capture audio, processes it through the pipeline, and returns the GPT response as text and audio.
import gradio as gr
import openai
import whisper
import numpy as np
import os
# Load the Whisper speech-to-text model once at module import.
model = whisper.load_model("base")
# OpenAI API key for GPT. Read from the environment rather than hard-coding a
# secret in source control; the original placeholder remains as the fallback so
# behavior is unchanged when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")
# Speech-to-text: transcribe an audio file with the module-level Whisper model.
def speech_to_text(audio):
    """Return the transcript text for the audio file at path *audio*."""
    transcription = model.transcribe(audio)
    return transcription["text"]
# Function to get GPT response
def gpt_response(text):
    """Send *text* to OpenAI's chat API and return the model's reply.

    Bug fix: ``gpt-3.5-turbo`` is a chat model, so it must be called through
    the ChatCompletion endpoint with a ``messages`` list — the legacy
    ``openai.Completion`` endpoint (with ``engine=``) rejects chat models.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": text}],
        max_tokens=100,
    )
    # Chat responses carry the text under choices[0].message, not choices[0].text.
    return response.choices[0].message["content"].strip()
# Text-to-speech: synthesize the reply with Google Cloud TTS and save it as MP3.
def text_to_speech_google(text):
    """Synthesize *text* to an MP3 file and return the file path."""
    # Imported lazily so the rest of the app works without the GCP client installed.
    from google.cloud import texttospeech

    tts_client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    )
    result = tts_client.synthesize_speech(
        input=synthesis_input, voice=voice_params, audio_config=mp3_config
    )
    # Persist the synthesized audio next to the app so Gradio can serve it.
    output_path = "output.mp3"
    with open(output_path, "wb") as audio_file:
        audio_file.write(result.audio_content)
    return output_path
# Full voice-chat pipeline: microphone audio in, GPT reply (text + audio) out.
def conversation_pipeline(audio):
    """Transcribe *audio*, query GPT with the transcript, and voice the reply.

    Returns a ``(reply_text, reply_audio_path)`` tuple for the Gradio outputs.
    """
    transcript = speech_to_text(audio)          # Step 1: speech -> text
    reply_text = gpt_response(transcript)       # Step 2: text -> GPT reply
    reply_audio = text_to_speech_google(reply_text)  # Step 3: reply -> speech
    return reply_text, reply_audio
# Gradio UI: one microphone input, two outputs (reply text and autoplayed audio).
mic_input = gr.Audio(source="microphone", type="filepath")
reply_textbox = gr.Textbox(label="GPT Response")
reply_player = gr.Audio(label="GPT Response Audio", type="filepath", autoplay=True)

demo = gr.Interface(
    fn=conversation_pipeline,
    inputs=mic_input,
    outputs=[reply_textbox, reply_player],
)
demo.launch(show_error=True)