# Vocode-VoiceAI / app.py
# Author: Kabilash10 — "Update app.py" (commit 630a9eb, verified)
import gradio as gr
import requests
import openai
import asyncio
import os
from deepgram import Deepgram
from vocode.streaming.models.transcriber import (
DeepgramTranscriberConfig,
PunctuationEndpointingConfig,
)
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.helpers import create_streaming_microphone_input_and_speaker_output
# Fetch API keys and voice IDs from environment variables
# (all are expected to be set in the deployment environment; os.getenv
# returns None when missing, which would fail at first API call).
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
# VOICE_ID selects the Eleven Labs voice used by text_to_speech().
VOICE_ID = os.getenv("VOICE_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Initialize OpenAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)
# Initialize Deepgram
deepgram = Deepgram(DEEPGRAM_API_KEY)
# Function to transcribe audio using Deepgram
# Function to transcribe audio using Deepgram
async def transcribe_audio(audio_file_path):
    """Transcribe the audio file at *audio_file_path* with Deepgram.

    Sends the raw bytes to Deepgram's prerecorded-transcription endpoint
    (punctuated, English) and returns the transcript text of the top
    alternative on the first channel.
    """
    with open(audio_file_path, 'rb') as audio_fh:
        payload = audio_fh.read()
    # NOTE(review): the mimetype is hard-coded to WAV — assumes the Gradio
    # microphone input saves WAV; confirm for other input sources.
    dg_response = await deepgram.transcription.prerecorded(
        {"buffer": payload, "mimetype": "audio/wav"},
        {'punctuate': True, 'language': 'en'}
    )
    return dg_response['results']['channels'][0]['alternatives'][0]['transcript']
# Function to generate content using OpenAI GPT-4
def generate_content(input_text):
response = client.chat.completions.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": input_text}
]
)
generated_text = response.choices[0].message.content.strip()
return generated_text
# Function to convert text to speech using Eleven Labs
def text_to_speech(text):
url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
headers = {
"Accept": "audio/mpeg",
"Content-Type": "application/json",
"xi-api-key": ELEVEN_LABS_API_KEY
}
data = {
"text": text,
"voice_settings": {
"stability": 0.75,
"similarity_boost": 0.75
}
}
response = requests.post(url, json=data, headers=headers)
if response.status_code == 200:
with open("output.mp3", "wb") as f:
f.write(response.content)
return "output.mp3"
else:
return f"Error: {response.status_code} - {response.text}"
# Main function to handle the entire process
async def process_audio(audio):
transcription = await transcribe_audio(audio)
generated_text = generate_content(transcription)
audio_file = text_to_speech(generated_text)
return transcription, generated_text, audio_file
# Gradio interface setup
interface = gr.Interface(
fn=lambda audio: asyncio.run(process_audio(audio)),
inputs=gr.Audio(type="filepath", label="Speak into your microphone"),
outputs=[
gr.Textbox(label="Transcription Output"),
gr.Textbox(label="Generated Content"),
gr.Audio(label="Synthesized Speech")
],
title="Speech-to-Text, Content Generation, and Text-to-Speech",
description="Speak into the microphone, and the system will transcribe your speech, generate content, and convert the generated text into speech."
)
# Launch the Gradio interface
interface.launch()