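# Voice assistant demo: record speech in the browser, transcribe it with
# Whisper, generate a reply with GPT, and speak the reply with Google Cloud
# Text-to-Speech. Targets the pre-1.0 `openai` SDK and Gradio 3.x (assumed
# packages: gradio, openai-whisper, openai, google-cloud-texttospeech).
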
import gradio as gr
import openai
import whisper
import os

# Load the Whisper speech-to-text model once at startup ("base" balances speed and accuracy).
model = whisper.load_model("base")

# Read the API key from the environment (OPENAI_API_KEY) instead of hardcoding it.
openai.api_key = os.environ.get("OPENAI_API_KEY")


def speech_to_text(audio):
    # Transcribe the recorded audio file with Whisper and return the text.
    result = model.transcribe(audio)
    return result["text"]


def gpt_response(text):
    # gpt-3.5-turbo is a chat model, so (on the pre-1.0 openai SDK) it is called
    # through the ChatCompletion endpoint with a message list, not the legacy
    # Completion endpoint.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": text}],
        max_tokens=100,
    )
    return response.choices[0].message.content.strip()


def text_to_speech_google(text):
    # Imported here so the rest of the app runs even without Google Cloud installed.
    from google.cloud import texttospeech

    client = texttospeech.TextToSpeechClient()
    input_text = texttospeech.SynthesisInput(text=text)

    # A neutral US English voice; swap in any voice the API supports.
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )

    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    response = client.synthesize_speech(
        input=input_text, voice=voice, audio_config=audio_config
    )

    # Write the MP3 bytes to disk so Gradio can serve the file back to the browser.
    output_path = "output.mp3"
    with open(output_path, "wb") as out:
        out.write(response.audio_content)
    return output_path


def conversation_pipeline(audio):
    # Full round trip: speech -> text -> GPT reply -> synthesized speech.
    text = speech_to_text(audio)
    response_text = gpt_response(text)
    response_audio = text_to_speech_google(response_text)
    return response_text, response_audio


# Gradio 3.x syntax; Gradio 4+ renames `source="microphone"` to `sources=["microphone"]`.
demo = gr.Interface(
    fn=conversation_pipeline,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="GPT Response"),
        gr.Audio(label="GPT Response Audio", type="filepath", autoplay=True),
    ],
)

demo.launch(show_error=True)