File size: 4,391 Bytes
bd3e2e4
3b38860
8cf49b6
 
 
51f98f4
 
 
 
 
 
 
3b38860
65e32e0
bd3e2e4
51f98f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cf49b6
 
 
 
 
 
 
 
d4b5f92
3db384d
 
d4b5f92
 
 
bd3e2e4
82f0eab
 
97e9b74
82f0eab
 
 
 
 
 
 
 
bd3e2e4
51f98f4
 
 
 
97e9b74
51f98f4
 
 
63f3aa1
 
 
97e9b74
bd3e2e4
d4b5f92
 
9f98382
 
 
 
 
 
 
8cf49b6
82f0eab
8cf49b6
bd3e2e4
d4b5f92
 
 
 
 
ed9cdc7
d4b5f92
 
 
 
 
 
 
 
3db384d
 
 
 
 
d4b5f92
 
 
 
 
 
 
63f3aa1
d4b5f92
 
 
 
 
 
 
65e32e0
 
 
 
d4b5f92
 
 
 
 
 
 
3db384d
65e32e0
 
 
d4b5f92
 
65e32e0
 
 
d4b5f92
3b38860
bd3e2e4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
from faster_whisper import WhisperModel
import edge_tts
import tempfile
import asyncio
import yaml
import os
import openai

# OpenAI client for chat completions. Reads the key from the OPENAI_API_KEY
# environment variable; if unset this is None and API calls will fail at
# request time, not here.
open_ai_client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Speech-to-text model: smallest faster-whisper checkpoint, float32 compute
# (CPU-friendly; no GPU required).
model = WhisperModel("tiny", compute_type="float32")

# Prompt templates loaded once at import time. generate_prompt() expects
# config['prompts']['base'] plus one entry per personality name.
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

def generate_prompt(personality: str, user_query: str) -> str:
    """Assemble the LLM prompt for one turn.

    Concatenates the shared base instructions, the personality-specific
    instructions from config, and the transcribed user query, ending with
    the ' -> ' completion cue.
    """
    base_instructions = config['prompts']['base']
    personality_instructions = config['prompts'][personality]

    return (
        f"\n{base_instructions}\n{personality_instructions}\n"
        f"\nUser query:\n\n{user_query} -> "
    )


def gpt_answer(prompt, personality, chatbot_history):
    """Send `prompt` plus prior conversation to gpt-4o-mini; return reply text.

    The personality is injected twice by design: once here in the system
    message and once inside `prompt` (built by generate_prompt).
    """
    print(f'going to send the prompt: {prompt}')

    # System message first, then the running history, then the new user turn.
    # NOTE(review): history entries may be gr.ChatMessage objects rather than
    # plain dicts depending on what Gradio hands back — confirm the API
    # accepts them.
    messages = [
        {"role": "system", "content": f"You are a helpful assistant, with the personality of a {personality}."}
    ]
    messages.extend(chatbot_history)
    messages.append({"role": "user", "content": prompt})

    completion = open_ai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    # Take the text of the first choice (only one is requested).
    return completion.choices[0].message.content.strip()

async def text_to_speech(text, voice):
    """Synthesize `text` to an mp3 with edge-tts using `voice`.

    Returns (path_to_mp3, None). The second element is a placeholder
    "warning" slot and is always None. The temp file is not deleted here;
    Gradio serves it from disk.
    """
    speech = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        output_path = handle.name
        await speech.save(output_path)
    return output_path, None

def generate_response(
    # language_level,
    buddy_personality,
    language_choice, user_query_audio,
    chatbot_history
):
    """Run one full voice-chat turn.

    Transcribes the user's recorded audio, asks the LLM for a reply in the
    chosen personality, appends both turns to the chat history, and
    synthesizes the reply to speech.

    Returns (None, chatbot_history, bot_message_audio): the leading None
    resets the microphone input component in the UI.
    """
    # Map the UI language names to Whisper language codes.
    language_codes = {'English': 'en',
                      'Spanish': 'es',
                      'Japanese': 'ja'}

    # Convert input audio to text.
    user_query_transcribed_segments, info = model.transcribe(
        audio=user_query_audio,
        language=language_codes[language_choice]
        )
    # BUGFIX: previously only the first segment was kept
    # (list(segments)[0]), which truncated any utterance Whisper split into
    # multiple segments and raised IndexError on silent input. Join every
    # segment instead; silence now yields an empty string.
    user_query_transcribed = ' '.join(
        segment.text.strip() for segment in user_query_transcribed_segments
    ).strip()

    # Ask llm for response to text
    prompt = generate_prompt(
        personality=buddy_personality,
        user_query=user_query_transcribed
    )

    bot_message = gpt_answer(prompt=prompt,
                             personality=buddy_personality,
                             chatbot_history=chatbot_history)

    chatbot_history.append(gr.ChatMessage(role="user", content=user_query_transcribed))
    chatbot_history.append(gr.ChatMessage(role="assistant", content=bot_message))

    # Convert llm response to audio; fall back to an English voice for any
    # language without a dedicated entry (keeps behavior of the old
    # if/elif chain, in table form consistent with language_codes above).
    voices = {'Spanish': 'es-MX-JorgeNeural',
              'Japanese': 'ja-JP-KeitaNeural'}
    voice_short_name = voices.get(language_choice, 'en-US-BrianNeural')

    bot_message_audio, warning = asyncio.run(
        text_to_speech(text=bot_message, voice=voice_short_name)
    )

    # Return None to reset user input audio and
    # llm response + user inputs in chatbot_history object to be displayed
    return None, chatbot_history, bot_message_audio

# --- Gradio UI layout and event wiring ------------------------------------
# Component declaration order defines on-screen order.
with gr.Blocks() as demo:

    header_section = gr.Markdown(
    """
    # AI Language Buddy!
    Click the **Send Message** button to practice your language skills!
    """)
    
    # Practice language; must match the keys in generate_response's
    # language_codes mapping.
    language = gr.Dropdown(
        choices=['English', 'Spanish', 'Japanese'],
        label='Language Choice',
        value='English'
    )
    
    # language_level = gr.Dropdown(
    #     choices=['Beginner', 'Intermediate', 'Advanced'],
    #     label='Language Level',
    #     value='Beginner'
    # )
    
    # Must match the personality keys under config['prompts'].
    personality = gr.Dropdown(
        choices=['Formal Teacher', 'Flirty Friend', 'Sarcastic Bro'],
        label='Language Buddy Personality',
        value='Flirty Friend'
    )

    # Conversation display; 'messages' type holds role/content entries.
    chatbot = gr.Chatbot(type='messages')
    
    # Microphone input; 'filepath' hands generate_response a path on disk
    # suitable for model.transcribe().
    user_input = gr.Audio(
        sources='microphone',
        show_download_button=True,
        type='filepath'
    )

    # Autoplays the synthesized bot reply returned by generate_response.
    ai_response = gr.Audio(
        autoplay=True
    )

    converse_button = gr.Button("Send Message")

    # NOTE(review): no click handler is wired to this button in this file —
    # clicking it currently does nothing. Confirm whether a clear callback
    # was intended.
    clear_button = gr.Button("Clear Convo History")

    # One full turn per click; the None returned for user_input resets the
    # microphone component.
    converse_button.click(
        fn=generate_response,
        inputs=[
            # language_level,
            personality,
            language,
            user_input,
            chatbot
        ],
        outputs=[user_input,
                 chatbot,
                 ai_response]
    )

demo.launch()