SalexAI committed on
Commit
22bf868
·
verified ·
1 Parent(s): be93d30

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import asyncio
4
+ import base64
5
+ import io
6
+ import traceback
7
+
8
+ import cv2
9
+ import pyaudio
10
+ import PIL.Image
11
+ import mss
12
+ import gradio as gr
13
+
14
+ from google import genai
15
+ from google.genai import types
16
+
17
# --- Capture settings -------------------------------------------------------
# Microphone capture parameters handed to PyAudio when the input stream opens.
FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000
# Declared alongside the send rate; not referenced elsewhere in this file.
RECEIVE_SAMPLE_RATE = 24000
CHUNK_SIZE = 1024

# Model identifier passed to the live-connect call.
MODEL = "models/gemini-2.0-flash-live-001"

# --- Google GenAI client ----------------------------------------------------
# The API key is read from the environment; it is None when the variable is
# unset.
_API_KEY = os.environ.get("GEMINI_API_KEY")
client = genai.Client(
    http_options={"api_version": "v1beta"},
    api_key=_API_KEY,
)

# --- Live connect configuration ---------------------------------------------
# Built bottom-up so each nested piece has a name: voice selection first, then
# the system prompt, then the session config that combines them.
_VOICE = types.VoiceConfig(
    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
)
_SYSTEM_INSTRUCTION = types.Content(
    parts=[types.Part.from_text(text="You are Puck...")]
)
CONFIG = types.LiveConnectConfig(
    response_modalities=["audio"],
    speech_config=types.SpeechConfig(voice_config=_VOICE),
    system_instruction=_SYSTEM_INSTRUCTION,
)
44
+
45
# AudioLoop class adapted for Gradio
class AudioLoop:
    """Stream local audio (and optionally video) to a live session.

    ``run()`` opens the session with ``client.aio.live.connect``, starts
    background tasks that capture microphone audio and, depending on
    ``mode``, camera or screen frames, forwards everything queued on
    ``out_queue`` to the session, and yields what the session sends back.

    Args:
        mode: ``"camera"`` streams webcam frames, ``"screen"`` streams
            screen captures, any other value streams audio only.
    """

    def __init__(self, mode="camera"):
        self.mode = mode
        # Created in run(); not read anywhere in this class.
        self.audio_in_queue = None
        # Bounded queue of outgoing messages, created in run().
        self.out_queue = None
        self.session = None

    @staticmethod
    def _encode_jpeg(img):
        """Shrink *img* in place to fit 640x480 and return it as JPEG bytes."""
        img.thumbnail((640, 480))
        buf = io.BytesIO()
        img.save(buf, format="JPEG")
        return buf.getvalue()

    @staticmethod
    def _image_message(jpeg_bytes):
        """Wrap JPEG bytes in the message dict queued for the session."""
        return {
            "mime_type": "image/jpeg",
            "data": base64.b64encode(jpeg_bytes).decode(),
        }

    @staticmethod
    def _report_task_failure(task):
        """Print the traceback of a background task that died with an error."""
        # task.exception() raises on a cancelled task, so check that first.
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            traceback.print_exception(type(exc), exc, exc.__traceback__)

    def _read_camera_frame(self, cap):
        """Blocking helper: read one frame from *cap* and JPEG-encode it."""
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV delivers BGR; PIL expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return self._encode_jpeg(PIL.Image.fromarray(frame_rgb))

    def _grab_screen(self):
        """Blocking helper: capture the screen and JPEG-encode it."""
        # A fresh mss instance per grab, because each call may run on a
        # different worker thread.
        with mss.mss() as sct:
            shot = sct.grab(sct.monitors[0])
        return self._encode_jpeg(
            PIL.Image.frombytes("RGB", shot.size, shot.rgb)
        )

    async def _get_frame(self, cap):
        """Return the next camera frame as JPEG bytes, or None if the read failed.

        The capture and encoding run in a worker thread so the event loop is
        not blocked while waiting on the camera.
        """
        return await asyncio.to_thread(self._read_camera_frame, cap)

    async def _video_stream(self):
        """Queue camera frames until a read fails or the task is cancelled."""
        cap = await asyncio.to_thread(cv2.VideoCapture, 0)
        try:
            while True:
                frame = await self._get_frame(cap)
                if frame is None:
                    break
                await self.out_queue.put(self._image_message(frame))
                await asyncio.sleep(0.1)
        finally:
            cap.release()

    async def _screen_stream(self):
        """Queue screen captures until the task is cancelled."""
        while True:
            frame = await asyncio.to_thread(self._grab_screen)
            await self.out_queue.put(self._image_message(frame))
            await asyncio.sleep(1.0)

    async def _audio_stream(self):
        """Queue microphone chunks from the default input device.

        The PyAudio instance and the input stream are released when the task
        ends, whether through cancellation or an error.
        """
        pya = pyaudio.PyAudio()
        stream = None
        try:
            mic_info = pya.get_default_input_device_info()
            stream = await asyncio.to_thread(
                pya.open,
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                input_device_index=mic_info["index"],
                frames_per_buffer=CHUNK_SIZE,
            )
            while True:
                # Do not raise on input overflow; just keep reading.
                data = await asyncio.to_thread(
                    stream.read, CHUNK_SIZE, exception_on_overflow=False
                )
                await self.out_queue.put(
                    {"data": data, "mime_type": "audio/pcm"}
                )
        finally:
            if stream is not None:
                stream.close()
            pya.terminate()

    async def send_realtime(self):
        """Forward every queued outgoing message to the session."""
        while True:
            msg = await self.out_queue.get()
            await self.session.send(input=msg)

    async def receive_audio(self):
        """Yield ``(None, data)`` or ``(text, None)`` for each session response."""
        while True:
            turn = self.session.receive()
            async for response in turn:
                if data := response.data:
                    yield (None, data)
                if text := response.text:
                    yield (text, None)

    async def run(self):
        """Open the session, start the capture tasks and yield responses.

        Yields:
            ``(text, audio)`` tuples as produced by ``receive_audio``.

        The background tasks are cancelled and awaited whenever this
        generator finishes, is closed by its consumer, or raises.
        """
        async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
            self.session = session
            self.audio_in_queue = asyncio.Queue()
            self.out_queue = asyncio.Queue(maxsize=5)

            workers = [self._audio_stream()]
            if self.mode == "camera":
                workers.append(self._video_stream())
            elif self.mode == "screen":
                workers.append(self._screen_stream())
            workers.append(self.send_realtime())

            tasks = [asyncio.create_task(worker) for worker in workers]
            for task in tasks:
                # Surface capture/send failures instead of losing them.
                task.add_done_callback(self._report_task_failure)

            try:
                async for text, audio in self.receive_audio():
                    yield text, audio
            finally:
                for task in tasks:
                    task.cancel()
                # Wait for the cancellations so devices are released before
                # the session context exits.
                await asyncio.gather(*tasks, return_exceptions=True)
122
+
123
+ # Gradio interface
124
async def chat(mode="camera"):
    """Run one live session and relay its output.

    Builds an ``AudioLoop`` for *mode* and re-yields each ``(text, audio)``
    pair produced by its ``run()`` generator, in order.
    """
    session_stream = AudioLoop(mode=mode).run()
    async for pair in session_stream:
        text, audio = pair
        yield text, audio
129
+
130
# Page layout: a video-source selector, a chat display and Start/Stop buttons.
with gr.Blocks() as demo:
    gr.Markdown("# Gemini Live API Web Chat\nUse your microphone and camera directly from the browser.")
    mode = gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Source")
    chatbot = gr.Chatbot()
    with gr.Row():
        start = gr.Button("Start")
        stop = gr.Button("Stop")

    # Pass the async generator function itself. Wrapping it in a lambda hides
    # the fact that it is a generator, so its values would never be streamed.
    # The former `_js` hook is dropped: it only logged to the browser console
    # and `_js` is not a keyword current Gradio releases accept.
    # NOTE(review): chat() yields (text, audio) tuples, which a single Chatbot
    # output cannot display as-is. Convert them to the chat-history format of
    # the installed Gradio version (and route audio to an audio component)
    # before relying on this output - TODO confirm target Gradio version.
    start_event = start.click(chat, inputs=[mode], outputs=[chatbot])

    # Stop cancels the running session generator.
    stop.click(fn=None, inputs=None, outputs=None, cancels=[start_event])

demo.launch(server_name="0.0.0.0", share=True)
139
+
140
+ # requirements.txt
141
+ #
142
+ # google-genai
143
+ # opencv-python
144
+ # pyaudio
145
+ # pillow
146
+ # mss
147
+ # gradio