|
|
|
import os |
|
import asyncio |
|
import base64 |
|
import io |
|
import traceback |
|
|
|
import cv2 |
|
import pyaudio |
|
import PIL.Image |
|
import mss |
|
import gradio as gr |
|
|
|
from google import genai |
|
from google.genai import types |
|
|
|
|
|
# --- Audio capture/playback parameters ---
FORMAT = pyaudio.paInt16          # 16-bit signed PCM samples
CHANNELS = 1                      # mono capture
SEND_SAMPLE_RATE = 16000          # mic -> model sample rate (Hz)
RECEIVE_SAMPLE_RATE = 24000       # model -> speaker sample rate (Hz)
CHUNK_SIZE = 1024                 # frames per PyAudio read

# Live-capable Gemini model used for the bidirectional session.
MODEL = "models/gemini-2.0-flash-live-001"

# Shared API client. NOTE(review): if GEMINI_API_KEY is unset this passes
# api_key=None and fails only at connect time — confirm that is acceptable.
client = genai.Client(
    http_options={"api_version": "v1beta"},
    api_key=os.environ.get("GEMINI_API_KEY"),
)

# Session configuration: audio-only responses, spoken with the prebuilt
# "Puck" voice, and a system instruction establishing the persona.
CONFIG = types.LiveConnectConfig(
    response_modalities=["audio"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
        )
    ),
    system_instruction=types.Content(
        parts=[types.Part.from_text(text="You are Puck..." )]
    ),
)
|
|
|
|
|
class AudioLoop:
    """Bridge local microphone/camera media into a Gemini Live session.

    Producer tasks push base64-encoded JPEG frames and raw PCM chunks onto
    ``out_queue``; ``send_realtime`` forwards them to the live session, and
    ``run`` yields ``(text, audio)`` tuples as model responses arrive.
    """

    def __init__(self, mode="camera"):
        # "camera" streams webcam frames alongside audio; any other value
        # sends audio only (see run()).
        self.mode = mode
        self.audio_in_queue = None  # reserved for playback; created in run()
        self.out_queue = None       # outbound media destined for the model
        self.session = None         # live session handle, set once connected
        # One shared PyAudio instance for the whole loop.  The original code
        # referenced an undefined global `pya` (NameError at runtime) and
        # opened a second, never-terminated PyAudio() just for the stream.
        self._pya = pyaudio.PyAudio()

    async def _get_frame(self, cap):
        """Grab one frame from `cap` and return it as JPEG bytes.

        Returns None when the capture device reports no frame (EOF/error).
        """
        # cap.read() blocks; keep it off the event loop like the other
        # blocking calls in this class.
        ret, frame = await asyncio.to_thread(cap.read)
        if not ret:
            return None
        # OpenCV delivers BGR; PIL expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = PIL.Image.fromarray(frame_rgb)
        img.thumbnail((640, 480))  # bound frame size to keep payloads small
        buf = io.BytesIO()
        img.save(buf, format="JPEG")
        return buf.getvalue()

    async def _video_stream(self):
        """Capture webcam frames (~10 fps) and enqueue them for sending."""
        cap = await asyncio.to_thread(cv2.VideoCapture, 0)
        try:
            while True:
                frame = await self._get_frame(cap)
                if frame is None:
                    break
                await self.out_queue.put(
                    {"mime_type": "image/jpeg",
                     "data": base64.b64encode(frame).decode()}
                )
                await asyncio.sleep(0.1)
        finally:
            cap.release()

    async def _audio_stream(self):
        """Read raw PCM from the default microphone and enqueue it."""
        mic_info = await asyncio.to_thread(self._pya.get_default_input_device_info)
        stream = await asyncio.to_thread(
            self._pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info['index'],
            frames_per_buffer=CHUNK_SIZE,
        )
        try:
            while True:
                # Second arg is exception_on_overflow=False: drop overflowed
                # audio instead of raising when the loop falls behind.
                data = await asyncio.to_thread(stream.read, CHUNK_SIZE, False)
                await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})
        finally:
            # The original leaked the stream when this task was cancelled.
            stream.close()

    async def send_realtime(self):
        """Forward queued media chunks to the live session, in order."""
        while True:
            msg = await self.out_queue.get()
            await self.session.send(input=msg)

    async def receive_audio(self):
        """Yield (text, audio) tuples from the session, turn after turn.

        Audio chunks arrive as (None, bytes); text as (str, None).
        """
        while True:
            turn = self.session.receive()
            async for response in turn:
                if data := response.data:
                    yield (None, data)
                if text := response.text:
                    yield (text, None)

    async def run(self):
        """Connect and stream; yields (text, audio) pairs until cancelled."""
        async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
            self.session = session
            self.audio_in_queue = asyncio.Queue()
            # Bounded so producers back off instead of buffering stale media.
            self.out_queue = asyncio.Queue(maxsize=5)

            tasks = [asyncio.create_task(self._audio_stream())]
            if self.mode == "camera":
                tasks.append(asyncio.create_task(self._video_stream()))
            tasks.append(asyncio.create_task(self.send_realtime()))

            try:
                async for text, audio in self.receive_audio():
                    yield text, audio
            finally:
                # Cancel producers even if the consumer loop raises or this
                # generator is closed early; the original skipped cleanup on
                # those paths.
                for t in tasks:
                    t.cancel()
|
|
|
|
|
async def chat(mode="camera"):
    """Open a live session and relay (text, audio) tuples as they arrive."""
    session_loop = AudioLoop(mode=mode)
    async for pair in session_loop.run():
        yield pair
|
|
|
# Minimal Gradio front-end: pick a video source, then start/stop the session.
with gr.Blocks() as demo:
    gr.Markdown("# Gemini Live API Web Chat\nUse your microphone and camera directly from the browser.")
    # NOTE(review): only "camera" has a code path in AudioLoop.run(); "screen"
    # currently behaves like "none" (mss is imported but unused) — confirm.
    mode = gr.Radio(choices=["camera", "screen", "none"], value="camera", label="Video Source")
    chatbot = gr.Chatbot()
    with gr.Row():
        start = gr.Button("Start")
        # NOTE(review): the Stop button has no click handler wired — it does
        # nothing; verify whether cancellation was meant to be hooked up here.
        stop = gr.Button("Stop")
    # NOTE(review): `_js` was removed/renamed to `js` in Gradio 4+, and the
    # lambda returns an async generator that this JS shim only logs to the
    # console rather than streaming into the Chatbot — confirm against the
    # installed Gradio version.
    start.click(lambda m: chat(m), inputs=[mode], outputs=[chatbot], _js="(fn, inputs) => {fn(inputs).then(data => console.log(data));}")
# share=True exposes a public tunnel URL in addition to binding 0.0.0.0.
demo.launch(server_name="0.0.0.0", share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|