Spaces:

Futuresony
/

FuturesonyAi

Runtime error

App Files Files Community

Futuresony commited on Dec 14, 2024

Commit

ca69a74

verified ·

1 Parent(s): f3ac400

Update app.py

Browse files

Files changed (1) hide show

app.py +240 -63

app.py CHANGED Viewed

@@ -1,64 +1,241 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-if __name__ == "__main__":
-    demo.launch()

+!pip install utils
+!pip install gradio
 import gradio as gr
+from huggingface_hub import snapshot_download
+from threading import Thread
+import time
+import base64
+import numpy as np
+import requests
+import traceback
+from dataclasses import dataclass, field
+import io
+from pydub import AudioSegment
+import librosa
+from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
+import tempfile
+from server import serve
+repo_id = "gpt-omni/mini-omni"
+snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
+IP = "0.0.0.0"
+PORT = 60808
+thread = Thread(target=serve, daemon=True)
+thread.start()
+API_URL = "http://0.0.0.0:60808/chat"
+# recording parameters
+IN_CHANNELS = 1
+IN_RATE = 24000
+IN_CHUNK = 1024
+IN_SAMPLE_WIDTH = 2
+VAD_STRIDE = 0.5
+# playing parameters
+OUT_CHANNELS = 1
+OUT_RATE = 24000
+OUT_SAMPLE_WIDTH = 2
+OUT_CHUNK = 5760
+OUT_CHUNK = 20 * 4096
+OUT_RATE = 24000
+OUT_CHANNELS = 1
+def run_vad(ori_audio, sr):
+    _st = time.time()
+    try:
+        audio = ori_audio
+        audio = audio.astype(np.float32) / 32768.0
+        sampling_rate = 16000
+        if sr != sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
+        vad_parameters = {}
+        vad_parameters = VadOptions(**vad_parameters)
+        speech_chunks = get_speech_timestamps(audio, vad_parameters)
+        audio = collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
+        if sr != sampling_rate:
+            # resample to original sampling rate
+            vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
+        else:
+            vad_audio = audio
+        vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
+        vad_audio_bytes = vad_audio.tobytes()
+        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
+    except Exception as e:
+        msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
+        print(msg)
+        return -1, ori_audio, round(time.time() - _st, 4)
+def warm_up():
+    frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
+    dur, frames, tcost = run_vad(frames, 16000)
+    print(f"warm up done, time_cost: {tcost:.3f} s")
+warm_up()
+@dataclass
+class AppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
+    pause_detected: bool = False
+    started_talking: bool =  False
+    stopped: bool = False
+    conversation: list = field(default_factory=list)
+def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
+    """Take in the stream, determine if a pause happened"""
+    temp_audio = audio
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
+    duration = len(audio) / sampling_rate
+    if dur_vad > 0.5 and not state.started_talking:
+        print("started talking")
+        state.started_talking = True
+        return False
+    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return (duration - dur_vad) > 1
+def speaking(audio_bytes: str):
+    base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
+    files = {"audio": base64_encoded}
+    with requests.post(API_URL, json=files, stream=True) as response:
+        try:
+            for chunk in response.iter_content(chunk_size=OUT_CHUNK):
+                if chunk:
+                    # Create an audio segment from the numpy array
+                    audio_segment = AudioSegment(
+                        chunk,
+                        frame_rate=OUT_RATE,
+                        sample_width=OUT_SAMPLE_WIDTH,
+                        channels=OUT_CHANNELS,
+                    )
+                    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
+                    mp3_io = io.BytesIO()
+                    audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+                    # Get the MP3 bytes
+                    mp3_bytes = mp3_io.getvalue()
+                    mp3_io.close()
+                    yield mp3_bytes
+        except Exception as e:
+            raise gr.Error(f"Error during audio streaming: {e}")
+def process_audio(audio: tuple, state: AppState):
+    if state.stream is None:
+        state.stream = audio[1]
+        state.sampling_rate = audio[0]
+    else:
+        state.stream =  np.concatenate((state.stream, audio[1]))
+    pause_detected = determine_pause(state.stream, state.sampling_rate, state)
+    state.pause_detected = pause_detected
+    if state.pause_detected and state.started_talking:
+        return gr.Audio(recording=False), state
+    return None, state
+def response(state: AppState):
+    if not state.pause_detected and not state.started_talking:
+        return None, AppState()
+    audio_buffer = io.BytesIO()
+    segment = AudioSegment(
+        state.stream.tobytes(),
+        frame_rate=state.sampling_rate,
+        sample_width=state.stream.dtype.itemsize,
+        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+    )
+    segment.export(audio_buffer, format="wav")
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+        f.write(audio_buffer.getvalue())
+    state.conversation.append({"role": "user",
+                                "content": {"path": f.name,
+                                "mime_type": "audio/wav"}})
+    output_buffer = b""
+    for mp3_bytes in speaking(audio_buffer.getvalue()):
+        output_buffer += mp3_bytes
+        yield mp3_bytes, state
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+        f.write(output_buffer)
+    state.conversation.append({"role": "assistant",
+                    "content": {"path": f.name,
+                                "mime_type": "audio/mp3"}})
+    yield None, AppState(conversation=state.conversation)
+def start_recording_user(state: AppState):
+    if not state.stopped:
+        return gr.Audio(recording=True)
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column():
+            input_audio = gr.Audio(
+                label="Input Audio", sources="microphone", type="numpy"
+            )
+        with gr.Column():
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
+            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
+    state = gr.State(value=AppState())
+    stream = input_audio.stream(
+        process_audio,
+        [input_audio, state],
+        [input_audio, state],
+        stream_every=0.5,
+        time_limit=30,
+    )
+    respond = input_audio.stop_recording(
+        response,
+        [state],
+        [output_audio, state]
+    )
+    respond.then(lambda s: s.conversation, [state], [chatbot])
+    restart = output_audio.stop(
+        start_recording_user,
+        [state],
+        [input_audio]
+    )
+    cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
+                [state, input_audio], cancels=[respond, restart])
+demo.launch()