Spaces:

reecursion
/

task-oriented-dialog-agent

Build error

App Files Files Community

reecursion committed on Apr 17

Commit

df90a53

verified ·

1 Parent(s): 84d30ed

Create speech_conversation_app.py

Browse files

Files changed (1) hide show

speech_conversation_app.py +325 -0

speech_conversation_app.py ADDED Viewed

	@@ -0,0 +1,325 @@

+import os
+import time
+import numpy as np
+import torch
+import gradio as gr
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq
+from datasets import load_dataset
+import soundfile as sf
+# Global variables to track latency
+latency_ASR = 0.0
+latency_LLM = 0.0
+latency_TTS = 0.0
+# Global variables to store conversation state
+conversation_history = []
+audio_output = None
+# ASR Models
+ASR_OPTIONS = {
+    "Whisper Small": "openai/whisper-small",
+    "Wav2Vec2": "facebook/wav2vec2-base-960h"
+}
+# LLM Models
+LLM_OPTIONS = {
+    "Llama-2 7B Chat": "meta-llama/Llama-2-7b-chat-hf",
+    "Flan-T5 Small": "google/flan-t5-small"
+}
+# TTS Models
+TTS_OPTIONS = {
+    "VITS": "espnet/kan-bayashi_ljspeech_vits",
+    "FastSpeech2": "espnet/kan-bayashi_ljspeech_fastspeech2"
+}
+# Load models
+asr_models = {}
+llm_models = {}
+tts_models = {}
+def load_asr_model(model_name):
+    """Load ASR model from Hugging Face"""
+    global asr_models
+    if model_name not in asr_models:
+        print(f"Loading ASR model: {model_name}")
+        model_id = ASR_OPTIONS[model_name]
+        if "whisper" in model_id:
+            asr_models[model_name] = pipeline("automatic-speech-recognition", model=model_id)
+        else:  # wav2vec2
+            processor = AutoProcessor.from_pretrained(model_id)
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
+            asr_models[model_name] = {"processor": processor, "model": model}
+    return asr_models[model_name]
+def load_llm_model(model_name):
+    """Load LLM model from Hugging Face"""
+    global llm_models
+    if model_name not in llm_models:
+        print(f"Loading LLM model: {model_name}")
+        model_id = LLM_OPTIONS[model_name]
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        llm_models[model_name] = {
+            "model": model,
+            "tokenizer": tokenizer
+        }
+    return llm_models[model_name]
+def load_tts_model(model_name):
+    """Load TTS model using ESPnet"""
+    global tts_models
+    if model_name not in tts_models:
+        print(f"Loading TTS model: {model_name}")
+        try:
+            # Import ESPnet TTS modules
+            from espnet2.bin.tts_inference import Text2Speech
+            model_id = TTS_OPTIONS[model_name]
+            tts = Text2Speech.from_pretrained(model_id)
+            tts_models[model_name] = tts
+        except ImportError:
+            print("ESPnet not installed. Using mock TTS for demonstration.")
+            tts_models[model_name] = "mock_tts"
+    return tts_models[model_name]
+def transcribe_audio(audio_data, sr, asr_model_name):
+    """Transcribe audio using selected ASR model"""
+    global latency_ASR
+    start_time = time.time()
+    model = load_asr_model(asr_model_name)
+    if "whisper" in ASR_OPTIONS[asr_model_name]:
+        result = model({"array": audio_data, "sampling_rate": sr})
+        transcript = result["text"]
+    else:  # wav2vec2
+        inputs = model["processor"](audio_data, sampling_rate=sr, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model["model"].generate(**inputs)
+        transcript = model["processor"].batch_decode(outputs, skip_special_tokens=True)[0]
+    latency_ASR = time.time() - start_time
+    return transcript
+def generate_response(transcript, llm_model_name, system_prompt):
+    """Generate response using selected LLM model"""
+    global latency_LLM, conversation_history
+    start_time = time.time()
+    model_info = load_llm_model(llm_model_name)
+    model = model_info["model"]
+    tokenizer = model_info["tokenizer"]
+    # Format the prompt based on the model
+    if "llama" in LLM_OPTIONS[llm_model_name].lower():
+        # Format for Llama models
+        if not conversation_history:
+            conversation_history.append({"role": "system", "content": system_prompt})
+        conversation_history.append({"role": "user", "content": transcript})
+        prompt = tokenizer.apply_chat_template(
+            conversation_history,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+    else:
+        # Format for T5 models
+        prompt = f"{system_prompt}\nUser: {transcript}\nAssistant:"
+    # Generate text
+    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=100,
+            temperature=0.7,
+            top_p=0.9,
+        )
+    # Decode the response
+    if "llama" in LLM_OPTIONS[llm_model_name].lower():
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Extract just the assistant's response
+        response = response.split("Assistant: ")[-1].strip()
+    else:
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Add to conversation history
+    conversation_history.append({"role": "assistant", "content": response})
+    latency_LLM = time.time() - start_time
+    return response
+def synthesize_speech(text, tts_model_name):
+    """Synthesize speech using selected TTS model"""
+    global latency_TTS
+    start_time = time.time()
+    tts = load_tts_model(tts_model_name)
+    if tts == "mock_tts":
+        # Mock TTS response for demonstration
+        # In a real implementation, this would use the ESPnet model
+        # Load a sample audio file for demonstration
+        try:
+            sample_rate = 16000
+            # Generate a simple sine wave as demo audio
+            duration = 2  # seconds
+            t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
+            audio_data = 0.5 * np.sin(2 * np.pi * 220 * t)  # 220 Hz sine wave
+        except Exception as e:
+            print(f"Error generating mock audio: {e}")
+            audio_data = np.zeros(16000)  # 1 second of silence
+            sample_rate = 16000
+    else:
+        # Use actual ESPnet TTS model
+        with torch.no_grad():
+            wav = tts(text)["wav"]
+        audio_data = wav.numpy()
+        sample_rate = tts.fs
+    latency_TTS = time.time() - start_time
+    return (sample_rate, audio_data)
+def process_speech(
+    audio_input,
+    asr_option,
+    llm_option,
+    tts_option,
+    system_prompt
+):
+    """Process speech: ASR -> LLM -> TTS pipeline"""
+    global audio_output
+    # Check if audio input is available
+    if audio_input is None:
+        return None, "", "", None
+    # Get audio data
+    sr, audio_data = audio_input
+    # ASR: Speech to text
+    transcript = transcribe_audio(audio_data, sr, asr_option)
+    # LLM: Generate response
+    response = generate_response(transcript, llm_option, system_prompt)
+    # TTS: Text to speech
+    audio_output = synthesize_speech(response, tts_option)
+    # Return results
+    return audio_input, transcript, response, audio_output
+def display_latency():
+    """Display latency information"""
+    return f"""
+    ASR Latency: {latency_ASR:.2f} seconds
+    LLM Latency: {latency_LLM:.2f} seconds
+    TTS Latency: {latency_TTS:.2f} seconds
+    Total Latency: {latency_ASR + latency_LLM + latency_TTS:.2f} seconds
+    """
+def reset_conversation():
+    """Reset the conversation history"""
+    global conversation_history, audio_output
+    conversation_history = []
+    audio_output = None
+    return None, "", "", None, ""
+# Create Gradio interface
+with gr.Blocks(title="Conversational Speech System") as demo:
+    gr.Markdown(
+        """
+        # Conversational Speech System with ASR, LLM, and TTS
+        This demo showcases a complete speech-to-speech conversation system using:
+        - **ASR** (Automatic Speech Recognition) to convert your speech to text
+        - **LLM** (Large Language Model) to generate responses
+        - **TTS** (Text-to-Speech) to convert the responses to speech
+        Speak into your microphone and the system will respond with synthesized speech.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Input components
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="numpy",
+                label="Speak here",
+            )
+            system_prompt = gr.Textbox(
+                label="System Prompt (instructions for the LLM)",
+                value="You are a helpful and friendly AI assistant. Keep your responses concise and under 3 sentences."
+            )
+            asr_dropdown = gr.Dropdown(
+                choices=list(ASR_OPTIONS.keys()),
+                value=list(ASR_OPTIONS.keys())[0],
+                label="Select ASR Model"
+            )
+            llm_dropdown = gr.Dropdown(
+                choices=list(LLM_OPTIONS.keys()),
+                value=list(LLM_OPTIONS.keys())[0],
+                label="Select LLM Model"
+            )
+            tts_dropdown = gr.Dropdown(
+                choices=list(TTS_OPTIONS.keys()),
+                value=list(TTS_OPTIONS.keys())[0],
+                label="Select TTS Model"
+            )
+            reset_btn = gr.Button("Reset Conversation")
+        with gr.Column(scale=1):
+            # Output components
+            user_transcript = gr.Textbox(label="Your Speech (ASR Output)")
+            system_response = gr.Textbox(label="AI Response (LLM Output)")
+            audio_output_component = gr.Audio(label="AI Voice Response", autoplay=True)
+            latency_info = gr.Textbox(label="Performance Metrics")
+    # Set up event handlers
+    audio_input.change(
+        process_speech,
+        inputs=[audio_input, asr_dropdown, llm_dropdown, tts_dropdown, system_prompt],
+        outputs=[audio_input, user_transcript, system_response, audio_output_component]
+    ).then(
+        display_latency,
+        inputs=[],
+        outputs=[latency_info]
+    )
+    reset_btn.click(
+        reset_conversation,
+        inputs=[],
+        outputs=[audio_input, user_transcript, system_response, audio_output_component, latency_info]
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()