Sofia Casadei committed
Commit 5ef2360 · 1 Parent(s): aec5df4

first version
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,50 @@
+ # Stage 1: Get uv installer
+ FROM ghcr.io/astral-sh/uv:0.2.12 AS uv
+
+ # Stage 2: Main application image
+ FROM python:3.10.12-slim-bookworm
+
+ # Copy uv from first stage
+ COPY --from=uv /uv /uv
+
+ # Create virtual environment with uv
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     /uv venv /opt/venv
+
+ # Set environment variables
+ ENV VIRTUAL_ENV=/opt/venv \
+     PATH="/opt/venv/bin:$PATH"
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     portaudio19-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create user and set permissions (required for HF Spaces)
+ RUN useradd -m -u 1000 user && \
+     chown -R user /opt/venv
+
+ # Switch to user context
+ USER user
+ WORKDIR /app
+
+ # Set home to user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     HF_HOME=/home/user/.cache/huggingface
+
+ # Copy requirements first for caching
+ COPY --chown=user requirements.txt .
+
+ # Install Python packages with uv caching
+ RUN --mount=type=cache,target=/home/user/.cache/uv \
+     /uv pip install -r requirements.txt
+
+ # Copy application code
+ COPY --chown=user . .
+
+ # Expose FastRTC port (matches HF Spaces default)
+ EXPOSE 7860
+
+ # Start the application with uvicorn (the FastAPI app lives in app.py)
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -4,6 +4,7 @@ emoji: 🐢
  colorFrom: indigo
  colorTo: gray
  sdk: docker
+ app_port: 7860
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,148 @@
+ import os
+ import logging
+
+ import gradio as gr
+ import numpy as np
+ from dotenv import load_dotenv
+ from fastapi import FastAPI
+ from fastapi.responses import StreamingResponse, HTMLResponse
+ from fastapi.staticfiles import StaticFiles
+ from fastrtc import (
+     AdditionalOutputs,
+     ReplyOnPause,
+     Stream,
+     AlgoOptions,
+     SileroVadOptions,
+     audio_to_bytes,
+ )
+ from transformers import (
+     AutoModelForSpeechSeq2Seq,
+     AutoProcessor,
+     pipeline,
+ )
+ from transformers.utils import is_flash_attn_2_available
+
+ from utils.logger_config import setup_logging
+ from utils.device import get_device, get_torch_and_np_dtypes
+ from utils.turn_server import get_rtc_credentials
+
+
+ load_dotenv()
+ setup_logging(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+
+ device = get_device(force_cpu=False)
+ torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
+ logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
+
+
+ attention = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
+ logger.info(f"Using attention: {attention}")
+
+
+ model_id = "openai/whisper-large-v3-turbo"
+ logger.info(f"Loading Whisper model: {model_id}")
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id,
+     torch_dtype=torch_dtype,
+     low_cpu_mem_usage=True,
+     use_safetensors=True,
+     attn_implementation=attention
+ )
+ model.to(device)
+
+ processor = AutoProcessor.from_pretrained(model_id)
+
+ transcribe_pipeline = pipeline(
+     task="automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch_dtype,
+     device=device,
+ )
+
+ # Warm up the model with empty audio
+ logger.info("Warming up Whisper model with dummy input")
+ warmup_audio = np.zeros((16000,), dtype=np_dtype)  # 1s of silence
+ transcribe_pipeline(warmup_audio)
+ logger.info("Model warmup complete")
+
+
+ async def transcribe(audio: tuple[int, np.ndarray]):
+     sample_rate, audio_array = audio
+     logger.info(f"Sample rate: {sample_rate}Hz, Shape: {audio_array.shape}")
+
+     outputs = transcribe_pipeline(
+         audio_to_bytes(audio),
+         chunk_length_s=3,
+         batch_size=1,
+         generate_kwargs={
+             'task': 'transcribe',
+             'language': 'english',
+         },
+         # return_timestamps="word"
+     )
+     yield AdditionalOutputs(outputs["text"].strip())
+
+
+ logger.info("Initializing FastRTC stream")
+ stream = Stream(
+     handler=ReplyOnPause(
+         transcribe,
+         algo_options=AlgoOptions(
+             # Duration in seconds of audio chunks (default 0.6)
+             audio_chunk_duration=0.6,
+             # If a chunk has more than this many seconds of speech, the user has started talking (default 0.2)
+             started_talking_threshold=0.2,
+             # If, after the user started speaking, a chunk has less than this many seconds of speech, the user has stopped speaking (default 0.1)
+             speech_threshold=0.1,
+         ),
+         model_options=SileroVadOptions(
+             # Threshold for what is considered speech (default 0.5)
+             threshold=0.5,
+             # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
+             min_speech_duration_ms=250,
+             # Max duration of speech chunks; longer chunks are split (default float('inf'))
+             max_speech_duration_s=30,
+             # Wait this many ms at the end of each speech chunk before separating it (default 2000)
+             min_silence_duration_ms=2000,
+             # Chunk size for the VAD model; can be 512, 1024, or 1536 for a 16k sample rate (default 1024)
+             window_size_samples=1024,
+             # Final speech chunks are padded by speech_pad_ms on each side (default 400)
+             speech_pad_ms=400,
+         ),
+     ),
+     # send-receive: bidirectional streaming (default)
+     # send: client to server only
+     # receive: server to client only
+     modality="audio",
+     mode="send",
+     additional_outputs=[
+         gr.Textbox(label="Transcript"),
+     ],
+     additional_outputs_handler=lambda current, new: current + " " + new,
+     rtc_configuration=get_rtc_credentials(provider="hf") if os.getenv("APP_MODE") == "deployed" else None
+ )
+
+ app = FastAPI()
+ stream.mount(app)
+
+ # Serve static assets; index.html loads /static/client.js
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ @app.get("/transcript")
+ def _(webrtc_id: str):
+     logger.debug(f"New transcript stream request for webrtc_id: {webrtc_id}")
+     async def output_stream():
+         try:
+             async for output in stream.output_stream(webrtc_id):
+                 transcript = output.args[0]
+                 logger.debug(f"Sending transcript for {webrtc_id}: {transcript[:50]}...")
+                 yield f"event: output\ndata: {transcript}\n\n"
+         except Exception as e:
+             logger.error(f"Error in transcript stream for {webrtc_id}: {str(e)}")
+             raise
+
+     return StreamingResponse(output_stream(), media_type="text/event-stream")
+
+ @app.get("/")
+ async def root():
+     return HTMLResponse(content=open("static/index.html").read())
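
For quick testing outside the browser, the /transcript endpoint can also be read from plain Python. A minimal sketch, assuming the server is running locally on port 7860 and that WEBRTC_ID (a hypothetical placeholder) matches a session already created via /webrtc/offer:

import requests

WEBRTC_ID = "abc123"  # hypothetical: must match a live WebRTC session

# Stream server-sent events from the transcript endpoint
with requests.get(
    f"http://localhost:7860/transcript?webrtc_id={WEBRTC_ID}",
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # SSE payload lines look like "data: <transcript chunk>"
        if line.startswith("data:"):
            print(line[len("data:"):].strip())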
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ accelerate==1.4.0
+ fastrtc==0.0.10
+ fastrtc[vad]
+ python-dotenv==1.0.1
+ transformers==4.49.0
+ torch==2.6.0
+ torchaudio==2.6.0
+ fastapi
+ uvicorn[standard]
static/client.js ADDED
@@ -0,0 +1,127 @@
+ // Global variables
+ let peerConnection = null;
+ let dataChannel = null;
+ let webrtcId = null;
+ let eventSource = null;
+
+ // Helper function to generate unique ID
+ function generateUniqueId() {
+     return Math.random().toString(36).substring(7);
+ }
+
+ // Update UI status
+ function updateStatus(connected) {
+     const statusDiv = document.getElementById('status');
+     const connectBtn = document.getElementById('connectBtn');
+     const disconnectBtn = document.getElementById('disconnectBtn');
+
+     statusDiv.textContent = connected ? 'Connected' : 'Disconnected';
+     statusDiv.className = connected ? 'connected' : 'disconnected';
+     connectBtn.disabled = connected;
+     disconnectBtn.disabled = !connected;
+ }
+
+ // Setup WebRTC connection
+ async function setupWebRTC() {
+     try {
+         // Create peer connection
+         peerConnection = new RTCPeerConnection();
+         webrtcId = generateUniqueId();
+
+         // Get audio stream from microphone
+         const stream = await navigator.mediaDevices.getUserMedia({
+             audio: true
+         });
+
+         // Add audio stream to peer connection
+         stream.getTracks().forEach(track => {
+             peerConnection.addTrack(track, stream);
+         });
+
+         // Create data channel
+         dataChannel = peerConnection.createDataChannel("text");
+
+         // Handle data channel messages
+         dataChannel.onmessage = (event) => {
+             const message = JSON.parse(event.data);
+             console.log("Received message:", message);
+
+             // Handle different message types
+             switch (message.type) {
+                 case 'log':
+                     console.log("Server log:", message.data);
+                     break;
+                 case 'error':
+                     console.error("Server error:", message.data);
+                     break;
+                 case 'warning':
+                     console.warn("Server warning:", message.data);
+                     break;
+             }
+         };
+
+         // Create and send offer
+         const offer = await peerConnection.createOffer();
+         await peerConnection.setLocalDescription(offer);
+
+         // Send offer to server
+         const response = await fetch('/webrtc/offer', {
+             method: 'POST',
+             headers: { 'Content-Type': 'application/json' },
+             body: JSON.stringify({
+                 sdp: offer.sdp,
+                 type: offer.type,
+                 webrtc_id: webrtcId
+             })
+         });
+
+         if (!response.ok) {
+             throw new Error(`HTTP error! status: ${response.status}`);
+         }
+
+         // Handle server response
+         const serverResponse = await response.json();
+
+         // Check for error response
+         if (serverResponse.status === 'failed') {
+             throw new Error(serverResponse.meta.error);
+         }
+
+         // Set remote description
+         await peerConnection.setRemoteDescription(serverResponse);
+
+         // Update UI
+         updateStatus(true);
+
+         // Subscribe to the transcript SSE stream for this session
+         eventSource = new EventSource(`/transcript?webrtc_id=${webrtcId}`);
+
+         // The server sends named "output" events, so listen for those
+         // (onmessage only fires for unnamed "message" events)
+         eventSource.addEventListener('output', (event) => {
+             const transcriptDiv = document.getElementById('transcript');
+             transcriptDiv.innerHTML += `<p>${event.data}</p>`;
+         });
+
+     } catch (error) {
+         console.error("Error setting up WebRTC:", error);
+         updateStatus(false);
+     }
+ }
+
+ // Cleanup function
+ function disconnect() {
+     if (peerConnection) {
+         peerConnection.close();
+         peerConnection = null;
+     }
+     if (dataChannel) {
+         dataChannel.close();
+         dataChannel = null;
+     }
+     if (eventSource) {
+         eventSource.close();
+         eventSource = null;
+     }
+     webrtcId = null;
+     updateStatus(false);
+ }
+
+ // Add event listeners when page loads
+ document.addEventListener('DOMContentLoaded', () => {
+     document.getElementById('connectBtn').addEventListener('click', setupWebRTC);
+     document.getElementById('disconnectBtn').addEventListener('click', disconnect);
+ });
static/index.html ADDED
@@ -0,0 +1,53 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>FastRTC Audio Client</title>
+     <style>
+         body {
+             font-family: Arial, sans-serif;
+             max-width: 800px;
+             margin: 0 auto;
+             padding: 20px;
+         }
+         .controls {
+             margin: 20px 0;
+         }
+         button {
+             padding: 10px 20px;
+             margin: 5px;
+         }
+         #status {
+             margin: 10px 0;
+             padding: 10px;
+             border-radius: 4px;
+         }
+         .connected {
+             background-color: #d4edda;
+             color: #155724;
+         }
+         .disconnected {
+             background-color: #f8d7da;
+             color: #721c24;
+         }
+     </style>
+ </head>
+ <body>
+     <h1>FastRTC Audio Client</h1>
+     <div id="status" class="disconnected">Disconnected</div>
+
+     <div class="controls">
+         <button id="connectBtn">Connect</button>
+         <button id="disconnectBtn" disabled>Disconnect</button>
+     </div>
+
+     <!-- Audio element for playback -->
+     <audio id="audioOutput" autoplay></audio>
+
+     <div id="transcript" style="margin-top: 20px; padding: 10px; border: 1px solid #ccc;"></div>
+
+     <!-- Load our WebRTC client code -->
+     <script src="/static/client.js"></script>
+ </body>
+ </html>
utils/__init__.py ADDED
File without changes
utils/device.py ADDED
@@ -0,0 +1,25 @@
+ import torch
+ import numpy as np
+
+ def get_device(force_cpu=False):
+     if force_cpu:
+         return "cpu"
+     if torch.cuda.is_available():
+         return "cuda"
+     elif torch.backends.mps.is_available():
+         torch.mps.empty_cache()
+         return "mps"
+     else:
+         return "cpu"
+
+ def get_torch_and_np_dtypes(device, use_bfloat16=False):
+     if device == "cuda":
+         torch_dtype = torch.bfloat16 if use_bfloat16 else torch.float16
+         np_dtype = np.float16
+     elif device == "mps":
+         torch_dtype = torch.bfloat16 if use_bfloat16 else torch.float16
+         np_dtype = np.float16
+     else:
+         torch_dtype = torch.float32
+         np_dtype = np.float32
+     return torch_dtype, np_dtype
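
A usage sketch for these helpers; this mirrors the startup code in app.py and introduces no new API:

from utils.device import get_device, get_torch_and_np_dtypes

device = get_device(force_cpu=False)  # "cuda", "mps", or "cpu"
torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
# CPU yields torch.float32 / np.float32; CUDA and MPS yield float16
# (or torch.bfloat16 when use_bfloat16=True)
print(device, torch_dtype, np_dtype)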
utils/logger_config.py ADDED
@@ -0,0 +1,86 @@
+ import logging
+ import sys
+ import os
+
+ LOGS_DIR = "logs"
+
+ class ColorFormatter(logging.Formatter):
+     """Custom formatter that adds colors to log levels"""
+
+     grey = "\x1b[38;20m"
+     yellow = "\x1b[33;20m"
+     red = "\x1b[31;20m"
+     bold_red = "\x1b[31;1m"
+     blue = "\x1b[34;20m"
+     green = "\x1b[32;20m"
+     reset = "\x1b[0m"
+
+     format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+     FORMATS = {
+         logging.DEBUG: blue + format_str + reset,
+         logging.INFO: green + format_str + reset,
+         logging.WARNING: yellow + format_str + reset,
+         logging.ERROR: red + format_str + reset,
+         logging.CRITICAL: bold_red + format_str + reset
+     }
+
+     def format(self, record):
+         log_fmt = self.FORMATS.get(record.levelno)
+         formatter = logging.Formatter(log_fmt, datefmt='%Y-%m-%d %H:%M:%S')
+         return formatter.format(record)
+
+ def configure_logfire():
+     import logfire
+     # First run `logfire auth`
+     # -> Your Logfire credentials are stored in <path>/.logfire/default.toml
+
+     def scrubbing_callback(m: logfire.ScrubMatch):
+         if m.pattern_match.group(0) == 'Credit Card':
+             return m.value
+
+     logfire.configure(scrubbing=logfire.ScrubbingOptions(callback=scrubbing_callback))
+
+ def setup_logging(level=None, with_logfire=False):
+     """Configure logging for the entire application"""
+     if with_logfire:
+         configure_logfire()
+
+     # Get level from environment variable or use default
+     if level is None:
+         level_name = os.getenv('LOG_LEVEL', 'INFO')
+         level = getattr(logging, level_name.upper(), logging.INFO)
+
+     # Configure stream handler (console output) with color formatter
+     stream_handler = logging.StreamHandler(sys.stdout)
+     stream_handler.setFormatter(ColorFormatter())
+
+     # Configure root logger
+     root_logger = logging.getLogger()
+     root_logger.setLevel(level)
+
+     # Remove existing handlers to avoid duplicate log lines
+     root_logger.handlers = []
+     root_logger.addHandler(stream_handler)
+
+     # Prevent duplicate logging
+     root_logger.propagate = False
+
+     # Configure file handler
+     os.makedirs(LOGS_DIR, exist_ok=True)
+     file_handler = logging.FileHandler(os.path.join(LOGS_DIR, 'app.log'))
+     file_handler.setFormatter(logging.Formatter(
+         '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+         datefmt='%Y-%m-%d %H:%M:%S'
+     ))
+     root_logger.addHandler(file_handler)
+
+     # Get comma-separated list of loggers to suppress from env
+     suppress_loggers = os.getenv('SUPPRESS_LOGGERS', '').strip()
+     if suppress_loggers:
+         for logger_name in suppress_loggers.split(','):
+             logger_name = logger_name.strip()
+             if logger_name:
+                 logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+     logging.info(f"Logging configured with level: {logging.getLevelName(level)}")
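
A usage sketch, matching how app.py calls this at startup; LOG_LEVEL and SUPPRESS_LOGGERS are read from the environment when not passed explicitly:

import logging
from utils.logger_config import setup_logging

setup_logging()  # level falls back to the LOG_LEVEL env var, default INFO
logging.getLogger(__name__).info("console gets colors; logs/app.log gets plain text")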
utils/turn_server.py ADDED
@@ -0,0 +1,119 @@
+ import os
+ from typing import Literal, Optional, Dict, Any
+ import requests
+
+ from fastrtc import get_hf_turn_credentials, get_twilio_turn_credentials
+
+
+ def get_rtc_credentials(
+     provider: Literal["hf", "twilio", "cloudflare"] = "hf",
+     **kwargs
+ ) -> Dict[str, Any]:
+     """
+     Get RTC configuration for different TURN server providers.
+
+     Args:
+         provider: The TURN server provider to use ('hf', 'twilio', or 'cloudflare')
+         **kwargs: Additional arguments passed to the specific provider's function
+
+     Returns:
+         Dictionary containing the RTC configuration
+     """
+     try:
+         if provider == "hf":
+             return get_hf_credentials(**kwargs)
+         elif provider == "twilio":
+             return get_twilio_credentials(**kwargs)
+         elif provider == "cloudflare":
+             return get_cloudflare_credentials(**kwargs)
+         else:
+             raise ValueError(f"Unknown provider: {provider}")
+     except Exception as e:
+         raise Exception(f"Failed to get RTC credentials ({provider}): {str(e)}")
+
+
+ def get_hf_credentials(token: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Get credentials for Hugging Face's community TURN server.
+
+     Required setup:
+     1. Create a Hugging Face account at huggingface.co
+     2. Visit: https://huggingface.co/spaces/fastrtc/turn-server-login
+     3. Set HF_TOKEN environment variable or pass token directly
+     """
+     token = token or os.environ.get("HF_TOKEN")
+     if not token:
+         raise ValueError("HF_TOKEN environment variable not set")
+
+     try:
+         return get_hf_turn_credentials(token=token)
+     except Exception as e:
+         raise Exception(f"Failed to get HF TURN credentials: {str(e)}")
+
+
+ def get_twilio_credentials(
+     account_sid: Optional[str] = None,
+     auth_token: Optional[str] = None
+ ) -> Dict[str, Any]:
+     """
+     Get credentials for Twilio's TURN server.
+
+     Required setup:
+     1. Create a free Twilio account at: https://login.twilio.com/u/signup
+     2. Get your Account SID and Auth Token from the Twilio Console
+     3. Set environment variables:
+        - TWILIO_ACCOUNT_SID (or pass directly)
+        - TWILIO_AUTH_TOKEN (or pass directly)
+     """
+     account_sid = account_sid or os.environ.get("TWILIO_ACCOUNT_SID")
+     auth_token = auth_token or os.environ.get("TWILIO_AUTH_TOKEN")
+
+     if not account_sid or not auth_token:
+         raise ValueError("Twilio credentials not found. Set TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN env vars")
+
+     try:
+         return get_twilio_turn_credentials(account_sid=account_sid, auth_token=auth_token)
+     except Exception as e:
+         raise Exception(f"Failed to get Twilio TURN credentials: {str(e)}")
+
+
+ def get_cloudflare_credentials(
+     key_id: Optional[str] = None,
+     api_token: Optional[str] = None,
+     ttl: int = 86400
+ ) -> Dict[str, Any]:
+     """
+     Get credentials for Cloudflare's TURN server.
+
+     Required setup:
+     1. Create a free Cloudflare account
+     2. Go to Cloudflare dashboard -> Calls section
+     3. Create a TURN App and get the Turn Token ID and API Token
+     4. Set environment variables:
+        - TURN_KEY_ID
+        - TURN_KEY_API_TOKEN
+
+     Args:
+         key_id: Cloudflare Turn Token ID (optional, will use env var if not provided)
+         api_token: Cloudflare API Token (optional, will use env var if not provided)
+         ttl: Time-to-live for credentials in seconds (default: 24 hours)
+     """
+     key_id = key_id or os.environ.get("TURN_KEY_ID")
+     api_token = api_token or os.environ.get("TURN_KEY_API_TOKEN")
+
+     if not key_id or not api_token:
+         raise ValueError("Cloudflare credentials not found. Set TURN_KEY_ID and TURN_KEY_API_TOKEN env vars")
+
+     response = requests.post(
+         f"https://rtc.live.cloudflare.com/v1/turn/keys/{key_id}/credentials/generate",
+         headers={
+             "Authorization": f"Bearer {api_token}",
+             "Content-Type": "application/json",
+         },
+         json={"ttl": ttl},
+     )
+
+     if response.ok:
+         return {"iceServers": [response.json()["iceServers"]]}
+     else:
+         raise Exception(
+             f"Failed to get Cloudflare TURN credentials: {response.status_code} {response.text}"
+         )
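
A usage sketch, assuming HF_TOKEN is set in the environment as the docstring above requires; app.py only calls this when APP_MODE=deployed:

from utils.turn_server import get_rtc_credentials

rtc_config = get_rtc_credentials(provider="hf")
# The returned dict is what app.py passes to Stream(rtc_configuration=...)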