Sofia Casadei committed on
Commit 489ba9a · 1 Parent(s): 953b94e

add: big screen ui

Files changed (2)
  1. index-screen.html +632 -0
  2. main.py +26 -22
index-screen.html ADDED
@@ -0,0 +1,632 @@
+ <!DOCTYPE html>
+ <html lang="en">
+
+ <head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>Real-time Whisper Transcription</title>
+ <style>
+ :root {
+ --background-dark: #000000;
+ --text-light: #ffffff;
+ }
+
+ body {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+ margin: 0; /* Removes default margin */
+ padding: 0; /* Removes default padding */
+ background-color: var(--background-dark); /* Sets background to black */
+ color: var(--text-light); /* Sets text to white */
+ min-height: 100vh; /* Ensures page fills entire viewport height */
+ }
+
+ /* Hide the header in presentation mode */
+ .hero {
+ display: none; /* Hides the hero section completely */
+ }
+
+ .container {
+ max-width: 100%; /* Makes container full width */
+ margin: 0; /* Removes margin */
+ padding: 1rem; /* Adds small padding all around */
+ }
+
+ /* Base styling for transcript container */
+ .transcript-container {
+ height: 90vh; /* Sets height to 90% of viewport height */
+ border: none; /* Removes border */
+ padding: 2rem; /* Adds generous padding inside */
+ background: var(--background-dark); /* Ensures background is black */
+ color: var(--text-light); /* Ensures text is white */
+ overflow-y: auto; /* Enables vertical scrolling when content overflows */
+ margin-bottom: 0; /* Removes bottom margin */
+ display: block; /* Makes element a block to take full width */
+ width: 100%; /* Sets width to 100% */
+ }
+
+ /* Styling for transcript paragraphs */
+ .transcript-container p {
+ margin: 0.5rem 0; /* Small vertical margin between paragraphs */
+ padding: 0.5rem 0; /* Small vertical padding within paragraphs */
+ background: transparent; /* Transparent background (no highlighting) */
+ border-radius: 0; /* No rounded corners */
+ line-height: 1.6; /* Increases line spacing for readability */
+ font-size: 3.5rem; /* rem means relative to the root font size */
+ font-weight: 500; /* 500 = medium weight, 700 = bold */
+ max-width: 98%; /* Full width within container */
+ white-space: normal; /* Allows text to wrap normally */
+ word-wrap: break-word; /* Prevents overflow of long words */
+ color: white; /* Explicitly sets text color to white */
+ display: block; /* Each paragraph takes full width */
+ }
+
+ /* Current paragraph styling - slightly brighter for emphasis */
+ .transcript-container p.current {
+ background: transparent; /* No background color */
+ color: rgba(255, 255, 255, 1.0); /* Full brightness white for current text */
+ }
+
+ /* Ensure all paragraphs have full opacity (keeps history visible) */
+ .transcript-container p:nth-last-child(n+4) {
+ opacity: 1.0; /* Shows all paragraphs at full opacity */
+ }
+
+ /* Controls for starting/stopping transcription */
+ .controls {
+ position: fixed; /* Fixes controls to viewport */
+ bottom: 2rem; /* Positions 2rem from bottom */
+ right: 2rem; /* Positions 2rem from right */
+ margin: 0; /* No margin */
+ opacity: 0.8; /* Slightly transparent when not hovered */
+ transition: opacity 0.3s ease; /* Smooth transition for opacity changes */
+ z-index: 1000; /* Ensures controls appear above other elements */
+ }
+
+ .controls:hover {
+ opacity: 1; /* Full opacity on hover */
+ }
+
+ /* Button styling - orange with black text for good contrast */
+ button {
+ background: rgba(249, 164, 92, 1.0); /* Solid orange background */
+ backdrop-filter: blur(5px); /* Blur effect for elements behind */
+ font-size: 1.2rem; /* Large text */
+ min-width: 160px; /* Minimum width for button */
+ padding: 15px 30px; /* Generous padding inside button */
+ color: black !important; /* Forces black text color */
+ font-weight: bold; /* Bold text for better visibility */
+ border: 2px solid rgba(255, 255, 255, 0.2); /* Subtle border */
+ border-radius: 8px; /* Rounded corners */
+ cursor: pointer; /* Shows pointer cursor on hover */
+ transition: all 0.2s ease; /* Smooth transition for hover effects */
+ display: block; /* Makes button take up full width */
+ }
+
+ button:hover {
+ background: rgba(249, 164, 92, 0.9); /* Slightly more transparent on hover */
+ transform: translateY(-2px); /* Slight upward movement on hover */
+ }
+
+ /* Spinner animation for loading state */
+ .icon-with-spinner .spinner {
+ border: 3px solid black; /* Spinner border */
+ border-top: 3px solid transparent; /* Transparent top creates spinning effect */
+ border-radius: 50%; /* Makes it circular */
+ width: 24px; /* Width of spinner */
+ height: 24px; /* Height of spinner */
+ animation: spin 1s linear infinite; /* Animation for spinning effect */
+ }
+
+ @keyframes spin {
+ 0% { transform: rotate(0deg); } /* Starting rotation */
+ 100% { transform: rotate(360deg); } /* Full 360° rotation */
+ }
+
+ /* Recording indicator pulse animation */
+ .pulse-circle {
+ display: inline-block; /* Allows other elements inline */
+ width: 12px; /* Width of pulse circle */
+ height: 12px; /* Height of pulse circle */
+ border-radius: 50%; /* Makes it circular */
+ background-color: red; /* Red color for recording indicator */
+ margin-right: 8px; /* Space to right of circle */
+ animation: pulse 1.5s ease infinite; /* Continuous pulsing animation */
+ }
+
+ @keyframes pulse {
+ 0% { transform: scale(0.95); opacity: 0.7; } /* Slightly smaller and transparent */
+ 50% { transform: scale(1.1); opacity: 1; } /* Larger and fully opaque */
+ 100% { transform: scale(0.95); opacity: 0.7; } /* Back to starting state */
+ }
+
+ /* Custom scrollbar styling */
+ .transcript-container::-webkit-scrollbar {
+ width: 8px; /* Width of scrollbar */
+ }
+
+ .transcript-container::-webkit-scrollbar-track {
+ background: var(--background-dark); /* Black scrollbar track */
+ }
+
+ .transcript-container::-webkit-scrollbar-thumb {
+ background: rgba(249, 164, 92, 0.3); /* Semi-transparent orange scrollbar thumb */
+ border-radius: 4px; /* Rounded corners on scrollbar thumb */
+ }
+
+ /* Error toast styling */
+ .toast {
+ background: rgba(0, 0, 0, 0.8); /* Semi-transparent black background */
+ backdrop-filter: blur(5px); /* Blur effect behind toast */
+ color: var(--text-light); /* White text */
+ font-size: 1.2rem; /* Large text size */
+ }
+ </style>
+ </head>
+
+ <body>
+ <!-- Error message container that slides in when needed -->
+ <div id="error-toast" class="toast"></div>
+ <!-- Header section (hidden in presentation mode) -->
+ <div class="hero">
+ <h1>Real-time Transcription</h1>
+ <p>Powered by FastRTC and Local Whisper 🤗</p>
+ </div>
+
+ <!-- Main content container -->
+ <div class="container">
+ <!-- Container for transcript text -->
+ <div class="transcript-container" id="transcript"></div>
+ <!-- Controls for starting/stopping recording -->
+ <div class="controls">
+ <button id="start-button">Start Recording</button>
+ </div>
+ </div>
+
+ <script>
+ // Global variables for WebRTC connection
+ let peerConnection; // Stores the WebRTC connection object for audio streaming
+ let webrtc_id; // A unique ID to identify this connection on the server
+ let audioContext, analyser, audioSource; // Audio processing objects for visualization
+ let audioLevel = 0; // Stores the current audio level (volume) from 0-1
+ let animationFrame; // Reference to the animation frame for audio visualization
+ let isRecording = false; // Tracks whether we're currently recording or not
+ let eventSource; // Object that receives transcription results from the server
+
+ // DOM element references
+ const startButton = document.getElementById('start-button'); // The button to start/stop recording
+ const transcriptDiv = document.getElementById('transcript'); // The container for transcription text
+
+ // Variables for managing the transcript display
+ let currentParagraph = null; // Reference to the current paragraph being updated
+ let lastUpdateTime = Date.now(); // Timestamp of when we last updated the transcript
+
+ // Show error messages to the user in a toast notification
+ function showError(message) {
+ const toast = document.getElementById('error-toast'); // Get the toast element
+ toast.textContent = message; // Set the error message
+ toast.style.display = 'block'; // Make the toast visible
+
+ // Hide toast after 5 seconds
+ setTimeout(() => {
+ toast.style.display = 'none'; // Hide the toast
+ }, 5000);
+ }
+
+ // Handle messages received from the server through WebRTC data channel
+ function handleMessage(event) {
+ // Parse JSON message
+ const eventJson = JSON.parse(event.data);
+ // Display errors to the user
+ if (eventJson.type === "error") {
+ showError(eventJson.message);
+ }
+ // Log all messages to console for debugging
+ console.log('Received message:', event.data);
+ }
+
+ // Update button appearance based on connection state
+ function updateButtonState() {
+ // If connecting, show spinner
+ if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
+ startButton.innerHTML = `
+ <div class="icon-with-spinner">
+ <div class="spinner"></div>
+ <span>Connecting...</span>
+ </div>
+ `;
+ isRecording = false; // Not recording while connecting
+ // If connected, show pulsing recording indicator
+ } else if (peerConnection && peerConnection.connectionState === 'connected') {
+ startButton.innerHTML = `
+ <div class="pulse-container">
+ <div class="pulse-circle"></div>
+ <span>Stop Recording</span>
+ </div>
+ `;
+ isRecording = true; // Set recording state to true
+ // Default state - ready to start
+ } else {
+ startButton.innerHTML = 'Start Recording';
+ isRecording = false; // Not recording when not connected
+ }
+ }
+
+ // Set up audio visualization to show when the user is speaking
+ function setupAudioVisualization(stream) {
+ // Create or resume the audio context
+ if (!audioContext) {
+ // Create new audio context with browser compatibility handling
+ audioContext = new (window.AudioContext || window.webkitAudioContext)();
+ } else {
+ // Resume context if it was suspended
+ if (audioContext.state === 'suspended') {
+ audioContext.resume();
+ }
+ }
+
+ // Create audio analyzer for processing audio data
+ analyser = audioContext.createAnalyser();
+ // Create media source from microphone stream
+ audioSource = audioContext.createMediaStreamSource(stream);
+ // Connect source to analyzer
+ audioSource.connect(analyser);
+ // Set FFT size (controls frequency data resolution)
+ analyser.fftSize = 64;
+ // Create array to store frequency data
+ const dataArray = new Uint8Array(analyser.frequencyBinCount);
+
+ // Function to continuously update audio level visualization
+ function updateAudioLevel() {
+ // Get audio frequency data
+ analyser.getByteFrequencyData(dataArray);
+ // Calculate average volume across all frequencies
+ const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
+ // Convert to 0-1 scale
+ audioLevel = average / 255;
+
+ // Update pulse circle size based on audio level
+ const pulseCircle = document.querySelector('.pulse-circle');
+ if (pulseCircle) {
+ pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
+ }
+
+ // Continue animation loop
+ animationFrame = requestAnimationFrame(updateAudioLevel);
+ }
+ // Start audio visualization loop
+ updateAudioLevel();
+ }
+
+ // Set up WebRTC connection for streaming audio to server
+ async function setupWebRTC() {
+ // Get WebRTC configuration from global variable
+ const config = __RTC_CONFIGURATION__;
+ // Create new peer connection
+ peerConnection = new RTCPeerConnection(config);
+
+ // Set connection timeout (15 seconds)
+ const connectionTimeout = setTimeout(() => {
+ if (peerConnection && peerConnection.connectionState !== 'connected') {
+ showError('Connection timeout. Please check your network and try again.');
+ stop(); // Stop connection attempt
+ }
+ }, 15000);
+
+ // Set warning for slow connection (5 seconds)
+ const timeoutId = setTimeout(() => {
+ const toast = document.getElementById('error-toast');
+ toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
+ toast.className = 'toast warning';
+ toast.style.display = 'block';
+
+ // Hide warning after 5 seconds
+ setTimeout(() => {
+ toast.style.display = 'none';
+ }, 5000);
+ }, 5000);
+
+ try {
+ // Request access to user's microphone
+ const stream = await navigator.mediaDevices.getUserMedia({
+ audio: true // Only request audio access
+ });
+
+ // Set up audio visualization
+ setupAudioVisualization(stream);
+
+ // Add audio tracks to WebRTC connection
+ stream.getTracks().forEach(track => {
+ peerConnection.addTrack(track, stream);
+ });
+
+ // Monitor connection state changes
+ peerConnection.addEventListener('connectionstatechange', () => {
+ // Log state changes
+ console.log('connectionstatechange', peerConnection.connectionState);
+
+ // Handle successful connection
+ if (peerConnection.connectionState === 'connected') {
+ clearTimeout(timeoutId);
+ clearTimeout(connectionTimeout);
+ const toast = document.getElementById('error-toast');
+ toast.style.display = 'none';
+ // Handle connection failures
+ } else if (peerConnection.connectionState === 'failed' ||
+ peerConnection.connectionState === 'disconnected' ||
+ peerConnection.connectionState === 'closed') {
+ showError('Connection lost. Please try again.');
+ stop();
+ }
+ // Update button appearance
+ updateButtonState();
+ });
+
+ // Create data channel for server messages
+ const dataChannel = peerConnection.createDataChannel('text');
+ dataChannel.onmessage = handleMessage; // Set message handler
+
+ // Create connection offer
+ const offer = await peerConnection.createOffer();
+ // Set local description (our end of connection)
+ await peerConnection.setLocalDescription(offer);
+
+ // Wait for ICE gathering to complete (finding connection methods)
+ await new Promise((resolve) => {
+ if (peerConnection.iceGatheringState === "complete") {
+ resolve(); // Already complete
+ } else {
+ // Function to check ICE gathering state
+ const checkState = () => {
+ if (peerConnection.iceGatheringState === "complete") {
+ peerConnection.removeEventListener("icegatheringstatechange", checkState);
+ resolve(); // Complete gathering
+ }
+ };
+ // Listen for ICE gathering state changes
+ peerConnection.addEventListener("icegatheringstatechange", checkState);
+ }
+ });
+
+ // Generate random ID for this connection
+ webrtc_id = Math.random().toString(36).substring(7);
+
+ // Send connection offer to server
+ const response = await fetch('/webrtc/offer', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ sdp: peerConnection.localDescription.sdp, // Session description
+ type: peerConnection.localDescription.type, // Offer type
+ webrtc_id: webrtc_id // Unique connection ID
+ })
+ });
+
+ // Parse server response
+ const serverResponse = await response.json();
+
+ // Handle server errors
+ if (serverResponse.status === 'failed') {
+ showError(serverResponse.meta.error === 'concurrency_limit_reached'
+ ? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
+ : serverResponse.meta.error);
+ stop();
+ startButton.textContent = 'Start Recording';
+ return;
+ }
+
+ // Complete connection with server's description
+ await peerConnection.setRemoteDescription(serverResponse);
+
+ // Create event source for receiving transcription results
+ eventSource = new EventSource('/transcript?webrtc_id=' + webrtc_id);
+ // Handle event source errors
+ eventSource.onerror = (event) => {
+ console.error("EventSource error:", event);
+ showError("Transcription connection lost. Please try again.");
+ };
+ // Process transcription results as they arrive
+ eventSource.addEventListener("output", (event) => {
+ console.log("Received transcript chunk:", event.data);
+ // Add text to display
+ appendTranscript(event.data);
+ //appendTranscriptSimple(event.data);
+ });
+ } catch (err) {
+ // Handle any setup errors
+ clearTimeout(timeoutId);
+ console.error('Error setting up WebRTC:', err);
+ showError('Failed to establish connection. Please try again.');
+ stop();
+ startButton.textContent = 'Start Recording';
+ }
+ }
+
+ function appendTranscriptSimple(text) {
+ const p = document.createElement('p');
+ p.textContent = text;
+ transcriptDiv.appendChild(p);
+ transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
+ }
+
+ // Add transcription text to display
+ function appendTranscript(text) {
+ // Clean up text
+ const formattedText = text.trim();
+ if (!formattedText) return;
+
+ const now = Date.now();
+ const timeSinceLastUpdate = now - lastUpdateTime;
+ lastUpdateTime = now;
+
+ // Handle transcript display
+ if (!currentParagraph) {
+ // Create new paragraph
+ currentParagraph = document.createElement('p');
+ currentParagraph.classList.add('current');
+ transcriptDiv.appendChild(currentParagraph);
+ currentParagraph.textContent = formattedText;
+ } else {
+ // Get current text
+ const currentText = currentParagraph.textContent;
+
+ // Fix spacing issues by normalizing
+ let cleanedText = formattedText;
+
+ // 1. Check for simple word repetition - last word repeated
+ const words = currentText.split(/\s+/);
+ const lastWord = words[words.length - 1].replace(/[^\w]/g, '').toLowerCase();
+
+ if (lastWord && lastWord.length > 2) {
+ // Check if new text starts with the same word
+ const regex = new RegExp(`^${lastWord}`, 'i');
+ if (regex.test(cleanedText.replace(/[^\w]/g, ''))) {
+ // Remove the first word if it's a duplicate
+ cleanedText = cleanedText.replace(regex, '').trim();
+ }
+ }
+
+ // 2. Add proper spacing
+ let finalText = currentText;
+
+ // Only add space if current text doesn't end with space or punctuation
+ // and new text doesn't start with punctuation
+ if (!/[\s.,!?]$/.test(finalText) && !/^[.,!?]/.test(cleanedText) && cleanedText) {
+ finalText += ' ';
+ }
+
+ // 3. Add the cleaned text
+ finalText += cleanedText;
+
+ // 4. Fix any run-together words by adding spaces after punctuation
+ finalText = finalText.replace(/([.,!?])([a-zA-Z])/g, '$1 $2');
+
+ // Update the paragraph text
+ currentParagraph.textContent = finalText;
+ }
+
+ // Create new paragraph on sentence end or pause
+ if (/[.!?]$/.test(formattedText) || timeSinceLastUpdate > 5000) {
+ // End current paragraph
+ if (currentParagraph) {
+ currentParagraph.classList.remove('current');
+ }
+
+ // Prepare for next paragraph
+ currentParagraph = null;
+ }
+
+ // Limit number of displayed paragraphs
+ const paragraphs = transcriptDiv.getElementsByTagName('p');
+ while (paragraphs.length > 10) { // Keep last 10 paragraphs
+ transcriptDiv.removeChild(paragraphs[0]);
+ }
+
+ // Scroll to show newest text
+ requestAnimationFrame(() => {
+ transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
+ });
+ }
+
+ // Stop recording and clean up resources
+ function stop() {
+ // Stop audio visualization
+ if (animationFrame) {
+ cancelAnimationFrame(animationFrame);
+ animationFrame = null;
+ }
+
+ // Pause audio processing
+ if (audioContext) {
+ audioContext.suspend();
+ }
+
+ // Stop all media tracks
+ if (peerConnection) {
+ const senders = peerConnection.getSenders();
+ if (senders) {
+ senders.forEach(sender => {
+ if (sender.track) {
+ sender.track.stop(); // Release microphone
+ }
+ });
+ }
+
+ // Close WebRTC connection
+ peerConnection.close();
+ peerConnection = null;
+ }
+
+ // Close transcription connection
+ if (eventSource) {
+ eventSource.close();
+ eventSource = null;
+ }
+
+ // Reset audio level
+ audioLevel = 0;
+ // Update button display
+ updateButtonState();
+
+ // Ask about clearing transcript
+ if (window.confirm('Clear transcript?')) {
+ // Clear all transcript text
+ transcriptDiv.innerHTML = '';
+ currentParagraph = null;
+ } else {
+ // Just end current paragraph
+ if (currentParagraph) {
+ currentParagraph.classList.remove('current');
+ currentParagraph = null;
+ }
+ }
+
+ // Reset timestamp
+ lastUpdateTime = Date.now();
+ }
+
+ // Clean up resources when page is closed
+ window.addEventListener('beforeunload', () => {
+ stop(); // Stop recording and release resources
+ });
+
+ // Handle start/stop button clicks
+ startButton.addEventListener('click', () => {
+ if (!isRecording) {
+ // Start recording if not already recording
+ setupWebRTC();
+ } else {
+ // Stop recording if currently recording
+ stop();
+ }
+ });
+
+ // Initialize UI when page loads
+ document.addEventListener('DOMContentLoaded', () => {
+ // Ensure all UI elements are visible
+ const elementsToCheck = [
+ transcriptDiv,
+ startButton,
+ document.getElementById('error-toast')
+ ];
+
+ // Set appropriate display for each element
+ elementsToCheck.forEach(el => {
+ if (el) {
+ // Set appropriate display style based on element type
+ el.style.display = el.tagName.toLowerCase() === 'button' ? 'block' :
+ (el.id === 'transcript' ? 'block' : 'none');
+ }
+ });
+
+ // Apply CSS variables to ensure theme is working
+ document.body.style.backgroundColor = 'var(--background-dark)';
+ document.body.style.color = 'var(--text-light)';
+
+ // Force button colors for consistency
+ startButton.style.backgroundColor = 'rgba(249, 164, 92, 1.0)';
+ startButton.style.color = 'black';
+ });
+ </script>
+ </body>
+
+ </html>
main.py CHANGED
@@ -14,7 +14,7 @@ from fastrtc import (
  ReplyOnPause,
  Stream,
  AlgoOptions,
- #SileroVadOptions,
+ SileroVadOptions,
  audio_to_bytes,
  )
  from transformers import (
@@ -34,6 +34,11 @@ setup_logging(level=logging.DEBUG)
  logger = logging.getLogger(__name__)


+ APP_MODE = os.getenv("APP_MODE", "deployed")
+ MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
+ UI_FILE = os.getenv("UI_FILE", "index.html")
+
+
  device = get_device(force_cpu=False)
  torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
  logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
@@ -44,10 +49,9 @@ logger.info(f"CUDA Version: {cuda_version}, GPU Device: {device_name}")
  attention = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
  logger.info(f"Using attention: {attention}")

- model_id = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
- logger.info(f"Loading Whisper model: {model_id}")
+ logger.info(f"Loading Whisper model: {MODEL_ID}")
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
- model_id,
+ MODEL_ID,
  torch_dtype=torch_dtype,
  low_cpu_mem_usage=True,
  use_safetensors=True,
@@ -55,7 +59,7 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
  )
  model.to(device)

- processor = AutoProcessor.from_pretrained(model_id)
+ processor = AutoProcessor.from_pretrained(MODEL_ID)

  transcribe_pipeline = pipeline(
  task="automatic-speech-recognition",
@@ -102,20 +106,20 @@ stream = Stream(
  # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
  speech_threshold=0.1,
  ),
- #model_options=SileroVadOptions(
- # # Threshold for what is considered speech (default 0.5)
- # threshold=0.5,
- # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
- # min_speech_duration_ms=250,
- # Max duration of speech chunks, longer will be split (default float('inf'))
- # max_speech_duration_s=30,
- # Wait for ms at the end of each speech chunk before separating it (default 2000)
- # min_silence_duration_ms=2000,
- # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
- # window_size_samples=1024,
- # Final speech chunks are padded by speech_pad_ms each side (default 400)
- # speech_pad_ms=400,
- #),
+ model_options=SileroVadOptions(
+ # Threshold for what is considered speech (default 0.5)
+ threshold=0.5,
+ # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
+ min_speech_duration_ms=250,
+ # Max duration of speech chunks, longer will be split (default float('inf'))
+ max_speech_duration_s=15,
+ # Wait for ms at the end of each speech chunk before separating it (default 2000)
+ min_silence_duration_ms=2000,
+ # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
+ window_size_samples=1024,
+ # Final speech chunks are padded by speech_pad_ms each side (default 400)
+ speech_pad_ms=400,
+ ),
  ),
  # send-receive: bidirectional streaming (default)
  # send: client to server only
@@ -126,7 +130,7 @@ stream = Stream(
  gr.Textbox(label="Transcript"),
  ],
  additional_outputs_handler=lambda current, new: current + " " + new,
- rtc_configuration=get_rtc_credentials(provider="hf") if os.getenv("APP_MODE") == "deployed" else None,
+ rtc_configuration=get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None,
  concurrency_limit=6
  )

@@ -135,8 +139,8 @@ stream.mount(app)

  @app.get("/")
  async def index():
- html_content = open("index.html").read()
- rtc_config = get_rtc_credentials(provider="hf") if os.getenv("APP_MODE") == "deployed" else None
+ html_content = open(UI_FILE).read()
+ rtc_config = get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None
  return HTMLResponse(content=html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)))

  @app.get("/transcript")
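
Note: the diff ends at the /transcript route, which is unchanged by this commit, but index-screen.html shows what it must provide: a server-sent-events stream whose "output" events carry each transcript chunk consumed by the EventSource listener above. The following is only a minimal sketch of such a route, assuming FastRTC's Stream.output_stream() generator as shown in its documented examples; the names transcript_endpoint, event_stream, and transcript_chunk are illustrative and not taken from this repo.

from fastapi.responses import StreamingResponse

@app.get("/transcript")
async def transcript_endpoint(webrtc_id: str):
    # Relay each transcription chunk produced by the handler as an SSE "output"
    # event, matching the eventSource.addEventListener("output", ...) call in
    # index-screen.html. (Sketch only; assumes stream.output_stream(webrtc_id)
    # yields objects whose first arg is the transcript text.)
    async def event_stream():
        async for output in stream.output_stream(webrtc_id):
            transcript_chunk = output.args[0]
            yield f"event: output\ndata: {transcript_chunk}\n\n"
    return StreamingResponse(event_stream(), media_type="text/event-stream")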