Sofia Casadei committed on
Commit 489ba9a · 1 Parent(s): 953b94e

add: big screen ui

Files changed (2)
  1. index-screen.html +632 -0
  2. main.py +26 -22
index-screen.html ADDED
@@ -0,0 +1,632 @@
+ <!DOCTYPE html>
+ <html lang="en">
+
+ <head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>Real-time Whisper Transcription</title>
+ <style>
+ :root {
+ --background-dark: #000000;
+ --text-light: #ffffff;
+ }
+
+ body {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+ margin: 0; /* Removes default margin */
+ padding: 0; /* Removes default padding */
+ background-color: var(--background-dark); /* Sets background to black */
+ color: var(--text-light); /* Sets text to white */
+ min-height: 100vh; /* Ensures page fills entire viewport height */
+ }
+
+ /* Hide the header in presentation mode */
+ .hero {
+ display: none; /* Hides the hero section completely */
+ }
+
+ .container {
+ max-width: 100%; /* Makes container full width */
+ margin: 0; /* Removes margin */
+ padding: 1rem; /* Adds small padding all around */
+ }
+
+ /* Base styling for transcript container */
+ .transcript-container {
+ height: 90vh; /* Sets height to 90% of viewport height */
+ border: none; /* Removes border */
+ padding: 2rem; /* Adds generous padding inside */
+ background: var(--background-dark); /* Ensures background is black */
+ color: var(--text-light); /* Ensures text is white */
+ overflow-y: auto; /* Enables vertical scrolling when content overflows */
+ margin-bottom: 0; /* Removes bottom margin */
+ display: block; /* Makes element a block to take full width */
+ width: 100%; /* Sets width to 100% */
+ }
+
+ /* Styling for transcript paragraphs */
+ .transcript-container p {
+ margin: 0.5rem 0; /* Small vertical margin between paragraphs */
+ padding: 0.5rem 0; /* Small vertical padding within paragraphs */
+ background: transparent; /* Transparent background (no highlighting) */
+ border-radius: 0; /* No rounded corners */
+ line-height: 1.6; /* Increases line spacing for readability */
+ font-size: 3.5rem; /* rem means relative to the root font size */
+ font-weight: 500; /* 500 = medium weight, 700 = bold */
+ max-width: 98%; /* Full width within container */
+ white-space: normal; /* Allows text to wrap normally */
+ word-wrap: break-word; /* Prevents overflow of long words */
+ color: white; /* Explicitly sets text color to white */
+ display: block; /* Each paragraph takes full width */
+ }
+
+ /* Current paragraph styling - slightly brighter for emphasis */
+ .transcript-container p.current {
+ background: transparent; /* No background color */
+ color: rgba(255, 255, 255, 1.0); /* Full brightness white for current text */
+ }
+
+ /* Ensure all paragraphs have full opacity (keeps history visible) */
+ .transcript-container p:nth-last-child(n+4) {
+ opacity: 1.0; /* Shows all paragraphs at full opacity */
+ }
+
+ /* Controls for starting/stopping transcription */
+ .controls {
+ position: fixed; /* Fixes controls to viewport */
+ bottom: 2rem; /* Positions 2rem from bottom */
+ right: 2rem; /* Positions 2rem from right */
+ margin: 0; /* No margin */
+ opacity: 0.8; /* Slightly transparent when not hovered */
+ transition: opacity 0.3s ease; /* Smooth transition for opacity changes */
+ z-index: 1000; /* Ensures controls appear above other elements */
+ }
+
+ .controls:hover {
+ opacity: 1; /* Full opacity on hover */
+ }
+
+ /* Button styling - orange with black text for good contrast */
+ button {
+ background: rgba(249, 164, 92, 1.0); /* Solid orange background */
+ backdrop-filter: blur(5px); /* Blur effect for elements behind */
+ font-size: 1.2rem; /* Large text */
+ min-width: 160px; /* Minimum width for button */
+ padding: 15px 30px; /* Generous padding inside button */
+ color: black !important; /* Forces black text color */
+ font-weight: bold; /* Bold text for better visibility */
+ border: 2px solid rgba(255, 255, 255, 0.2); /* Subtle border */
+ border-radius: 8px; /* Rounded corners */
+ cursor: pointer; /* Shows pointer cursor on hover */
+ transition: all 0.2s ease; /* Smooth transition for hover effects */
+ display: block; /* Makes button take up full width */
+ }
+
+ button:hover {
+ background: rgba(249, 164, 92, 0.9); /* Slightly more transparent on hover */
+ transform: translateY(-2px); /* Slight upward movement on hover */
+ }
+
+ /* Spinner animation for loading state */
+ .icon-with-spinner .spinner {
+ border: 3px solid black; /* Spinner border */
+ border-top: 3px solid transparent; /* Transparent top creates spinning effect */
+ border-radius: 50%; /* Makes it circular */
+ width: 24px; /* Width of spinner */
+ height: 24px; /* Height of spinner */
+ animation: spin 1s linear infinite; /* Animation for spinning effect */
+ }
+
+ @keyframes spin {
+ 0% { transform: rotate(0deg); } /* Starting rotation */
+ 100% { transform: rotate(360deg); } /* Full 360° rotation */
+ }
+
+ /* Recording indicator pulse animation */
+ .pulse-circle {
+ display: inline-block; /* Allows other elements inline */
+ width: 12px; /* Width of pulse circle */
+ height: 12px; /* Height of pulse circle */
+ border-radius: 50%; /* Makes it circular */
+ background-color: red; /* Red color for recording indicator */
+ margin-right: 8px; /* Space to right of circle */
+ animation: pulse 1.5s ease infinite; /* Continuous pulsing animation */
+ }
+
+ @keyframes pulse {
+ 0% { transform: scale(0.95); opacity: 0.7; } /* Slightly smaller and transparent */
+ 50% { transform: scale(1.1); opacity: 1; } /* Larger and fully opaque */
+ 100% { transform: scale(0.95); opacity: 0.7; } /* Back to starting state */
+ }
+
+ /* Custom scrollbar styling */
+ .transcript-container::-webkit-scrollbar {
+ width: 8px; /* Width of scrollbar */
+ }
+
+ .transcript-container::-webkit-scrollbar-track {
+ background: var(--background-dark); /* Black scrollbar track */
+ }
+
+ .transcript-container::-webkit-scrollbar-thumb {
+ background: rgba(249, 164, 92, 0.3); /* Semi-transparent orange scrollbar thumb */
+ border-radius: 4px; /* Rounded corners on scrollbar thumb */
+ }
+
+ /* Error toast styling */
+ .toast {
+ background: rgba(0, 0, 0, 0.8); /* Semi-transparent black background */
+ backdrop-filter: blur(5px); /* Blur effect behind toast */
+ color: var(--text-light); /* White text */
+ font-size: 1.2rem; /* Large text size */
+ }
+ </style>
+ </head>
+
+ <body>
+ <!-- Error message container that slides in when needed -->
+ <div id="error-toast" class="toast"></div>
+ <!-- Header section (hidden in presentation mode) -->
+ <div class="hero">
+ <h1>Real-time Transcription</h1>
+ <p>Powered by FastRTC and Local Whisper 🤗</p>
+ </div>
+
+ <!-- Main content container -->
+ <div class="container">
+ <!-- Container for transcript text -->
+ <div class="transcript-container" id="transcript"></div>
+ <!-- Controls for starting/stopping recording -->
+ <div class="controls">
+ <button id="start-button">Start Recording</button>
+ </div>
+ </div>
+
+ <script>
+ // Global variables for WebRTC connection
+ let peerConnection; // Stores the WebRTC connection object for audio streaming
+ let webrtc_id; // A unique ID to identify this connection on the server
+ let audioContext, analyser, audioSource; // Audio processing objects for visualization
+ let audioLevel = 0; // Stores the current audio level (volume) from 0-1
+ let animationFrame; // Reference to the animation frame for audio visualization
+ let isRecording = false; // Tracks whether we're currently recording or not
+ let eventSource; // Object that receives transcription results from the server
+
+ // DOM element references
+ const startButton = document.getElementById('start-button'); // The button to start/stop recording
+ const transcriptDiv = document.getElementById('transcript'); // The container for transcription text
+
+ // Variables for managing the transcript display
+ let currentParagraph = null; // Reference to the current paragraph being updated
+ let lastUpdateTime = Date.now(); // Timestamp of when we last updated the transcript
+
+ // Show error messages to the user in a toast notification
+ function showError(message) {
+ const toast = document.getElementById('error-toast'); // Get the toast element
+ toast.textContent = message; // Set the error message
+ toast.style.display = 'block'; // Make the toast visible
+
+ // Hide toast after 5 seconds
+ setTimeout(() => {
+ toast.style.display = 'none'; // Hide the toast
+ }, 5000);
+ }
+
+ // Handle messages received from the server through WebRTC data channel
+ function handleMessage(event) {
+ // Parse JSON message
+ const eventJson = JSON.parse(event.data);
+ // Display errors to the user
+ if (eventJson.type === "error") {
+ showError(eventJson.message);
+ }
+ // Log all messages to console for debugging
+ console.log('Received message:', event.data);
+ }
+
+ // Update button appearance based on connection state
+ function updateButtonState() {
+ // If connecting, show spinner
+ if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
+ startButton.innerHTML = `
+ <div class="icon-with-spinner">
+ <div class="spinner"></div>
+ <span>Connecting...</span>
+ </div>
+ `;
+ isRecording = false; // Not recording while connecting
+ // If connected, show pulsing recording indicator
+ } else if (peerConnection && peerConnection.connectionState === 'connected') {
+ startButton.innerHTML = `
+ <div class="pulse-container">
+ <div class="pulse-circle"></div>
+ <span>Stop Recording</span>
+ </div>
+ `;
+ isRecording = true; // Set recording state to true
+ // Default state - ready to start
+ } else {
+ startButton.innerHTML = 'Start Recording';
+ isRecording = false; // Not recording when not connected
+ }
+ }
+
+ // Set up audio visualization to show when the user is speaking
+ function setupAudioVisualization(stream) {
+ // Create or resume the audio context
+ if (!audioContext) {
+ // Create new audio context with browser compatibility handling
+ audioContext = new (window.AudioContext || window.webkitAudioContext)();
+ } else {
+ // Resume context if it was suspended
+ if (audioContext.state === 'suspended') {
+ audioContext.resume();
+ }
+ }
+
+ // Create audio analyzer for processing audio data
+ analyser = audioContext.createAnalyser();
+ // Create media source from microphone stream
+ audioSource = audioContext.createMediaStreamSource(stream);
+ // Connect source to analyzer
+ audioSource.connect(analyser);
+ // Set FFT size (controls frequency data resolution)
+ analyser.fftSize = 64;
+ // Create array to store frequency data
+ const dataArray = new Uint8Array(analyser.frequencyBinCount);
+
+ // Function to continuously update audio level visualization
+ function updateAudioLevel() {
+ // Get audio frequency data
+ analyser.getByteFrequencyData(dataArray);
+ // Calculate average volume across all frequencies
+ const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
+ // Convert to 0-1 scale
+ audioLevel = average / 255;
+
+ // Update pulse circle size based on audio level
+ const pulseCircle = document.querySelector('.pulse-circle');
+ if (pulseCircle) {
+ pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
+ }
+
+ // Continue animation loop
+ animationFrame = requestAnimationFrame(updateAudioLevel);
+ }
+ // Start audio visualization loop
+ updateAudioLevel();
+ }
+
+ // Set up WebRTC connection for streaming audio to server
+ async function setupWebRTC() {
+ // Get WebRTC configuration from global variable
+ const config = __RTC_CONFIGURATION__;
+ // Create new peer connection
+ peerConnection = new RTCPeerConnection(config);
+
+ // Set connection timeout (15 seconds)
+ const connectionTimeout = setTimeout(() => {
+ if (peerConnection && peerConnection.connectionState !== 'connected') {
+ showError('Connection timeout. Please check your network and try again.');
+ stop(); // Stop connection attempt
+ }
+ }, 15000);
+
+ // Set warning for slow connection (5 seconds)
+ const timeoutId = setTimeout(() => {
+ const toast = document.getElementById('error-toast');
+ toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
+ toast.className = 'toast warning';
+ toast.style.display = 'block';
+
+ // Hide warning after 5 seconds
+ setTimeout(() => {
+ toast.style.display = 'none';
+ }, 5000);
+ }, 5000);
+
+ try {
+ // Request access to user's microphone
+ const stream = await navigator.mediaDevices.getUserMedia({
+ audio: true // Only request audio access
+ });
+
+ // Set up audio visualization
+ setupAudioVisualization(stream);
+
+ // Add audio tracks to WebRTC connection
+ stream.getTracks().forEach(track => {
+ peerConnection.addTrack(track, stream);
+ });
+
+ // Monitor connection state changes
+ peerConnection.addEventListener('connectionstatechange', () => {
+ // Log state changes
+ console.log('connectionstatechange', peerConnection.connectionState);
+
+ // Handle successful connection
+ if (peerConnection.connectionState === 'connected') {
+ clearTimeout(timeoutId);
+ clearTimeout(connectionTimeout);
+ const toast = document.getElementById('error-toast');
+ toast.style.display = 'none';
+ // Handle connection failures
+ } else if (peerConnection.connectionState === 'failed' ||
+ peerConnection.connectionState === 'disconnected' ||
+ peerConnection.connectionState === 'closed') {
+ showError('Connection lost. Please try again.');
+ stop();
+ }
+ // Update button appearance
+ updateButtonState();
+ });
+
+ // Create data channel for server messages
+ const dataChannel = peerConnection.createDataChannel('text');
+ dataChannel.onmessage = handleMessage; // Set message handler
+
+ // Create connection offer
+ const offer = await peerConnection.createOffer();
+ // Set local description (our end of connection)
+ await peerConnection.setLocalDescription(offer);
+
+ // Wait for ICE gathering to complete (finding connection methods)
+ await new Promise((resolve) => {
+ if (peerConnection.iceGatheringState === "complete") {
+ resolve(); // Already complete
+ } else {
+ // Function to check ICE gathering state
+ const checkState = () => {
+ if (peerConnection.iceGatheringState === "complete") {
+ peerConnection.removeEventListener("icegatheringstatechange", checkState);
+ resolve(); // Complete gathering
+ }
+ };
+ // Listen for ICE gathering state changes
+ peerConnection.addEventListener("icegatheringstatechange", checkState);
+ }
+ });
+
+ // Generate random ID for this connection
+ webrtc_id = Math.random().toString(36).substring(7);
+
+ // Send connection offer to server
+ const response = await fetch('/webrtc/offer', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ sdp: peerConnection.localDescription.sdp, // Session description
+ type: peerConnection.localDescription.type, // Offer type
+ webrtc_id: webrtc_id // Unique connection ID
+ })
+ });
+
+ // Parse server response
+ const serverResponse = await response.json();
+
+ // Handle server errors
+ if (serverResponse.status === 'failed') {
+ showError(serverResponse.meta.error === 'concurrency_limit_reached'
+ ? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
+ : serverResponse.meta.error);
+ stop();
+ startButton.textContent = 'Start Recording';
+ return;
+ }
+
+ // Complete connection with server's description
+ await peerConnection.setRemoteDescription(serverResponse);
+
+ // Create event source for receiving transcription results
+ eventSource = new EventSource('/transcript?webrtc_id=' + webrtc_id);
+ // Handle event source errors
+ eventSource.onerror = (event) => {
+ console.error("EventSource error:", event);
+ showError("Transcription connection lost. Please try again.");
+ };
+ // Process transcription results as they arrive
+ eventSource.addEventListener("output", (event) => {
+ console.log("Received transcript chunk:", event.data);
+ // Add text to display
+ appendTranscript(event.data);
+ //appendTranscriptSimple(event.data);
+ });
+ } catch (err) {
+ // Handle any setup errors
+ clearTimeout(timeoutId);
+ console.error('Error setting up WebRTC:', err);
+ showError('Failed to establish connection. Please try again.');
+ stop();
+ startButton.textContent = 'Start Recording';
+ }
+ }
+
+ function appendTranscriptSimple(text) {
+ const p = document.createElement('p');
+ p.textContent = text;
+ transcriptDiv.appendChild(p);
+ transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
+ }
+
+ // Add transcription text to display
+ function appendTranscript(text) {
+ // Clean up text
+ const formattedText = text.trim();
+ if (!formattedText) return;
+
+ const now = Date.now();
+ const timeSinceLastUpdate = now - lastUpdateTime;
+ lastUpdateTime = now;
+
+ // Handle transcript display
+ if (!currentParagraph) {
+ // Create new paragraph
+ currentParagraph = document.createElement('p');
+ currentParagraph.classList.add('current');
+ transcriptDiv.appendChild(currentParagraph);
+ currentParagraph.textContent = formattedText;
+ } else {
+ // Get current text
+ const currentText = currentParagraph.textContent;
+
+ // Fix spacing issues by normalizing
+ let cleanedText = formattedText;
+
+ // 1. Check for simple word repetition - last word repeated
+ const words = currentText.split(/\s+/);
+ const lastWord = words[words.length - 1].replace(/[^\w]/g, '').toLowerCase();
+
+ if (lastWord && lastWord.length > 2) {
+ // Check if new text starts with the same word
+ const regex = new RegExp(`^${lastWord}`, 'i');
+ if (regex.test(cleanedText.replace(/[^\w]/g, ''))) {
+ // Remove the first word if it's a duplicate
+ cleanedText = cleanedText.replace(regex, '').trim();
+ }
+ }
+
+ // 2. Add proper spacing
+ let finalText = currentText;
+
+ // Only add space if current text doesn't end with space or punctuation
+ // and new text doesn't start with punctuation
+ if (!/[\s.,!?]$/.test(finalText) && !/^[.,!?]/.test(cleanedText) && cleanedText) {
+ finalText += ' ';
+ }
+
+ // 3. Add the cleaned text
+ finalText += cleanedText;
+
+ // 4. Fix any run-together words by adding spaces after punctuation
+ finalText = finalText.replace(/([.,!?])([a-zA-Z])/g, '$1 $2');
+
+ // Update the paragraph text
+ currentParagraph.textContent = finalText;
+ }
+
+ // Create new paragraph on sentence end or pause
+ if (/[.!?]$/.test(formattedText) || timeSinceLastUpdate > 5000) {
+ // End current paragraph
+ if (currentParagraph) {
+ currentParagraph.classList.remove('current');
+ }
+
+ // Prepare for next paragraph
+ currentParagraph = null;
+ }
+
+ // Limit number of displayed paragraphs
+ const paragraphs = transcriptDiv.getElementsByTagName('p');
+ while (paragraphs.length > 10) { // Keep last 10 paragraphs
+ transcriptDiv.removeChild(paragraphs[0]);
+ }
+
+ // Scroll to show newest text
+ requestAnimationFrame(() => {
+ transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
+ });
+ }
+
+ // Stop recording and clean up resources
+ function stop() {
+ // Stop audio visualization
+ if (animationFrame) {
+ cancelAnimationFrame(animationFrame);
+ animationFrame = null;
+ }
+
+ // Pause audio processing
+ if (audioContext) {
+ audioContext.suspend();
+ }
+
+ // Stop all media tracks
+ if (peerConnection) {
+ const senders = peerConnection.getSenders();
+ if (senders) {
+ senders.forEach(sender => {
+ if (sender.track) {
+ sender.track.stop(); // Release microphone
+ }
+ });
+ }
+
+ // Close WebRTC connection
+ peerConnection.close();
+ peerConnection = null;
+ }
+
+ // Close transcription connection
+ if (eventSource) {
+ eventSource.close();
+ eventSource = null;
+ }
+
+ // Reset audio level
+ audioLevel = 0;
+ // Update button display
+ updateButtonState();
+
+ // Ask about clearing transcript
+ if (window.confirm('Clear transcript?')) {
+ // Clear all transcript text
+ transcriptDiv.innerHTML = '';
+ currentParagraph = null;
+ } else {
+ // Just end current paragraph
+ if (currentParagraph) {
+ currentParagraph.classList.remove('current');
+ currentParagraph = null;
+ }
+ }
+
+ // Reset timestamp
+ lastUpdateTime = Date.now();
+ }
+
+ // Clean up resources when page is closed
+ window.addEventListener('beforeunload', () => {
+ stop(); // Stop recording and release resources
+ });
+
+ // Handle start/stop button clicks
+ startButton.addEventListener('click', () => {
+ if (!isRecording) {
+ // Start recording if not already recording
+ setupWebRTC();
+ } else {
+ // Stop recording if currently recording
+ stop();
+ }
+ });
+
+ // Initialize UI when page loads
+ document.addEventListener('DOMContentLoaded', () => {
+ // Ensure all UI elements are visible
+ const elementsToCheck = [
+ transcriptDiv,
+ startButton,
+ document.getElementById('error-toast')
+ ];
+
+ // Set appropriate display for each element
+ elementsToCheck.forEach(el => {
+ if (el) {
+ // Set appropriate display style based on element type
+ el.style.display = el.tagName.toLowerCase() === 'button' ? 'block' :
+ (el.id === 'transcript' ? 'block' : 'none');
+ }
+ });
+
+ // Apply CSS variables to ensure theme is working
+ document.body.style.backgroundColor = 'var(--background-dark)';
+ document.body.style.color = 'var(--text-light)';
+
+ // Force button colors for consistency
+ startButton.style.backgroundColor = 'rgba(249, 164, 92, 1.0)';
+ startButton.style.color = 'black';
+ });
+ </script>
+ </body>
+
+ </html>
main.py CHANGED
@@ -14,7 +14,7 @@ from fastrtc import (
  ReplyOnPause,
  Stream,
  AlgoOptions,
- #SileroVadOptions,
+ SileroVadOptions,
  audio_to_bytes,
  )
  from transformers import (
@@ -34,6 +34,11 @@ setup_logging(level=logging.DEBUG)
  logger = logging.getLogger(__name__)


+ APP_MODE = os.getenv("APP_MODE", "deployed")
+ MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
+ UI_FILE = os.getenv("UI_FILE", "index.html")
+
+
  device = get_device(force_cpu=False)
  torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
  logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
@@ -44,10 +49,9 @@ logger.info(f"CUDA Version: {cuda_version}, GPU Device: {device_name}")
  attention = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
  logger.info(f"Using attention: {attention}")

- model_id = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
- logger.info(f"Loading Whisper model: {model_id}")
+ logger.info(f"Loading Whisper model: {MODEL_ID}")
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
- model_id,
+ MODEL_ID,
  torch_dtype=torch_dtype,
  low_cpu_mem_usage=True,
  use_safetensors=True,
@@ -55,7 +59,7 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
  )
  model.to(device)

- processor = AutoProcessor.from_pretrained(model_id)
+ processor = AutoProcessor.from_pretrained(MODEL_ID)

  transcribe_pipeline = pipeline(
  task="automatic-speech-recognition",
@@ -102,20 +106,20 @@ stream = Stream(
  # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
  speech_threshold=0.1,
  ),
- #model_options=SileroVadOptions(
- # # Threshold for what is considered speech (default 0.5)
- # threshold=0.5,
- # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
- # min_speech_duration_ms=250,
- # Max duration of speech chunks, longer will be split (default float('inf'))
- # max_speech_duration_s=30,
- # Wait for ms at the end of each speech chunk before separating it (default 2000)
- # min_silence_duration_ms=2000,
- # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
- # window_size_samples=1024,
- # Final speech chunks are padded by speech_pad_ms each side (default 400)
- # speech_pad_ms=400,
- #),
+ model_options=SileroVadOptions(
+ # Threshold for what is considered speech (default 0.5)
+ threshold=0.5,
+ # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
+ min_speech_duration_ms=250,
+ # Max duration of speech chunks, longer will be split (default float('inf'))
+ max_speech_duration_s=15,
+ # Wait for ms at the end of each speech chunk before separating it (default 2000)
+ min_silence_duration_ms=2000,
+ # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
+ window_size_samples=1024,
+ # Final speech chunks are padded by speech_pad_ms each side (default 400)
+ speech_pad_ms=400,
+ ),
  ),
  # send-receive: bidirectional streaming (default)
  # send: client to server only
@@ -126,7 +130,7 @@ stream = Stream(
  gr.Textbox(label="Transcript"),
  ],
  additional_outputs_handler=lambda current, new: current + " " + new,
- rtc_configuration=get_rtc_credentials(provider="hf") if os.getenv("APP_MODE") == "deployed" else None,
+ rtc_configuration=get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None,
  concurrency_limit=6
  )

@@ -135,8 +139,8 @@ stream.mount(app)

  @app.get("/")
  async def index():
- html_content = open("index.html").read()
- rtc_config = get_rtc_credentials(provider="hf") if os.getenv("APP_MODE") == "deployed" else None
+ html_content = open(UI_FILE).read()
+ rtc_config = get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None
  return HTMLResponse(content=html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)))

  @app.get("/transcript")
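
Note: the diff ends at the /transcript route, which is unchanged by this commit, but index-screen.html shows what it must provide: a server-sent-events stream whose "output" events carry each transcript chunk consumed by the EventSource listener above. The following is only a minimal sketch of such a route, assuming FastRTC's Stream.output_stream() generator as shown in its documented examples; the names transcript_endpoint, event_stream, and transcript_chunk are illustrative and not taken from this repo.

from fastapi.responses import StreamingResponse

@app.get("/transcript")
async def transcript_endpoint(webrtc_id: str):
    # Relay each transcription chunk produced by the handler as an SSE "output"
    # event, matching the eventSource.addEventListener("output", ...) call in
    # index-screen.html. (Sketch only; assumes stream.output_stream(webrtc_id)
    # yields objects whose first arg is the transcript text.)
    async def event_stream():
        async for output in stream.output_stream(webrtc_id):
            transcript_chunk = output.args[0]
            yield f"event: output\ndata: {transcript_chunk}\n\n"
    return StreamingResponse(event_stream(), media_type="text/event-stream")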