Commit 489ba9a · Sofia Casadei committed · Parent(s): 953b94e

add: big screen ui

Files changed:
- index-screen.html +632 -0
- main.py +26 -22
index-screen.html
ADDED
@@ -0,0 +1,632 @@
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-time Whisper Transcription</title>
    <style>
        :root {
            --background-dark: #000000;
            --text-light: #ffffff;
        }

        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            margin: 0; /* Removes default margin */
            padding: 0; /* Removes default padding */
            background-color: var(--background-dark); /* Sets background to black */
            color: var(--text-light); /* Sets text to white */
            min-height: 100vh; /* Ensures page fills entire viewport height */
        }

        /* Hide the header in presentation mode */
        .hero {
            display: none; /* Hides the hero section completely */
        }

        .container {
            max-width: 100%; /* Makes container full width */
            margin: 0; /* Removes margin */
            padding: 1rem; /* Adds small padding all around */
        }

        /* Base styling for transcript container */
        .transcript-container {
            height: 90vh; /* Sets height to 90% of viewport height */
            border: none; /* Removes border */
            padding: 2rem; /* Adds generous padding inside */
            background: var(--background-dark); /* Ensures background is black */
            color: var(--text-light); /* Ensures text is white */
            overflow-y: auto; /* Enables vertical scrolling when content overflows */
            margin-bottom: 0; /* Removes bottom margin */
            display: block; /* Makes element a block to take full width */
            width: 100%; /* Sets width to 100% */
        }

        /* Styling for transcript paragraphs */
        .transcript-container p {
            margin: 0.5rem 0; /* Small vertical margin between paragraphs */
            padding: 0.5rem 0; /* Small vertical padding within paragraphs */
            background: transparent; /* Transparent background (no highlighting) */
            border-radius: 0; /* No rounded corners */
            line-height: 1.6; /* Increases line spacing for readability */
            font-size: 3.5rem; /* rem means relative to the root font size */
            font-weight: 500; /* 500 = medium weight, 700 = bold */
            max-width: 98%; /* Full width within container */
            white-space: normal; /* Allows text to wrap normally */
            word-wrap: break-word; /* Prevents overflow of long words */
            color: white; /* Explicitly sets text color to white */
            display: block; /* Each paragraph takes full width */
        }

        /* Current paragraph styling - slightly brighter for emphasis */
        .transcript-container p.current {
            background: transparent; /* No background color */
            color: rgba(255, 255, 255, 1.0); /* Full brightness white for current text */
        }

        /* Ensure all paragraphs have full opacity (keeps history visible) */
        .transcript-container p:nth-last-child(n+4) {
            opacity: 1.0; /* Shows all paragraphs at full opacity */
        }

        /* Controls for starting/stopping transcription */
        .controls {
            position: fixed; /* Fixes controls to viewport */
            bottom: 2rem; /* Positions 2rem from bottom */
            right: 2rem; /* Positions 2rem from right */
            margin: 0; /* No margin */
            opacity: 0.8; /* Slightly transparent when not hovered */
            transition: opacity 0.3s ease; /* Smooth transition for opacity changes */
            z-index: 1000; /* Ensures controls appear above other elements */
        }

        .controls:hover {
            opacity: 1; /* Full opacity on hover */
        }

        /* Button styling - orange with black text for good contrast */
        button {
            background: rgba(249, 164, 92, 1.0); /* Solid orange background */
            backdrop-filter: blur(5px); /* Blur effect for elements behind */
            font-size: 1.2rem; /* Large text */
            min-width: 160px; /* Minimum width for button */
            padding: 15px 30px; /* Generous padding inside button */
            color: black !important; /* Forces black text color */
            font-weight: bold; /* Bold text for better visibility */
            border: 2px solid rgba(255, 255, 255, 0.2); /* Subtle border */
            border-radius: 8px; /* Rounded corners */
            cursor: pointer; /* Shows pointer cursor on hover */
            transition: all 0.2s ease; /* Smooth transition for hover effects */
            display: block; /* Makes button take up full width */
        }

        button:hover {
            background: rgba(249, 164, 92, 0.9); /* Slightly more transparent on hover */
            transform: translateY(-2px); /* Slight upward movement on hover */
        }

        /* Spinner animation for loading state */
        .icon-with-spinner .spinner {
            border: 3px solid black; /* Spinner border */
            border-top: 3px solid transparent; /* Transparent top creates spinning effect */
            border-radius: 50%; /* Makes it circular */
            width: 24px; /* Width of spinner */
            height: 24px; /* Height of spinner */
            animation: spin 1s linear infinite; /* Animation for spinning effect */
        }

        @keyframes spin {
            0% { transform: rotate(0deg); } /* Starting rotation */
            100% { transform: rotate(360deg); } /* Full 360° rotation */
        }

        /* Recording indicator pulse animation */
        .pulse-circle {
            display: inline-block; /* Allows other elements inline */
            width: 12px; /* Width of pulse circle */
            height: 12px; /* Height of pulse circle */
            border-radius: 50%; /* Makes it circular */
            background-color: red; /* Red color for recording indicator */
            margin-right: 8px; /* Space to right of circle */
            animation: pulse 1.5s ease infinite; /* Continuous pulsing animation */
        }

        @keyframes pulse {
            0% { transform: scale(0.95); opacity: 0.7; } /* Slightly smaller and transparent */
            50% { transform: scale(1.1); opacity: 1; } /* Larger and fully opaque */
            100% { transform: scale(0.95); opacity: 0.7; } /* Back to starting state */
        }

        /* Custom scrollbar styling */
        .transcript-container::-webkit-scrollbar {
            width: 8px; /* Width of scrollbar */
        }

        .transcript-container::-webkit-scrollbar-track {
            background: var(--background-dark); /* Black scrollbar track */
        }

        .transcript-container::-webkit-scrollbar-thumb {
            background: rgba(249, 164, 92, 0.3); /* Semi-transparent orange scrollbar thumb */
            border-radius: 4px; /* Rounded corners on scrollbar thumb */
        }

        /* Error toast styling */
        .toast {
            background: rgba(0, 0, 0, 0.8); /* Semi-transparent black background */
            backdrop-filter: blur(5px); /* Blur effect behind toast */
            color: var(--text-light); /* White text */
            font-size: 1.2rem; /* Large text size */
        }
    </style>
</head>

<body>
    <!-- Error message container that slides in when needed -->
    <div id="error-toast" class="toast"></div>
    <!-- Header section (hidden in presentation mode) -->
    <div class="hero">
        <h1>Real-time Transcription</h1>
        <p>Powered by FastRTC and Local Whisper 🤗</p>
    </div>

    <!-- Main content container -->
    <div class="container">
        <!-- Container for transcript text -->
        <div class="transcript-container" id="transcript"></div>
        <!-- Controls for starting/stopping recording -->
        <div class="controls">
            <button id="start-button">Start Recording</button>
        </div>
    </div>

    <script>
        // Global variables for WebRTC connection
        let peerConnection; // Stores the WebRTC connection object for audio streaming
        let webrtc_id; // A unique ID to identify this connection on the server
        let audioContext, analyser, audioSource; // Audio processing objects for visualization
        let audioLevel = 0; // Stores the current audio level (volume) from 0-1
        let animationFrame; // Reference to the animation frame for audio visualization
        let isRecording = false; // Tracks whether we're currently recording or not
        let eventSource; // Object that receives transcription results from the server

        // DOM element references
        const startButton = document.getElementById('start-button'); // The button to start/stop recording
        const transcriptDiv = document.getElementById('transcript'); // The container for transcription text

        // Variables for managing the transcript display
        let currentParagraph = null; // Reference to the current paragraph being updated
        let lastUpdateTime = Date.now(); // Timestamp of when we last updated the transcript

        // Show error messages to the user in a toast notification
        function showError(message) {
            const toast = document.getElementById('error-toast'); // Get the toast element
            toast.textContent = message; // Set the error message
            toast.style.display = 'block'; // Make the toast visible

            // Hide toast after 5 seconds
            setTimeout(() => {
                toast.style.display = 'none'; // Hide the toast
            }, 5000);
        }

        // Handle messages received from the server through WebRTC data channel
        function handleMessage(event) {
            // Parse JSON message
            const eventJson = JSON.parse(event.data);
            // Display errors to the user
            if (eventJson.type === "error") {
                showError(eventJson.message);
            }
            // Log all messages to console for debugging
            console.log('Received message:', event.data);
        }

        // Update button appearance based on connection state
        function updateButtonState() {
            // If connecting, show spinner
            if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
                startButton.innerHTML = `
                    <div class="icon-with-spinner">
                        <div class="spinner"></div>
                        <span>Connecting...</span>
                    </div>
                `;
                isRecording = false; // Not recording while connecting
            // If connected, show pulsing recording indicator
            } else if (peerConnection && peerConnection.connectionState === 'connected') {
                startButton.innerHTML = `
                    <div class="pulse-container">
                        <div class="pulse-circle"></div>
                        <span>Stop Recording</span>
                    </div>
                `;
                isRecording = true; // Set recording state to true
            // Default state - ready to start
            } else {
                startButton.innerHTML = 'Start Recording';
                isRecording = false; // Not recording when not connected
            }
        }

        // Set up audio visualization to show when the user is speaking
        function setupAudioVisualization(stream) {
            // Create or resume the audio context
            if (!audioContext) {
                // Create new audio context with browser compatibility handling
                audioContext = new (window.AudioContext || window.webkitAudioContext)();
            } else {
                // Resume context if it was suspended
                if (audioContext.state === 'suspended') {
                    audioContext.resume();
                }
            }

            // Create audio analyzer for processing audio data
            analyser = audioContext.createAnalyser();
            // Create media source from microphone stream
            audioSource = audioContext.createMediaStreamSource(stream);
            // Connect source to analyzer
            audioSource.connect(analyser);
            // Set FFT size (controls frequency data resolution)
            analyser.fftSize = 64;
            // Create array to store frequency data
            const dataArray = new Uint8Array(analyser.frequencyBinCount);

            // Function to continuously update audio level visualization
            function updateAudioLevel() {
                // Get audio frequency data
                analyser.getByteFrequencyData(dataArray);
                // Calculate average volume across all frequencies
                const average = Array.from(dataArray).reduce((a, b) => a + b, 0) / dataArray.length;
                // Convert to 0-1 scale
                audioLevel = average / 255;

                // Update pulse circle size based on audio level
                const pulseCircle = document.querySelector('.pulse-circle');
                if (pulseCircle) {
                    pulseCircle.style.setProperty('--audio-level', 1 + audioLevel);
                }

                // Continue animation loop
                animationFrame = requestAnimationFrame(updateAudioLevel);
            }
            // Start audio visualization loop
            updateAudioLevel();
        }

        // Set up WebRTC connection for streaming audio to server
        async function setupWebRTC() {
            // Get WebRTC configuration from global variable
            const config = __RTC_CONFIGURATION__;
            // Create new peer connection
            peerConnection = new RTCPeerConnection(config);

            // Set connection timeout (15 seconds)
            const connectionTimeout = setTimeout(() => {
                if (peerConnection && peerConnection.connectionState !== 'connected') {
                    showError('Connection timeout. Please check your network and try again.');
                    stop(); // Stop connection attempt
                }
            }, 15000);

            // Set warning for slow connection (5 seconds)
            const timeoutId = setTimeout(() => {
                const toast = document.getElementById('error-toast');
                toast.textContent = "Connection is taking longer than usual. Are you on a VPN?";
                toast.className = 'toast warning';
                toast.style.display = 'block';

                // Hide warning after 5 seconds
                setTimeout(() => {
                    toast.style.display = 'none';
                }, 5000);
            }, 5000);

            try {
                // Request access to user's microphone
                const stream = await navigator.mediaDevices.getUserMedia({
                    audio: true // Only request audio access
                });

                // Set up audio visualization
                setupAudioVisualization(stream);

                // Add audio tracks to WebRTC connection
                stream.getTracks().forEach(track => {
                    peerConnection.addTrack(track, stream);
                });

                // Monitor connection state changes
                peerConnection.addEventListener('connectionstatechange', () => {
                    // Log state changes
                    console.log('connectionstatechange', peerConnection.connectionState);

                    // Handle successful connection
                    if (peerConnection.connectionState === 'connected') {
                        clearTimeout(timeoutId);
                        clearTimeout(connectionTimeout);
                        const toast = document.getElementById('error-toast');
                        toast.style.display = 'none';
                    // Handle connection failures
                    } else if (peerConnection.connectionState === 'failed' ||
                               peerConnection.connectionState === 'disconnected' ||
                               peerConnection.connectionState === 'closed') {
                        showError('Connection lost. Please try again.');
                        stop();
                    }
                    // Update button appearance
                    updateButtonState();
                });

                // Create data channel for server messages
                const dataChannel = peerConnection.createDataChannel('text');
                dataChannel.onmessage = handleMessage; // Set message handler

                // Create connection offer
                const offer = await peerConnection.createOffer();
                // Set local description (our end of connection)
                await peerConnection.setLocalDescription(offer);

                // Wait for ICE gathering to complete (finding connection methods)
                await new Promise((resolve) => {
                    if (peerConnection.iceGatheringState === "complete") {
                        resolve(); // Already complete
                    } else {
                        // Function to check ICE gathering state
                        const checkState = () => {
                            if (peerConnection.iceGatheringState === "complete") {
                                peerConnection.removeEventListener("icegatheringstatechange", checkState);
                                resolve(); // Complete gathering
                            }
                        };
                        // Listen for ICE gathering state changes
                        peerConnection.addEventListener("icegatheringstatechange", checkState);
                    }
                });

                // Generate random ID for this connection
                webrtc_id = Math.random().toString(36).substring(7);

                // Send connection offer to server
                const response = await fetch('/webrtc/offer', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        sdp: peerConnection.localDescription.sdp, // Session description
                        type: peerConnection.localDescription.type, // Offer type
                        webrtc_id: webrtc_id // Unique connection ID
                    })
                });

                // Parse server response
                const serverResponse = await response.json();

                // Handle server errors
                if (serverResponse.status === 'failed') {
                    showError(serverResponse.meta.error === 'concurrency_limit_reached'
                        ? `Too many connections. Maximum limit is ${serverResponse.meta.limit}`
                        : serverResponse.meta.error);
                    stop();
                    startButton.textContent = 'Start Recording';
                    return;
                }

                // Complete connection with server's description
                await peerConnection.setRemoteDescription(serverResponse);

                // Create event source for receiving transcription results
                eventSource = new EventSource('/transcript?webrtc_id=' + webrtc_id);
                // Handle event source errors
                eventSource.onerror = (event) => {
                    console.error("EventSource error:", event);
                    showError("Transcription connection lost. Please try again.");
                };
                // Process transcription results as they arrive
                eventSource.addEventListener("output", (event) => {
                    console.log("Received transcript chunk:", event.data);
                    // Add text to display
                    appendTranscript(event.data);
                    //appendTranscriptSimple(event.data);
                });
            } catch (err) {
                // Handle any setup errors
                clearTimeout(timeoutId);
                console.error('Error setting up WebRTC:', err);
                showError('Failed to establish connection. Please try again.');
                stop();
                startButton.textContent = 'Start Recording';
            }
        }

        function appendTranscriptSimple(text) {
            const p = document.createElement('p');
            p.textContent = text;
            transcriptDiv.appendChild(p);
            transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
        }

        // Add transcription text to display
        function appendTranscript(text) {
            // Clean up text
            const formattedText = text.trim();
            if (!formattedText) return;

            const now = Date.now();
            const timeSinceLastUpdate = now - lastUpdateTime;
            lastUpdateTime = now;

            // Handle transcript display
            if (!currentParagraph) {
                // Create new paragraph
                currentParagraph = document.createElement('p');
                currentParagraph.classList.add('current');
                transcriptDiv.appendChild(currentParagraph);
                currentParagraph.textContent = formattedText;
            } else {
                // Get current text
                const currentText = currentParagraph.textContent;

                // Fix spacing issues by normalizing
                let cleanedText = formattedText;

                // 1. Check for simple word repetition - last word repeated
                const words = currentText.split(/\s+/);
                const lastWord = words[words.length - 1].replace(/[^\w]/g, '').toLowerCase();

                if (lastWord && lastWord.length > 2) {
                    // Check if new text starts with the same word
                    const regex = new RegExp(`^${lastWord}`, 'i');
                    if (regex.test(cleanedText.replace(/[^\w]/g, ''))) {
                        // Remove the first word if it's a duplicate
                        cleanedText = cleanedText.replace(regex, '').trim();
                    }
                }

                // 2. Add proper spacing
                let finalText = currentText;

                // Only add space if current text doesn't end with space or punctuation
                // and new text doesn't start with punctuation
                if (!/[\s.,!?]$/.test(finalText) && !/^[.,!?]/.test(cleanedText) && cleanedText) {
                    finalText += ' ';
                }

                // 3. Add the cleaned text
                finalText += cleanedText;

                // 4. Fix any run-together words by adding spaces after punctuation
                finalText = finalText.replace(/([.,!?])([a-zA-Z])/g, '$1 $2');

                // Update the paragraph text
                currentParagraph.textContent = finalText;
            }

            // Create new paragraph on sentence end or pause
            if (/[.!?]$/.test(formattedText) || timeSinceLastUpdate > 5000) {
                // End current paragraph
                if (currentParagraph) {
                    currentParagraph.classList.remove('current');
                }

                // Prepare for next paragraph
                currentParagraph = null;
            }

            // Limit number of displayed paragraphs
            const paragraphs = transcriptDiv.getElementsByTagName('p');
            while (paragraphs.length > 10) { // Keep last 10 paragraphs
                transcriptDiv.removeChild(paragraphs[0]);
            }

            // Scroll to show newest text
            requestAnimationFrame(() => {
                transcriptDiv.scrollTop = transcriptDiv.scrollHeight;
            });
        }

        // Stop recording and clean up resources
        function stop() {
            // Stop audio visualization
            if (animationFrame) {
                cancelAnimationFrame(animationFrame);
                animationFrame = null;
            }

            // Pause audio processing
            if (audioContext) {
                audioContext.suspend();
            }

            // Stop all media tracks
            if (peerConnection) {
                const senders = peerConnection.getSenders();
                if (senders) {
                    senders.forEach(sender => {
                        if (sender.track) {
                            sender.track.stop(); // Release microphone
                        }
                    });
                }

                // Close WebRTC connection
                peerConnection.close();
                peerConnection = null;
            }

            // Close transcription connection
            if (eventSource) {
                eventSource.close();
                eventSource = null;
            }

            // Reset audio level
            audioLevel = 0;
            // Update button display
            updateButtonState();

            // Ask about clearing transcript
            if (window.confirm('Clear transcript?')) {
                // Clear all transcript text
                transcriptDiv.innerHTML = '';
                currentParagraph = null;
            } else {
                // Just end current paragraph
                if (currentParagraph) {
                    currentParagraph.classList.remove('current');
                    currentParagraph = null;
                }
            }

            // Reset timestamp
            lastUpdateTime = Date.now();
        }

        // Clean up resources when page is closed
        window.addEventListener('beforeunload', () => {
            stop(); // Stop recording and release resources
        });

        // Handle start/stop button clicks
        startButton.addEventListener('click', () => {
            if (!isRecording) {
                // Start recording if not already recording
                setupWebRTC();
            } else {
                // Stop recording if currently recording
                stop();
            }
        });

        // Initialize UI when page loads
        document.addEventListener('DOMContentLoaded', () => {
            // Ensure all UI elements are visible
            const elementsToCheck = [
                transcriptDiv,
                startButton,
                document.getElementById('error-toast')
            ];

            // Set appropriate display for each element
            elementsToCheck.forEach(el => {
                if (el) {
                    // Set appropriate display style based on element type
                    el.style.display = el.tagName.toLowerCase() === 'button' ? 'block' :
                        (el.id === 'transcript' ? 'block' : 'none');
                }
            });

            // Apply CSS variables to ensure theme is working
            document.body.style.backgroundColor = 'var(--background-dark)';
            document.body.style.color = 'var(--text-light)';

            // Force button colors for consistency
            startButton.style.backgroundColor = 'rgba(249, 164, 92, 1.0)';
            startButton.style.color = 'black';
        });
    </script>
</body>

</html>
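The page receives its transcript over server-sent events: it opens EventSource('/transcript?webrtc_id=...') and subscribes to events named "output". The /transcript handler itself lies outside the hunks shown in this diff, so the following is only a sketch of the SSE shape the page expects, assuming the app and stream objects from main.py and fastrtc's documented Stream.output_stream API; the handler name and body are illustrative, not the committed code:

    # Sketch (not part of this commit): the SSE framing index-screen.html expects
    # from GET /transcript.
    from fastapi.responses import StreamingResponse

    @app.get("/transcript")
    async def transcript(webrtc_id: str):
        async def event_stream():
            # stream.output_stream(webrtc_id) is assumed to yield the transcript
            # chunks the ReplyOnPause handler emits for this connection.
            async for output in stream.output_stream(webrtc_id):
                # The explicit "event: output" line matters because the page uses
                # addEventListener("output", ...) rather than the default onmessage.
                yield f"event: output\ndata: {output.args[0]}\n\n"
        return StreamingResponse(event_stream(), media_type="text/event-stream")

One wrinkle: the page appends event.data verbatim, so a newline inside a chunk would break this single-line "data:" framing; single-line transcript chunks avoid that.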
main.py
CHANGED
@@ -14,7 +14,7 @@ from fastrtc import (
     ReplyOnPause,
     Stream,
     AlgoOptions,
-
+    SileroVadOptions,
     audio_to_bytes,
 )
 from transformers import (
@@ -34,6 +34,11 @@ setup_logging(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 
+APP_MODE = os.getenv("APP_MODE", "deployed")
+MODEL_ID = os.getenv("MODEL_ID", "openai/whisper-large-v3-turbo")
+UI_FILE = os.getenv("UI_FILE", "index.html")
+
+
 device = get_device(force_cpu=False)
 torch_dtype, np_dtype = get_torch_and_np_dtypes(device, use_bfloat16=False)
 logger.info(f"Using device: {device}, torch_dtype: {torch_dtype}, np_dtype: {np_dtype}")
@@ -44,10 +49,9 @@ logger.info(f"CUDA Version: {cuda_version}, GPU Device: {device_name}")
 attention = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
 logger.info(f"Using attention: {attention}")
 
-
-logger.info(f"Loading Whisper model: {model_id}")
+logger.info(f"Loading Whisper model: {MODEL_ID}")
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-
+    MODEL_ID,
     torch_dtype=torch_dtype,
     low_cpu_mem_usage=True,
     use_safetensors=True,
@@ -55,7 +59,7 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
 )
 model.to(device)
 
-processor = AutoProcessor.from_pretrained(
+processor = AutoProcessor.from_pretrained(MODEL_ID)
 
 transcribe_pipeline = pipeline(
     task="automatic-speech-recognition",
@@ -102,20 +106,20 @@ stream = Stream(
             # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
             speech_threshold=0.1,
         ),
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        model_options=SileroVadOptions(
+            # Threshold for what is considered speech (default 0.5)
+            threshold=0.5,
+            # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
+            min_speech_duration_ms=250,
+            # Max duration of speech chunks; longer chunks are split (default float('inf'))
+            max_speech_duration_s=15,
+            # Wait min_silence_duration_ms at the end of each speech chunk before separating it (default 2000)
+            min_silence_duration_ms=2000,
+            # Chunk size for the VAD model. Can be 512, 1024, or 1536 for a 16 kHz sample rate (default 1024)
+            window_size_samples=1024,
+            # Final speech chunks are padded by speech_pad_ms on each side (default 400)
+            speech_pad_ms=400,
+        ),
     ),
     # send-receive: bidirectional streaming (default)
     # send: client to server only
@@ -126,7 +130,7 @@ stream = Stream(
         gr.Textbox(label="Transcript"),
     ],
     additional_outputs_handler=lambda current, new: current + " " + new,
-    rtc_configuration=get_rtc_credentials(provider="hf") if
+    rtc_configuration=get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None,
     concurrency_limit=6
 )
@@ -135,8 +139,8 @@ stream.mount(app)
 
 @app.get("/")
 async def index():
-    html_content = open(
-    rtc_config = get_rtc_credentials(provider="hf") if
+    html_content = open(UI_FILE).read()
+    rtc_config = get_rtc_credentials(provider="hf") if APP_MODE == "deployed" else None
     return HTMLResponse(content=html_content.replace("__RTC_CONFIGURATION__", json.dumps(rtc_config)))
 
 @app.get("/transcript")
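Taken together, the new environment variables make the Space configurable without code edits: UI_FILE chooses which HTML file index() serves (index-screen.html for the big-screen display), MODEL_ID selects the Whisper checkpoint, and APP_MODE gates the Hugging Face TURN credential lookup. A minimal sketch of launching the big-screen variant locally follows; it assumes main.py exposes the FastAPI app and is served with uvicorn, neither of which is shown in this diff:

    # Sketch: run the big-screen UI locally with the env vars this commit introduces.
    import os
    import subprocess

    env = dict(
        os.environ,
        APP_MODE="local",             # any value other than "deployed" skips get_rtc_credentials()
        UI_FILE="index-screen.html",  # serve the new big-screen page at "/"
        MODEL_ID="openai/whisper-large-v3-turbo",  # the diff's default, shown explicitly
    )
    subprocess.run(["uvicorn", "main:app", "--port", "7860"], env=env, check=True)

With APP_MODE unset or "deployed", rtc_configuration is populated from get_rtc_credentials(provider="hf"), which a hosted Space behind NAT needs; locally, passing None lets the browser and server negotiate a direct connection.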