Update app.py
app.py CHANGED
@@ -368,6 +368,62 @@ def calculate_similarity(detected: str, expected: str) -> float:
     expected_norm = normalize_phoneme_string(expected)
     return SequenceMatcher(None, detected_norm, expected_norm).ratio()
 
+def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int, word: str) -> float:
+    """
+    Analyze the first 1/3 of an audio segment for a [noise] → [silence] → [noise] pattern.
+    Returns: offset in seconds to skip the initial noise, or 0.0 if no pattern is found.
+    """
+    if audio_segment.shape[-1] == 0:
+        return 0.0
+
+    # Analyze only the first 1/3 of the segment
+    first_third_samples = audio_segment.shape[-1] // 3
+    if first_third_samples < sample_rate * 0.1:  # First third shorter than 100ms
+        return 0.0
+
+    first_third = audio_segment[:, :first_third_samples]
+
+    # Calculate energy in small windows (50ms chunks)
+    window_size = int(0.05 * sample_rate)  # 50ms windows
+    if window_size <= 0:
+        return 0.0
+
+    energy_levels = []
+
+    for i in range(0, first_third_samples - window_size, window_size):
+        window = first_third[:, i:i + window_size]
+        energy = torch.mean(window ** 2).item()  # Mean-square (power) energy
+        energy_levels.append(energy)
+
+    if len(energy_levels) < 3:
+        return 0.0
+
+    # Look for pattern: [high energy] → [low energy] → [high energy]
+    silence_threshold = np.percentile(energy_levels, 20)  # Bottom 20%
+    noise_threshold = silence_threshold * 3
+
+    log(f"🔍 Boundary analysis for '{word}': {len(energy_levels)} windows, silence_thresh={silence_threshold:.6f}")
+
+    # Find sustained silence (2+ consecutive low-energy windows)
+    for i in range(len(energy_levels) - 1):
+        if (energy_levels[i] < silence_threshold and
+            energy_levels[i + 1] < silence_threshold):
+
+            # Check if there was noise before the silence
+            noise_before = any(e > noise_threshold for e in energy_levels[:i])
+            # Check if there is noise after the silence
+            noise_after = any(e > noise_threshold for e in energy_levels[i + 2:])
+
+            if noise_before and noise_after:
+                # Found the pattern! Return offset to the end of the silence
+                silence_end_sample = (i + 2) * window_size
+                offset_seconds = silence_end_sample / sample_rate
+                log(f" ✅ Found overlap pattern: noise→silence(pos {i})→noise, trimming {offset_seconds:.3f}s")
+                return offset_seconds
+
+    log(f" ❌ No overlap pattern detected")
+    return 0.0  # No pattern detected
+
 def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
                           start_time: float, end_time: float, word: str,
                           verbose: bool = True) -> torch.Tensor:
@@ -965,8 +1021,17 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         else:
             log(f"🔗 No gap (continuous)")
 
-        # Calculate expanded timing
-
+        # Calculate expanded timing with adaptive expansion for short words
+        original_duration = end_time - start_time
+
+        # Use larger expansion for very short words
+        if original_duration < 0.5:
+            expansion_seconds = 0.25  # Larger expansion for short words
+            log(f"🔍 Short word detected ({original_duration:.3f}s), using expanded timing: ±{expansion_seconds}s")
+        else:
+            expansion_seconds = 0.125  # Normal expansion
+            log(f"📏 Normal word length ({original_duration:.3f}s), using standard timing: ±{expansion_seconds}s")
+
         audio_duration = waveform.shape[-1] / sample_rate
 
         expanded_start = max(0, start_time - expansion_seconds)
@@ -977,6 +1042,16 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         # Extract expanded audio segment
         expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
 
+        # Check for word boundary overlap and trim if needed
+        boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
+        if boundary_offset > 0:
+            log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
+            trim_samples = int(boundary_offset * sample_rate)
+            expanded_audio_segment = expanded_audio_segment[:, trim_samples:]
+            # Update expanded_start for accurate timing logs
+            expanded_start += boundary_offset
+            log(f" Updated expanded start: {expanded_start:.3f}s")
+
         # Also extract WhisperX original timing for comparison
         whisperx_audio_segment = extract_audio_segment(waveform, sample_rate, start_time, end_time, word_clean, verbose=False)
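
End to end, the per-word flow after this change looks roughly like the condensed sketch below (not code from the commit; the clamp on expanded_end and the enclosing loop over aligned words are assumed from context, and variable names follow the diff):

# Per-word flow, assuming start_time/end_time come from WhisperX alignment
# and waveform/sample_rate from the loaded audio.
original_duration = end_time - start_time
expansion_seconds = 0.25 if original_duration < 0.5 else 0.125  # adaptive padding

audio_duration = waveform.shape[-1] / sample_rate
expanded_start = max(0, start_time - expansion_seconds)
expanded_end = min(audio_duration, end_time + expansion_seconds)  # clamp assumed

expanded_audio_segment = extract_audio_segment(
    waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)

# Trim leakage from the previous word when noise→silence→noise is detected
boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
if boundary_offset > 0:
    expanded_audio_segment = expanded_audio_segment[:, int(boundary_offset * sample_rate):]
    expanded_start += boundary_offset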