Update app.py
app.py CHANGED
@@ -368,6 +368,62 @@ def calculate_similarity(detected: str, expected: str) -> float:
     expected_norm = normalize_phoneme_string(expected)
     return SequenceMatcher(None, detected_norm, expected_norm).ratio()
 
+def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int, word: str) -> float:
+    """
+    Analyze the first 1/3 of an audio segment for a [noise] → [silence] → [noise] pattern.
+    Returns: offset in seconds to skip the initial noise, or 0.0 if no pattern is found.
+    """
+    if audio_segment.shape[-1] == 0:
+        return 0.0
+
+    # Analyze only the first 1/3 of the segment
+    first_third_samples = audio_segment.shape[-1] // 3
+    if first_third_samples < sample_rate * 0.1:  # First third shorter than 100ms
+        return 0.0
+
+    first_third = audio_segment[:, :first_third_samples]
+
+    # Calculate energy in small windows (50ms chunks)
+    window_size = int(0.05 * sample_rate)  # 50ms windows
+    if window_size <= 0:
+        return 0.0
+
+    energy_levels = []
+
+    for i in range(0, first_third_samples - window_size, window_size):
+        window = first_third[:, i:i + window_size]
+        energy = torch.mean(window ** 2).item()  # Mean-square (power) energy
+        energy_levels.append(energy)
+
+    if len(energy_levels) < 3:
+        return 0.0
+
+    # Look for pattern: [high energy] → [low energy] → [high energy]
+    silence_threshold = np.percentile(energy_levels, 20)  # Bottom 20%
+    noise_threshold = silence_threshold * 3
+
+    log(f"🔍 Boundary analysis for '{word}': {len(energy_levels)} windows, silence_thresh={silence_threshold:.6f}")
+
+    # Find sustained silence (2+ consecutive low-energy windows)
+    for i in range(len(energy_levels) - 1):
+        if (energy_levels[i] < silence_threshold and
+            energy_levels[i + 1] < silence_threshold):
+
+            # Check if there was noise before the silence
+            noise_before = any(e > noise_threshold for e in energy_levels[:i])
+            # Check if there is noise after the silence
+            noise_after = any(e > noise_threshold for e in energy_levels[i + 2:])
+
+            if noise_before and noise_after:
+                # Found the pattern! Return offset to the end of the silence
+                silence_end_sample = (i + 2) * window_size
+                offset_seconds = silence_end_sample / sample_rate
+                log(f" ✅ Found overlap pattern: noise→silence(pos {i})→noise, trimming {offset_seconds:.3f}s")
+                return offset_seconds
+
+    log(f" ❌ No overlap pattern detected")
+    return 0.0  # No pattern detected
+
 def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
                           start_time: float, end_time: float, word: str,
                           verbose: bool = True) -> torch.Tensor:
@@ -965,8 +1021,17 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         else:
             log(f"🔗 No gap (continuous)")
 
-        # Calculate expanded timing
-
+        # Calculate expanded timing with adaptive expansion for short words
+        original_duration = end_time - start_time
+
+        # Use larger expansion for very short words
+        if original_duration < 0.5:
+            expansion_seconds = 0.25  # Larger expansion for short words
+            log(f"🔍 Short word detected ({original_duration:.3f}s), using expanded timing: ±{expansion_seconds}s")
+        else:
+            expansion_seconds = 0.125  # Normal expansion
+            log(f"📏 Normal word length ({original_duration:.3f}s), using standard timing: ±{expansion_seconds}s")
+
         audio_duration = waveform.shape[-1] / sample_rate
 
         expanded_start = max(0, start_time - expansion_seconds)
@@ -977,6 +1042,16 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         # Extract expanded audio segment
         expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
 
+        # Check for word boundary overlap and trim if needed
+        boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
+        if boundary_offset > 0:
+            log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
+            trim_samples = int(boundary_offset * sample_rate)
+            expanded_audio_segment = expanded_audio_segment[:, trim_samples:]
+            # Update expanded_start for accurate timing logs
+            expanded_start += boundary_offset
+            log(f" Updated expanded start: {expanded_start:.3f}s")
+
         # Also extract WhisperX original timing for comparison
         whisperx_audio_segment = extract_audio_segment(waveform, sample_rate, start_time, end_time, word_clean, verbose=False)
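
End to end, the per-word flow after this change looks roughly like the condensed sketch below (not code from the commit; the clamp on expanded_end and the enclosing loop over aligned words are assumed from context, and variable names follow the diff):

# Per-word flow, assuming start_time/end_time come from WhisperX alignment
# and waveform/sample_rate from the loaded audio.
original_duration = end_time - start_time
expansion_seconds = 0.25 if original_duration < 0.5 else 0.125  # adaptive padding

audio_duration = waveform.shape[-1] / sample_rate
expanded_start = max(0, start_time - expansion_seconds)
expanded_end = min(audio_duration, end_time + expansion_seconds)  # clamp assumed

expanded_audio_segment = extract_audio_segment(
    waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)

# Trim leakage from the previous word when noise→silence→noise is detected
boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
if boundary_offset > 0:
    expanded_audio_segment = expanded_audio_segment[:, int(boundary_offset * sample_rate):]
    expanded_start += boundary_offset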