greg0rs committed (verified)
Commit 6210a22 · 1 Parent(s): 0e377d6

Update app.py

Files changed (1):
  1. app.py +77 -2
app.py CHANGED
 
@@ -368,6 +368,62 @@ def calculate_similarity(detected: str, expected: str) -> float:
     expected_norm = normalize_phoneme_string(expected)
     return SequenceMatcher(None, detected_norm, expected_norm).ratio()
 
+def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int, word: str) -> float:
+    """
+    Analyze the first third of an audio segment for a [noise] → [silence] → [noise] pattern.
+    Returns an offset in seconds to skip the initial noise, or 0.0 if no pattern is found.
+    """
+    if audio_segment.shape[-1] == 0:
+        return 0.0
+
+    # Analyze only the first third of the segment
+    first_third_samples = audio_segment.shape[-1] // 3
+    if first_third_samples < sample_rate * 0.1:  # Less than 100 ms total
+        return 0.0
+
+    first_third = audio_segment[:, :first_third_samples]
+
+    # Calculate energy in small windows (50 ms chunks)
+    window_size = int(0.05 * sample_rate)  # 50 ms windows
+    if window_size <= 0:
+        return 0.0
+
+    energy_levels = []
+
+    for i in range(0, first_third_samples - window_size, window_size):
+        window = first_third[:, i:i + window_size]
+        energy = torch.mean(window ** 2).item()  # mean-square energy per window
+        energy_levels.append(energy)
+
+    if len(energy_levels) < 3:
+        return 0.0
+
+    # Look for the pattern: [high energy] → [low energy] → [high energy]
+    silence_threshold = np.percentile(energy_levels, 20)  # Bottom 20%
+    noise_threshold = silence_threshold * 3
+
+    log(f"🔍 Boundary analysis for '{word}': {len(energy_levels)} windows, silence_thresh={silence_threshold:.6f}")
+
+    # Find sustained silence (2+ consecutive low-energy windows)
+    for i in range(len(energy_levels) - 1):
+        if (energy_levels[i] < silence_threshold and
+                energy_levels[i + 1] < silence_threshold):
+
+            # Check whether there was noise before the silence
+            noise_before = any(e > noise_threshold for e in energy_levels[:i])
+            # Check whether there is noise after the silence
+            noise_after = any(e > noise_threshold for e in energy_levels[i + 2:])
+
+            if noise_before and noise_after:
+                # Found the pattern: return the offset to the end of the silence
+                silence_end_sample = (i + 2) * window_size
+                offset_seconds = silence_end_sample / sample_rate
+                log(f"   ✅ Found overlap pattern: noise→silence(pos {i})→noise, trimming {offset_seconds:.3f}s")
+                return offset_seconds
+
+    log(f"   ❌ No overlap pattern detected")
+    return 0.0  # No pattern detected
+
 def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
                           start_time: float, end_time: float, word: str,
                           verbose: bool = True) -> torch.Tensor:
 
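The new helper keys on windowed energy rather than raw amplitude. Below is a minimal standalone sketch (not part of the commit) that reproduces the same scan on a synthetic [noise] → [silence] → [noise] segment; the 16 kHz rate and the randn-based signal are assumptions for illustration:

```python
import torch
import numpy as np

sr = 16000  # assumed sample rate for the sketch
torch.manual_seed(0)

# Synthetic 3 s segment whose first third holds the pattern the detector
# looks for: 0.3 s of bleed-over noise, 0.3 s of silence, then the word.
segment = torch.cat([
    0.5 * torch.randn(1, int(0.3 * sr)),    # tail of the previous word
    0.001 * torch.randn(1, int(0.3 * sr)),  # inter-word silence
    0.5 * torch.randn(1, int(2.4 * sr)),    # the target word
], dim=-1)

# Same scan as detect_word_boundary_overlap: 50 ms mean-square energies
# over the first third, silence threshold at the 20th percentile.
first_third = segment[:, : segment.shape[-1] // 3]
window = int(0.05 * sr)
energies = [
    torch.mean(first_third[:, i:i + window] ** 2).item()
    for i in range(0, first_third.shape[-1] - window, window)
]
silence_thresh = np.percentile(energies, 20)
noise_thresh = silence_thresh * 3

for i in range(len(energies) - 1):
    if energies[i] < silence_thresh and energies[i + 1] < silence_thresh:
        if (any(e > noise_thresh for e in energies[:i])
                and any(e > noise_thresh for e in energies[i + 2:])):
            # End of the first sustained-silence pair; ~0.4-0.5 s for this layout
            print(f"trim {(i + 2) * window / sr:.3f}s")
            break
```

Note the design consequence of the 20th-percentile threshold: the silent gap must occupy a minority of the first-third windows, otherwise the threshold lands inside the silence cluster and the flanking-noise checks can fail.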
@@ -965,8 +1021,17 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
     else:
         log(f"🔗 No gap (continuous)")
 
-    # Calculate expanded timing (±0.125s with boundary protection)
-    expansion_seconds = 0.125
+    # Calculate expanded timing with adaptive expansion for short words
+    original_duration = end_time - start_time
+
+    # Use larger expansion for very short words
+    if original_duration < 0.5:
+        expansion_seconds = 0.25  # Larger expansion for short words
+        log(f"🔍 Short word detected ({original_duration:.3f}s), using expanded timing: ±{expansion_seconds}s")
+    else:
+        expansion_seconds = 0.125  # Normal expansion
+        log(f"📏 Normal word length ({original_duration:.3f}s), using standard timing: ±{expansion_seconds}s")
+
     audio_duration = waveform.shape[-1] / sample_rate
 
     expanded_start = max(0, start_time - expansion_seconds)
 
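A hypothetical worked example of the branch above (the timing values are invented, and the min() clamp on the segment end is an assumption, since that line sits outside this hunk):

```python
# Hypothetical timings: a 0.30 s word near the end of a 1.60 s clip
start_time, end_time = 1.20, 1.50
audio_duration = 1.60

# Short words (< 0.5 s) get ±0.25 s of context, longer ones ±0.125 s
expansion_seconds = 0.25 if (end_time - start_time) < 0.5 else 0.125

expanded_start = max(0, start_time - expansion_seconds)
# End-of-clip guard assumed symmetric to the max(0, ...) above
expanded_end = min(audio_duration, end_time + expansion_seconds)
print(expanded_start, expanded_end)  # 0.95 1.6 (end clamped from 1.75)
```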
@@ -977,6 +1042,16 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
     # Extract expanded audio segment
     expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
 
+    # Check for word boundary overlap and trim if needed
+    boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
+    if boundary_offset > 0:
+        log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
+        trim_samples = int(boundary_offset * sample_rate)
+        expanded_audio_segment = expanded_audio_segment[:, trim_samples:]
+        # Update expanded_start for accurate timing logs
+        expanded_start += boundary_offset
+        log(f"   Updated expanded start: {expanded_start:.3f}s")
+
     # Also extract WhisperX original timing for comparison
     whisperx_audio_segment = extract_audio_segment(waveform, sample_rate, start_time, end_time, word_clean, verbose=False)
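For intuition on that last hunk, a small sketch with hypothetical numbers showing how the returned offset becomes a sample-domain slice plus the expanded_start bookkeeping:

```python
import torch

sr = 16000               # assumed sample rate
expanded_start = 0.975   # hypothetical values in seconds
boundary_offset = 0.400  # as returned by detect_word_boundary_overlap

segment = torch.zeros(1, int(1.05 * sr))  # stand-in for the extracted segment

trim_samples = int(boundary_offset * sr)  # 6400 samples
segment = segment[:, trim_samples:]       # drop the bleed-over prefix
expanded_start += boundary_offset         # keep downstream timing logs honest

print(segment.shape, f"{expanded_start:.3f}")  # torch.Size([1, 10400]) 1.375
```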