greg0rs committed on
Commit
02bce6c
·
verified ·
1 Parent(s): a7c5e4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -3
app.py CHANGED
@@ -293,6 +293,7 @@ def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, wor
293
  def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
294
  """
295
  Find the best matching substring in detected phoneme using sliding window.
 
296
  Returns: (best_match_substring, best_score, start_index, end_index)
297
  """
298
  detected_norm = normalize_phoneme_string(detected_phoneme)
@@ -336,7 +337,26 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
336
  log(f" 🎯 Perfect match found, stopping search")
337
  break
338
 
339
- log(f" πŸ† Best match: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  return best_match, best_score, best_start, best_end
341
 
342
  def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
@@ -551,8 +571,8 @@ async def transcribe(audio: UploadFile = File(...)):
551
  else:
552
  log(f"πŸ”— No gap (continuous)")
553
 
554
- # Calculate expanded timing (Β±0.15s with boundary protection)
555
- expansion_seconds = 0.15
556
  audio_duration = waveform.shape[-1] / sample_rate
557
 
558
  expanded_start = max(0, start_time - expansion_seconds)
 
293
  def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
294
  """
295
  Find the best matching substring in detected phoneme using sliding window.
296
+ For zero scores, intelligently selects which phoneme substring to return.
297
  Returns: (best_match_substring, best_score, start_index, end_index)
298
  """
299
  detected_norm = normalize_phoneme_string(detected_phoneme)
 
337
  log(f" 🎯 Perfect match found, stopping search")
338
  break
339
 
340
+ # Handle zero score case - aim for middle substring when possible
341
+ if best_score == 0:
342
+ log(f" ⚠️ Zero score detected, selecting middle substring for audio alignment")
343
+ total_detected_len = len(detected_norm)
344
+
345
+ if total_detected_len == expected_len:
346
+ # Same length - use the whole string
347
+ best_start = 0
348
+ best_end = expected_len
349
+ best_match = detected_norm
350
+ log(f" πŸ“ Same length: using full string")
351
+ else:
352
+ # Longer detected - aim for middle
353
+ middle_start = max(0, (total_detected_len - expected_len) // 2)
354
+ best_start = middle_start
355
+ best_end = middle_start + expected_len
356
+ best_match = detected_norm[best_start:best_end]
357
+ log(f" πŸ“ Aiming for middle: position {best_start}-{best_end}")
358
+
359
+ log(f" πŸ† Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
360
  return best_match, best_score, best_start, best_end
361
 
362
  def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
 
571
  else:
572
  log(f"πŸ”— No gap (continuous)")
573
 
574
+ # Calculate expanded timing (Β±0.125s with boundary protection)
575
+ expansion_seconds = 0.125
576
  audio_duration = waveform.shape[-1] / sample_rate
577
 
578
  expanded_start = max(0, start_time - expansion_seconds)