Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -293,6 +293,7 @@ def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, wor
|
|
293 |
def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
|
294 |
"""
|
295 |
Find the best matching substring in detected phoneme using sliding window.
|
|
|
296 |
Returns: (best_match_substring, best_score, start_index, end_index)
|
297 |
"""
|
298 |
detected_norm = normalize_phoneme_string(detected_phoneme)
|
@@ -336,7 +337,26 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
|
|
336 |
log(f"  🎯 Perfect match found, stopping search")
|
337 |
break
|
338 |
|
339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
return best_match, best_score, best_start, best_end
|
341 |
|
342 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
@@ -551,8 +571,8 @@ async def transcribe(audio: UploadFile = File(...)):
|
|
551 |
else:
|
552 |
log(f"π No gap (continuous)")
|
553 |
|
554 |
-
# Calculate expanded timing (±0.
|
555 |
-
expansion_seconds = 0.
|
556 |
audio_duration = waveform.shape[-1] / sample_rate
|
557 |
|
558 |
expanded_start = max(0, start_time - expansion_seconds)
|
|
|
293 |
def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
|
294 |
"""
|
295 |
Find the best matching substring in detected phoneme using sliding window.
|
296 |
+
For zero scores, intelligently selects which phoneme substring to return.
|
297 |
Returns: (best_match_substring, best_score, start_index, end_index)
|
298 |
"""
|
299 |
detected_norm = normalize_phoneme_string(detected_phoneme)
|
|
|
337 |
log(f"  🎯 Perfect match found, stopping search")
|
338 |
break
|
339 |
|
340 |
+
# Handle zero score case - aim for middle substring when possible
|
341 |
+
if best_score == 0:
|
342 |
+
log(f"  ⚠️ Zero score detected, selecting middle substring for audio alignment")
|
343 |
+
total_detected_len = len(detected_norm)
|
344 |
+
|
345 |
+
if total_detected_len == expected_len:
|
346 |
+
# Same length - use the whole string
|
347 |
+
best_start = 0
|
348 |
+
best_end = expected_len
|
349 |
+
best_match = detected_norm
|
350 |
+
log(f" π Same length: using full string")
|
351 |
+
else:
|
352 |
+
# Longer detected - aim for middle
|
353 |
+
middle_start = max(0, (total_detected_len - expected_len) // 2)
|
354 |
+
best_start = middle_start
|
355 |
+
best_end = middle_start + expected_len
|
356 |
+
best_match = detected_norm[best_start:best_end]
|
357 |
+
log(f" π Aiming for middle: position {best_start}-{best_end}")
|
358 |
+
|
359 |
+
log(f" π Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
|
360 |
return best_match, best_score, best_start, best_end
|
361 |
|
362 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
|
|
571 |
else:
|
572 |
log(f"π No gap (continuous)")
|
573 |
|
574 |
+
# Calculate expanded timing (±0.125s with boundary protection)
|
575 |
+
expansion_seconds = 0.125
|
576 |
audio_duration = waveform.shape[-1] / sample_rate
|
577 |
|
578 |
expanded_start = max(0, start_time - expansion_seconds)
|