Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -293,6 +293,7 @@ def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, wor
|
|
293 |
def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
|
294 |
"""
|
295 |
Find the best matching substring in detected phoneme using sliding window.
|
|
|
296 |
Returns: (best_match_substring, best_score, start_index, end_index)
|
297 |
"""
|
298 |
detected_norm = normalize_phoneme_string(detected_phoneme)
|
@@ -336,7 +337,26 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
|
|
336 |
log(f"  🎯 Perfect match found, stopping search")
|
337 |
break
|
338 |
|
339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
340 |
return best_match, best_score, best_start, best_end
|
341 |
|
342 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
@@ -551,8 +571,8 @@ async def transcribe(audio: UploadFile = File(...)):
|
|
551 |
else:
|
552 |
log(f"π No gap (continuous)")
|
553 |
|
554 |
-
# Calculate expanded timing (±0.
|
555 |
-
expansion_seconds = 0.
|
556 |
audio_duration = waveform.shape[-1] / sample_rate
|
557 |
|
558 |
expanded_start = max(0, start_time - expansion_seconds)
|
|
|
293 |
def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
|
294 |
"""
|
295 |
Find the best matching substring in detected phoneme using sliding window.
|
296 |
+
For zero scores, intelligently selects which phoneme substring to return.
|
297 |
Returns: (best_match_substring, best_score, start_index, end_index)
|
298 |
"""
|
299 |
detected_norm = normalize_phoneme_string(detected_phoneme)
|
|
|
337 |
log(f"  🎯 Perfect match found, stopping search")
|
338 |
break
|
339 |
|
340 |
+
# Handle zero score case - aim for middle substring when possible
|
341 |
+
if best_score == 0:
|
342 |
+
log(f"  ⚠️ Zero score detected, selecting middle substring for audio alignment")
|
343 |
+
total_detected_len = len(detected_norm)
|
344 |
+
|
345 |
+
if total_detected_len == expected_len:
|
346 |
+
# Same length - use the whole string
|
347 |
+
best_start = 0
|
348 |
+
best_end = expected_len
|
349 |
+
best_match = detected_norm
|
350 |
+
log(f" π Same length: using full string")
|
351 |
+
else:
|
352 |
+
# Longer detected - aim for middle
|
353 |
+
middle_start = max(0, (total_detected_len - expected_len) // 2)
|
354 |
+
best_start = middle_start
|
355 |
+
best_end = middle_start + expected_len
|
356 |
+
best_match = detected_norm[best_start:best_end]
|
357 |
+
log(f" π Aiming for middle: position {best_start}-{best_end}")
|
358 |
+
|
359 |
+
log(f" π Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
|
360 |
return best_match, best_score, best_start, best_end
|
361 |
|
362 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
|
|
571 |
else:
|
572 |
log(f"π No gap (continuous)")
|
573 |
|
574 |
+
# Calculate expanded timing (±0.125s with boundary protection)
|
575 |
+
expansion_seconds = 0.125
|
576 |
audio_duration = waveform.shape[-1] / sample_rate
|
577 |
|
578 |
expanded_start = max(0, start_time - expansion_seconds)
|