Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -569,76 +569,123 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
|
|
569 |
def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
|
570 |
"""
|
571 |
Create dynamic mapping from phoneme positions to original word letters.
|
572 |
-
|
573 |
Returns: {phoneme_position: word_letters}
|
574 |
"""
|
575 |
word_normalized = word.lower()
|
576 |
phoneme_normalized = normalize_phoneme_string(expected_phoneme)
|
577 |
|
578 |
log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
|
|
|
579 |
|
580 |
if not phoneme_normalized:
|
581 |
return {}
|
582 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
# Simple case: if lengths are equal, try direct mapping
|
584 |
if len(word_normalized) == len(phoneme_normalized):
|
585 |
mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
|
586 |
log(f" Direct mapping (equal lengths): {mapping}")
|
587 |
return mapping
|
588 |
|
589 |
-
#
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
604 |
|
605 |
-
if i
|
606 |
# Delete word character (silent letter)
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
if j-1 not in mapping:
|
623 |
-
mapping[j-1] = word_normalized[i-1]
|
624 |
-
else:
|
625 |
-
mapping[j-1] = word_normalized[i-1] + mapping[j-1]
|
626 |
-
i, j = i-1, j-1
|
627 |
-
continue
|
628 |
-
|
629 |
-
if i > 0 and dp[i][j] == dp[i-1][j] + 1:
|
630 |
-
# Delete word character (silent letter) - add to previous phoneme if exists
|
631 |
-
if j < phoneme_len and j in mapping:
|
632 |
-
mapping[j] = word_normalized[i-1] + mapping[j]
|
633 |
-
elif j > 0 and j-1 in mapping:
|
634 |
-
mapping[j-1] = mapping[j-1] + word_normalized[i-1]
|
635 |
-
i -= 1
|
636 |
-
elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
|
637 |
-
# Insert phoneme (skip phoneme, should not happen in our case)
|
638 |
-
j -= 1
|
639 |
-
|
640 |
-
log(f" Dynamic mapping: {mapping}")
|
641 |
-
return mapping
|
642 |
|
643 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
644 |
detected_normalized: str,
|
@@ -652,10 +699,18 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
652 |
result = []
|
653 |
|
654 |
log(f"🔍 Character-level feedback for '{word}':")
|
|
|
655 |
log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
|
656 |
log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
|
657 |
log(f" Mapping: {phoneme_mapping}")
|
658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
# Handle length mismatches by using the expected length and padding detected if needed
|
660 |
max_length = len(expected_normalized)
|
661 |
detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
|
@@ -703,10 +758,12 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
703 |
# MATCH: Use original word letters from mapping in natural case
|
704 |
if i in phoneme_mapping:
|
705 |
word_letters = phoneme_mapping[i] # Keep natural case
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
|
|
|
|
710 |
else:
|
711 |
test_result.append(expected_char)
|
712 |
tooltip_result.append(expected_char)
|
@@ -716,17 +773,19 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
716 |
# MISMATCH: Use original word letters with bold+underline formatting
|
717 |
if i in phoneme_mapping:
|
718 |
word_letters = phoneme_mapping[i] # Keep natural case
|
719 |
-
#
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
|
|
|
|
730 |
else:
|
731 |
# Fallback for cases without mapping
|
732 |
formatted_char = f'<strong><u>{expected_char}</u></strong>'
|
@@ -1144,6 +1203,11 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
|
|
1144 |
log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
|
1145 |
log(f"🎯 Expected phoneme: '{expected_phoneme}'")
|
1146 |
|
|
|
|
|
|
|
|
|
|
|
1147 |
# For very short words, expand the WhisperX timing itself before processing
|
1148 |
original_duration = end_time - start_time
|
1149 |
if original_duration < 0.1:
|
|
|
569 |
def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
    """
    Create dynamic mapping from phoneme positions to original word letters.
    Handles cases where phonemes are longer than the word (e.g., 'go' → 'ɡoʊ').
    Returns: {phoneme_position: word_letters}

    Empty-string values mark "extra" phoneme positions (e.g. the second half of
    a diphthong) so downstream rendering can skip them without duplicating letters.
    """
    word_normalized = word.lower()
    phoneme_normalized = normalize_phoneme_string(expected_phoneme)

    log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
    log(f"   Word length: {len(word_normalized)}, Phoneme length: {len(phoneme_normalized)}")

    if not phoneme_normalized:
        return {}

    # CRITICAL FIX: Handle diphthongs and multi-phoneme characters
    # Common diphthongs that represent single vowel sounds
    diphthongs = ['aɪ', 'aʊ', 'eɪ', 'oʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə']

    # Check if this is likely a diphthong case (phonemes longer than word)
    if len(phoneme_normalized) > len(word_normalized):
        log(f"   ⚠️ Phonemes longer than word - checking for diphthongs")

        # Try to identify diphthongs in the phoneme string (diagnostic only)
        for diphthong in diphthongs:
            if diphthong in phoneme_normalized:
                log(f"   Found diphthong: {diphthong}")

        # Special handling for common patterns
        if word_normalized == 'go' and phoneme_normalized in ['ɡoʊ', 'goʊ']:
            mapping = {0: 'g', 1: 'o', 2: ''}  # Map 'oʊ' diphthong to 'o'
            log(f"   Special case mapping for 'go': {mapping}")
            return mapping

        # General approach: distribute word letters proportionally
        # But avoid duplicating letters
        mapping = {}
        phonemes_per_letter = len(phoneme_normalized) / len(word_normalized)
        # BUG FIX: the previous guard tested the *int* word index for membership
        # in a list of already-mapped *letter strings*, which was always True,
        # so every phoneme position received a letter and letters duplicated
        # (e.g. 'go' → 'goo'). Track assigned word indices explicitly instead.
        assigned_word_indices = set()

        for phoneme_idx in range(len(phoneme_normalized)):
            # Calculate which word letter this phoneme should map to
            target_word_idx = min(int(phoneme_idx / phonemes_per_letter), len(word_normalized) - 1)

            # Only assign the letter once (first phoneme that maps to it)
            if target_word_idx not in assigned_word_indices:
                mapping[phoneme_idx] = word_normalized[target_word_idx]
                assigned_word_indices.add(target_word_idx)
            else:
                mapping[phoneme_idx] = ''  # Empty string for extra phonemes

        log(f"   Proportional mapping (no duplication): {mapping}")
        return mapping

    # Simple case: if lengths are equal, try direct mapping
    if len(word_normalized) == len(phoneme_normalized):
        mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
        log(f"   Direct mapping (equal lengths): {mapping}")
        return mapping

    # Case: word is longer than phonemes (silent letters)
    if len(word_normalized) > len(phoneme_normalized):
        log(f"   Word longer than phonemes - likely has silent letters")

        # Use DP (edit-distance) alignment for this case
        word_len = len(word_normalized)
        phoneme_len = len(phoneme_normalized)

        # Initialize DP matrix: dp[i][j] = min cost aligning word[:i] to phoneme[:j]
        dp = [[float('inf')] * (phoneme_len + 1) for _ in range(word_len + 1)]
        dp[0][0] = 0

        # Fill DP matrix
        for i in range(word_len + 1):
            for j in range(phoneme_len + 1):
                if i < word_len and j < phoneme_len:
                    # Match/substitute
                    cost = 0 if word_normalized[i] == phoneme_normalized[j] else 1
                    dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j] + cost)

                if i < word_len:
                    # Delete word character (silent letter)
                    dp[i+1][j] = min(dp[i+1][j], dp[i][j] + 1)

                if j < phoneme_len:
                    # Insert phoneme
                    dp[i][j+1] = min(dp[i][j+1], dp[i][j] + 1)

        # Backtrack to find alignment; deleted (silent) word letters are
        # attached to a neighbouring phoneme position so no letter is lost
        mapping = {}
        i, j = word_len, phoneme_len

        while i > 0 or j > 0:
            if i > 0 and j > 0:
                cost = 0 if word_normalized[i-1] == phoneme_normalized[j-1] else 1
                if dp[i][j] == dp[i-1][j-1] + cost:
                    # Match/substitute
                    if j-1 not in mapping:
                        mapping[j-1] = word_normalized[i-1]
                    else:
                        mapping[j-1] = word_normalized[i-1] + mapping[j-1]
                    i, j = i-1, j-1
                    continue

            if i > 0 and dp[i][j] == dp[i-1][j] + 1:
                # Delete word character (silent letter) - add to adjacent phoneme if exists
                if j < phoneme_len and j in mapping:
                    mapping[j] = word_normalized[i-1] + mapping[j]
                elif j > 0 and j-1 in mapping:
                    mapping[j-1] = mapping[j-1] + word_normalized[i-1]
                i -= 1
            elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
                # Insert phoneme
                j -= 1

        log(f"   DP alignment mapping: {mapping}")
        return mapping

    # Shouldn't reach here, but fallback to empty mapping
    log(f"   ⚠️ Unexpected case - returning empty mapping")
    return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
|
690 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
691 |
detected_normalized: str,
|
|
|
699 |
result = []
|
700 |
|
701 |
log(f"🔍 Character-level feedback for '{word}':")
|
702 |
+
log(f" Original word: '{word}' (length: {len(word)})")
|
703 |
log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
|
704 |
log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
|
705 |
log(f" Mapping: {phoneme_mapping}")
|
706 |
|
707 |
+
# CRITICAL: Count actual word letters in mapping to detect duplication issues
|
708 |
+
mapped_letters = ''.join(phoneme_mapping.values())
|
709 |
+
log(f" Mapped letters: '{mapped_letters}' (length: {len(mapped_letters)})")
|
710 |
+
if len(mapped_letters) > len(word):
|
711 |
+
log(f" ⚠️ WARNING: Mapped letters ({len(mapped_letters)}) longer than word ({len(word)})!")
|
712 |
+
log(f" This will cause incorrect display. Checking for duplicates...")
|
713 |
+
|
714 |
# Handle length mismatches by using the expected length and padding detected if needed
|
715 |
max_length = len(expected_normalized)
|
716 |
detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
|
|
|
758 |
# MATCH: Use original word letters from mapping in natural case
|
759 |
if i in phoneme_mapping:
|
760 |
word_letters = phoneme_mapping[i] # Keep natural case
|
761 |
+
# CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
|
762 |
+
if word_letters: # Only add non-empty letters
|
763 |
+
test_result.append(word_letters)
|
764 |
+
tooltip_result.append(word_letters) # Same for tooltip
|
765 |
+
real_chars += len(word_letters)
|
766 |
+
matches += 1
|
767 |
else:
|
768 |
test_result.append(expected_char)
|
769 |
tooltip_result.append(expected_char)
|
|
|
773 |
# MISMATCH: Use original word letters with bold+underline formatting
|
774 |
if i in phoneme_mapping:
|
775 |
word_letters = phoneme_mapping[i] # Keep natural case
|
776 |
+
# CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
|
777 |
+
if word_letters: # Only add non-empty letters
|
778 |
+
# Format with bold and underline for display
|
779 |
+
formatted_letters = f'<strong><u>{word_letters}</u></strong>'
|
780 |
+
test_result.append(formatted_letters)
|
781 |
+
|
782 |
+
# For tooltip, convert detected phoneme to English equivalent
|
783 |
+
detected_phoneme = detected_char
|
784 |
+
if detected_phoneme in PHONEME_TO_ENGLISH:
|
785 |
+
english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
|
786 |
+
tooltip_result.append(english_equiv)
|
787 |
+
else:
|
788 |
+
tooltip_result.append('?')
|
789 |
else:
|
790 |
# Fallback for cases without mapping
|
791 |
formatted_char = f'<strong><u>{expected_char}</u></strong>'
|
|
|
1203 |
log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
|
1204 |
log(f"🎯 Expected phoneme: '{expected_phoneme}'")
|
1205 |
|
1206 |
+
# DEBUGGING: Special attention to problematic words
|
1207 |
+
if word_clean.lower() in ['go', 'no', 'so', 'to', 'do'] or len(expected_phoneme) > len(word_clean):
|
1208 |
+
log(f"⚠️ SPECIAL CASE: Word '{word_clean}' has {len(word_clean)} letters but {len(expected_phoneme)} phonemes")
|
1209 |
+
log(f" This may be a diphthong case requiring special handling")
|
1210 |
+
|
1211 |
# For very short words, expand the WhisperX timing itself before processing
|
1212 |
original_duration = end_time - start_time
|
1213 |
if original_duration < 0.1:
|